Appendix B. Code for Reading and Writing Excel Files

This appendix contains all Python modules that are used in Chapter 5. You will also find them in the companion repository, in the folder of that chapter.

The excel.py Module

The excel.py module allows you to read and write nested lists to and from Excel ranges. As it harmonizes the input and output across packages, it allows you to easily switch from one package to another with minimal impact on your code: you only need to provide the sheet object of the package that you are using. It supports OpenPyXL, pyxlsb, xlrd, xlwt and XlsxWriter. You may, of course, also use this module as an inspiration to write your own code to read and write cell values. For more context, see “The excel.py Module”.

Example B-1. excel.py
"""This module offers a read and write function to get
2-dimensional lists in and out of Excel files.
"""
import itertools
import datetime as dt

# Optional dependencies
try:
    import openpyxl
except ImportError:
    openpyxl = None
try:
    import pyxlsb
except ImportError:
    pyxlsb = None
try:
    import xlrd
    from xlrd.biffh import error_text_from_code
except ImportError:
    xlrd = None
try:
    import xlwt
    from xlwt.Utils import cell_to_rowcol2
except ImportError:
    xlwt = None
try:
    import xlsxwriter
    from xlsxwriter.utility import xl_cell_to_rowcol
except ImportError:
    xlsxwriter = None


def read(sheet, first_cell='A1', last_cell=None):
    """Read a 2-dimensional list from an Excel range.

    Parameters
    ----------
    sheet : object
        An xlrd, openpyxl or pyxlsb sheet object
    first_cell : str or tuple, optional
        Top-left corner of the Excel range you want to read.
        Can be a string like 'A1' or a row/col tuple like (1, 1),
        default is 'A1'.
    last_cell : str or tuple, optional
        Bottom-right corner of the Excel range you want to read.
        Can be a string like 'A1' or a row/col tuple like (1, 1),
        default is the bottom-right cell of the used range.

    Returns
    -------
    list
        A 2-dimensional list with the values of the Excel range
    """
    # xlrd
    if xlrd and isinstance(sheet, xlrd.sheet.Sheet):
        # isinstance returns True if sheet is of type xlrd.sheet.Sheet
        if last_cell is None:
            # actual range with data, not used range
            last_cell = (sheet.nrows, sheet.ncols)
        # Transform 'A1' notation into tuples of 1-based indices
        if not isinstance(first_cell, tuple):
            first_cell = xlwt.Utils.cell_to_rowcol2(first_cell)
            first_cell = (first_cell[0] + 1, first_cell[1] + 1)
        if not isinstance(last_cell, tuple):
            last_cell = xlwt.Utils.cell_to_rowcol2(last_cell)
            last_cell = (last_cell[0] + 1, last_cell[1] + 1)
        values = []
        for r in range(first_cell[0] - 1, last_cell[0]):
            row = []
            for c in range(first_cell[1] - 1, last_cell[1]):
                # Handle the different cell types
                if sheet.cell(r, c).ctype == xlrd.XL_CELL_DATE:
                    value = xlrd.xldate.xldate_as_datetime(
                        sheet.cell(r, c).value, sheet.book.datemode)
                elif sheet.cell(r, c).ctype in [xlrd.XL_CELL_EMPTY,
                                                xlrd.XL_CELL_BLANK]:
                    value = None
                elif sheet.cell(r, c).ctype == xlrd.XL_CELL_ERROR:
                    value = error_text_from_code[sheet.cell(r, c).value]
                elif sheet.cell(r, c).ctype == xlrd.XL_CELL_BOOLEAN:
                    value = bool(sheet.cell(r, c).value)
                else:
                    value = sheet.cell(r, c).value
                row.append(value)
            values.append(row)
        return values

    # OpenPyXL
    elif openpyxl and isinstance(
            sheet,
            (openpyxl.worksheet.worksheet.Worksheet,
             openpyxl.worksheet._read_only.ReadOnlyWorksheet)):
        if last_cell is None:
            # used range
            last_cell = (sheet.max_row, sheet.max_column)
        if not isinstance(first_cell, tuple):
            first_cell = openpyxl.utils.cell.coordinate_to_tuple(first_cell)
        if not isinstance(last_cell, tuple):
            last_cell = openpyxl.utils.cell.coordinate_to_tuple(last_cell)
        data = []
        for row in sheet.iter_rows(min_row=first_cell[0], min_col=first_cell[1],
                                   max_row=last_cell[0], max_col=last_cell[1],
                                   values_only=True):
            data.append(list(row))
        return data

    # pyxlsb
    elif pyxlsb and isinstance(sheet, pyxlsb.worksheet.Worksheet):
        errors = {'0x0': '#NULL!', '0x7': '#DIV/0!', '0xf': '#VALUE!',
                  '0x17': '#REF!', '0x1d': '#NAME?', '0x24': '#NUM!',
                  '0x2a': '#N/A'}
        if not isinstance(first_cell, tuple):
            first_cell = openpyxl.utils.coordinate_to_tuple(first_cell)
        if last_cell and not isinstance(last_cell, tuple):
            last_cell = openpyxl.utils.coordinate_to_tuple(last_cell)
        data = []
        # sheet.rows() is a generator that requires islice to slice it
        for row in itertools.islice(sheet.rows(),
                                    first_cell[0] - 1,
                                    last_cell[0] if last_cell else None):
            data.append([errors.get(cell.v, cell.v) for cell in row]
                        [first_cell[1] - 1 : last_cell[1] if last_cell else None])
        return data
    else:
        raise TypeError(f"Couldn't handle sheet of type {type(sheet)}")


def write(sheet, values, first_cell='A1', date_format=None):
    """Write a 2-dimensional list to an Excel range.

    Parameters
    ----------
    sheet : object
        An openpyxl, xlsxwriter or xlwt sheet object. openpyxl's
        write_only=True mode is not supported.
    values : list
        A 2-dimensional list of values
    first_cell : str or tuple, optional
        Top-left corner of the Excel range where you want to write out
        the DataFrame. Can be a string like 'A1' or a row/col tuple
        like (1, 1), default is 'A1'.
    date_format : str, optional
        Only accepted if sheet is an openppyxl or xlwt sheet. By default,
        formats dates in the following format: 'mm/dd/yy'. For xlsxwriter,
        set the format when you instantiate a Workbook by providing:
        options={'default_date_format': 'mm/dd/yy'}
    """
    # OpenPyXL
    if openpyxl and isinstance(
            sheet, (openpyxl.worksheet.worksheet.Worksheet)):
        if date_format is None:
                date_format = 'mm/dd/yy'
        if not isinstance(first_cell, tuple):
            first_cell = openpyxl.utils.coordinate_to_tuple(first_cell)
        for i, row in enumerate(values):
            for j, value in enumerate(row):
                cell = sheet.cell(row=first_cell[0] + i,
                                  column=first_cell[1] + j)
                cell.value = value
                if date_format and isinstance(value, (dt.datetime, dt.date)):
                    cell.number_format = date_format

    # XlsxWriter
    elif xlsxwriter and isinstance(sheet, xlsxwriter.worksheet.Worksheet):
        if date_format is not None:
            raise ValueError('date_format must be set as Workbook option')
        if isinstance(first_cell, tuple):
            first_cell = first_cell[0] - 1, first_cell[1] - 1
        else:
            first_cell = xl_cell_to_rowcol(first_cell)
        for r, row_data in enumerate(values):
            sheet.write_row(first_cell[0] + r, first_cell[1], row_data)

    # xlwt
    elif xlwt and isinstance(sheet, xlwt.Worksheet):
        if date_format is None:
            date_format = 'mm/dd/yy'
        date_format = xlwt.easyxf(num_format_str=date_format)
        if isinstance(first_cell, tuple):
            first_cell = (first_cell[0] - 1, first_cell[1] - 1)
        else:
            first_cell = xlwt.Utils.cell_to_rowcol2(first_cell)
        for i, row in enumerate(values):
            for j, cell in enumerate(row):
                if isinstance(cell, (dt.datetime, dt.date)):
                    sheet.write(i + first_cell[0], j + first_cell[1],
                                cell, date_format)
                else:
                    sheet.write(i + first_cell[0], j + first_cell[1],
                                cell)
    else:
        raise TypeError(f"Couldn't handle sheet of type {type(sheet)}")

Reading Excel Sheets in Parallel

This section shows three modules that allow you to read the sheets of big Excel files in parallel. The modules make use of the multiprocessing package that is part of Python’s standard library and parallelize the reading process for the following packages: pandas, OpenPyXL and xlrd. The first two modules only work with the xlsx format, while the last module only works with the xls format. To see the first module in action, see “Reading sheets in parallel”.

Reading Sheets in Parallel with pandas

Example B-2. pandas_parallel.py
import multiprocessing
from itertools import repeat

import pandas as pd
import openpyxl


def _read_sheet(filename, sheet_name):
    # The leading underscore in the function name is used by convention
    # to mark it as "private", i.e. it shoulddn't be used directly outside
    # of this module.
    df = pd.read_excel(filename, sheet_name=sheet_name, engine='openpyxl')
    return sheet_name, df

def read_excel(filename, sheet_name=None):
    if sheet_name is None:
        book = openpyxl.load_workbook(filename,
                                      read_only=True, data_only=True)
        sheet_name = book.sheetnames
        book.close()
    with multiprocessing.Pool() as pool:
        # By default, Pool spawns as many processes as there are CPU cores.
        # startmap maps a tuple of arguments to a function. The zip expression
        # produces a list with tuples of the following form:
        # [('filename.xlsx', 'Sheet1'), ('filename.xlsx', 'Sheet2)]
        data = pool.starmap(_read_sheet, zip(repeat(filename), sheet_name))
    return {i[0]: i[1] for i in data}

Reading Sheets in Parallel with OpenPyXL

Example B-3. openpyxl_parallel.py
import multiprocessing
from itertools import repeat

import openpyxl
import excel


def _read_sheet(filename, sheetname):
    book = openpyxl.load_workbook(filename,
                                  read_only=True, data_only=True)
    sheet = book[sheetname]
    data = excel.read(sheet)
    book.close()
    return sheet.title, data

def load_workbook(filename, sheetnames=None):
    if sheetnames is None:
        book = openpyxl.load_workbook(filename,
                                      read_only=True, data_only=True)
        sheetnames = book.sheetnames
        book.close()
    with multiprocessing.Pool() as pool:
        data = pool.starmap(_read_sheet, zip(repeat(filename), sheetnames))
    return {i[0]: i[1] for i in data}

Reading Sheets in Parallel with xlrd

Example B-4. xlrd_parallel.py
import multiprocessing
from itertools import repeat

import xlrd
import excel


def _read_sheet(filename, sheetname):
    with xlrd.open_workbook(filename, on_demand=True) as book:
        sheet = book.sheet_by_name(sheetname)
        data = excel.read(sheet)
    return sheet.name, data

def open_workbook(filename, sheetnames=None):
    if sheetnames is None:
        with xlrd.open_workbook(filename, on_demand=True) as book:
            sheetnames = book.sheet_names()
    with multiprocessing.Pool() as pool:
        data = pool.starmap(_read_sheet, zip(repeat(filename), sheetnames))
    return {i[0]: i[1] for i in data}

Case Study (Revisited)

This section contains two versions of a script that produces the Excel report as shown in Figure 5-8. The first version is using pandas with OpenPyXL while the second version is using pandas with XlsxWriter.

Excel Reporting with pandas and OpenPyXL

Example B-5. sales_report_openpyxl.py
from pathlib import Path
import pandas as pd
from openpyxl.styles import Font, Alignment
from openpyxl.formatting.rule import CellIsRule
from openpyxl.chart import BarChart, Reference
from openpyxl.chart.shapes import GraphicalProperties
from openpyxl.drawing.line import LineProperties

# Directory of this file
this_dir = Path(__file__).parent

# Read in all files
parts = []
for path in (this_dir / 'sales_data').rglob('*.xls*'):
    part = pd.read_excel(path)
    parts.append(part)

# Combine the DataFrames from each file into a single DataFrame
df = pd.concat(parts)

# Pivot each store into a column and sum up all transaction per date
pivot = pd.pivot_table(df,
                       index='transaction_date', columns='store',
                       values='amount', aggfunc='sum')

# Resample to end of month and assign an index name
summary = pivot.resample('M').sum()
summary.index.name = 'Month'

# Sort by total revenues per store
summary = summary.loc[:, summary.sum().sort_values().index]

# Add row and column totals: Using 'append' together with 'rename'
# is a convenient way to add a row to the bottom of a DataFrame
summary.loc[:, 'Total'] = summary.sum(axis=1)
summary = summary.append(summary.sum(axis=0).rename('Total'))

# DataFrame position and number of rows/columns
# openpxyl uses 1-based indices
startrow = 3
startcol = 2
nrows, ncols = summary.shape

with pd.ExcelWriter(this_dir / 'sales_report_openpyxl.xlsx',
                    engine='openpyxl', write_only=True) as writer:
    # pandas uses 0-based indices
    summary.to_excel(writer, sheet_name='Sheet1',
                     startrow=startrow - 1, startcol=startcol - 1)

    # Get openpyxl book and sheet object
    book = writer.book
    sheet = writer.sheets['Sheet1']

    # Set title
    sheet.cell(row=1, column=startcol, value='Sales Report')
    sheet.cell(row=1, column=startcol).font = Font(size=24, bold=True)

    # Sheet formatting
    sheet.sheet_view.showGridLines = False

    # Format the DataFrame with
    # - number format
    # - column width
    # - conditional formatting
    for row in range(startrow + 1, startrow + nrows + 1):
        for col in range(startcol + 1, startcol + ncols + 1):
            cell = sheet.cell(row=row, column=col)
            cell.number_format = '#,##0'
            cell.alignment = Alignment(horizontal='center')

    for cell in sheet['B']:
        cell.number_format = 'mmm yy'

    for col in range(startcol, startcol + ncols + 1):
        cell = sheet.cell(row=startrow, column=col)
        sheet.column_dimensions[cell.column_letter].width = 14

    first_cell = sheet.cell(row=startrow + 1, column=startcol + 1)
    last_cell = sheet.cell(row=startrow + nrows, column=startcol + ncols)
    range_address = f'{first_cell.coordinate}:{last_cell.coordinate}'
    sheet.conditional_formatting.add(range_address,
                                     CellIsRule(operator='lessThan',
                                                formula=['20000'],
                                                stopIfTrue=True,
                                                font=Font(color='E93423')))

    # Chart
    chart = BarChart()
    chart.type = "col"
    chart.title = "Sales per Month and Store"
    chart.height = 11.5
    chart.width = 20.5

    # Add each column as a series, ignoring total row and col
    data = Reference(sheet, min_col=startcol + 1, min_row=startrow,
                     max_row=startrow + nrows - 1,
                     max_col=startcol + ncols - 1)
    categories = Reference(sheet, min_col=startcol, min_row=startrow + 1,
                           max_row=startrow + nrows - 1)
    chart.add_data(data, titles_from_data=True)
    chart.set_categories(categories)
    cell = sheet.cell(row=startrow + nrows + 2, column=startcol)
    sheet.add_chart(chart=chart, anchor=cell.coordinate)

    # Chart formatting
    chart.y_axis.title = 'Sales'
    chart.x_axis.title = summary.index.name
    # Hide y-axis line: spPR stands for ShapeProperties
    chart.y_axis.spPr = GraphicalProperties(ln=LineProperties(noFill=True))

Excel Reporting with pandas and XlsxWriter

Example B-6. sales_report_xlsxwriter.py
from pathlib import Path
import pandas as pd

# Directory of this file
this_dir = Path(__file__).parent

# Read in all files
parts = []
for path in (this_dir / 'sales_data').rglob('*.xls*'):
    part = pd.read_excel(path)
    parts.append(part)

# Combine the DataFrames from each file into a single DataFrame
df = pd.concat(parts)

# Pivot each store into a column and sum up all transaction per date
pivot = pd.pivot_table(df,
                       index='transaction_date', columns='store',
                       values='amount', aggfunc='sum')

# Resample to end of month and assign an index name
summary = pivot.resample('M').sum()
summary.index.name = 'Month'

# Sort by total revenues per store
summary = summary.loc[:, summary.sum().sort_values().index]

# Add row and column totals: Using 'append' together with 'rename'
# is a convenient way to add a row to the bottom of a DataFrame
summary.loc[:, 'Total'] = summary.sum(axis=1)
summary = summary.append(summary.sum(axis=0).rename('Total'))

# DataFrame position and number of rows/columns
# xlsxwriter uses 0-based indices
startrow = 2
startcol = 1
nrows, ncols = summary.shape

with pd.ExcelWriter(this_dir / 'sales_report_xlsxwriter.xlsx',
                    engine='xlsxwriter', datetime_format='mmm yy') as writer:
    summary.to_excel(writer, sheet_name='Sheet1',
                     startrow=startrow, startcol=startcol)

    # Get xlsxwriter book and sheet object
    book = writer.book
    sheet = writer.sheets['Sheet1']

    # Set title
    title_format = book.add_format({'bold': True, 'size': 24})
    sheet.write(0, startcol, 'Sales Report', title_format)

    # Sheet formatting
    # 2: hides on screen and when printing
    sheet.hide_gridlines(2)

    # Format the DataFrame with
    # - number format
    # - column width
    # - conditional formatting
    number_format = book.add_format({'num_format': '#,##0',
                                     'align': 'center'})
    below_target_format = book.add_format({'font_color': '#E93423'})
    sheet.set_column(first_col=startcol, last_col=startcol + ncols,
                     width=14, cell_format=number_format)
    sheet.conditional_format(first_row=startrow + 1,
                             first_col=startcol + 1,
                             last_row=startrow + nrows,
                             last_col=startcol + ncols,
                             options={'type': 'cell', 'criteria': '<=',
                                      'value': 20000,
                                      'format': below_target_format})

    # Chart
    chart = book.add_chart({'type': 'column'})
    chart.set_title({'name': 'Sales per Month and Store'})
    chart.set_size({'width': 830, 'height': 450})

    # Add each column as a series, ignoring total row and col
    for col in range(1, ncols):
        chart.add_series({
             # [sheetname, first_row, first_col, last_row, last_col]
            'name': ['Sheet1', startrow, startcol + col],
            'categories': ['Sheet1', startrow + 1, startcol,
                           startrow + nrows - 1, startcol],
            'values': ['Sheet1', startrow + 1, startcol + col,
                       startrow + nrows - 1, startcol + col],
        })
    # Chart formatting
    chart.set_x_axis({'name': summary.index.name,
                      'major_tick_mark': 'none'})
    chart.set_y_axis({'name': 'Sales',
                      'line': {'none': True},
                      'major_gridlines': {'visible': True},
                      'major_tick_mark': 'none'})

    # Add the chart to the sheet
    sheet.insert_chart(startrow + nrows + 2, startcol, chart)
..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset
3.137.174.216