This appendix contains all Python modules that are used in Chapter 5. You will also find them in the companion repository, in the folder of that chapter.
The excel.py
module allows you to read and write nested lists to and from Excel ranges. As it harmonizes the input and output across packages, it allows you to easily switch from one package to another with minimal impact on your code: you only need to provide the sheet object of the package that you are using. It supports OpenPyXL, pyxlsb, xlrd, xlwt and XlsxWriter. You may, of course, also use this module as an inspiration to write your own code to read and write cell values. For more context, see “The excel.py Module”.
"""This module offers a read and write function to get
2-dimensional lists in and out of Excel files.
"""
import
itertools
import
datetime
as
dt
# Optional dependencies
try
:
import
openpyxl
except
ImportError
:
openpyxl
=
None
try
:
import
pyxlsb
except
ImportError
:
pyxlsb
=
None
try
:
import
xlrd
from
xlrd.biffh
import
error_text_from_code
except
ImportError
:
xlrd
=
None
try
:
import
xlwt
from
xlwt.Utils
import
cell_to_rowcol2
except
ImportError
:
xlwt
=
None
try
:
import
xlsxwriter
from
xlsxwriter.utility
import
xl_cell_to_rowcol
except
ImportError
:
xlsxwriter
=
None
def
read
(
sheet
,
first_cell
=
'A1'
,
last_cell
=
None
):
"""Read a 2-dimensional list from an Excel range.
Parameters
----------
sheet : object
An xlrd, openpyxl or pyxlsb sheet object
first_cell : str or tuple, optional
Top-left corner of the Excel range you want to read.
Can be a string like 'A1' or a row/col tuple like (1, 1),
default is 'A1'.
last_cell : str or tuple, optional
Bottom-right corner of the Excel range you want to read.
Can be a string like 'A1' or a row/col tuple like (1, 1),
default is the bottom-right cell of the used range.
Returns
-------
list
A 2-dimensional list with the values of the Excel range
"""
# xlrd
if
xlrd
and
isinstance
(
sheet
,
xlrd
.
sheet
.
Sheet
):
# isinstance returns True if sheet is of type xlrd.sheet.Sheet
if
last_cell
is
None
:
# actual range with data, not used range
last_cell
=
(
sheet
.
nrows
,
sheet
.
ncols
)
# Transform 'A1' notation into tuples of 1-based indices
if
not
isinstance
(
first_cell
,
tuple
):
first_cell
=
xlwt
.
Utils
.
cell_to_rowcol2
(
first_cell
)
first_cell
=
(
first_cell
[
0
]
+
1
,
first_cell
[
1
]
+
1
)
if
not
isinstance
(
last_cell
,
tuple
):
last_cell
=
xlwt
.
Utils
.
cell_to_rowcol2
(
last_cell
)
last_cell
=
(
last_cell
[
0
]
+
1
,
last_cell
[
1
]
+
1
)
values
=
[]
for
r
in
range
(
first_cell
[
0
]
-
1
,
last_cell
[
0
]):
row
=
[]
for
c
in
range
(
first_cell
[
1
]
-
1
,
last_cell
[
1
]):
# Handle the different cell types
if
sheet
.
cell
(
r
,
c
)
.
ctype
==
xlrd
.
XL_CELL_DATE
:
value
=
xlrd
.
xldate
.
xldate_as_datetime
(
sheet
.
cell
(
r
,
c
)
.
value
,
sheet
.
book
.
datemode
)
elif
sheet
.
cell
(
r
,
c
)
.
ctype
in
[
xlrd
.
XL_CELL_EMPTY
,
xlrd
.
XL_CELL_BLANK
]:
value
=
None
elif
sheet
.
cell
(
r
,
c
)
.
ctype
==
xlrd
.
XL_CELL_ERROR
:
value
=
error_text_from_code
[
sheet
.
cell
(
r
,
c
)
.
value
]
elif
sheet
.
cell
(
r
,
c
)
.
ctype
==
xlrd
.
XL_CELL_BOOLEAN
:
value
=
bool
(
sheet
.
cell
(
r
,
c
)
.
value
)
else
:
value
=
sheet
.
cell
(
r
,
c
)
.
value
row
.
append
(
value
)
values
.
append
(
row
)
return
values
# OpenPyXL
elif
openpyxl
and
isinstance
(
sheet
,
(
openpyxl
.
worksheet
.
worksheet
.
Worksheet
,
openpyxl
.
worksheet
.
_read_only
.
ReadOnlyWorksheet
)):
if
last_cell
is
None
:
# used range
last_cell
=
(
sheet
.
max_row
,
sheet
.
max_column
)
if
not
isinstance
(
first_cell
,
tuple
):
first_cell
=
openpyxl
.
utils
.
cell
.
coordinate_to_tuple
(
first_cell
)
if
not
isinstance
(
last_cell
,
tuple
):
last_cell
=
openpyxl
.
utils
.
cell
.
coordinate_to_tuple
(
last_cell
)
data
=
[]
for
row
in
sheet
.
iter_rows
(
min_row
=
first_cell
[
0
],
min_col
=
first_cell
[
1
],
max_row
=
last_cell
[
0
],
max_col
=
last_cell
[
1
],
values_only
=
True
):
data
.
append
(
list
(
row
))
return
data
# pyxlsb
elif
pyxlsb
and
isinstance
(
sheet
,
pyxlsb
.
worksheet
.
Worksheet
):
errors
=
{
'0x0'
:
'#NULL!'
,
'0x7'
:
'#DIV/0!'
,
'0xf'
:
'#VALUE!'
,
'0x17'
:
'#REF!'
,
'0x1d'
:
'#NAME?'
,
'0x24'
:
'#NUM!'
,
'0x2a'
:
'#N/A'
}
if
not
isinstance
(
first_cell
,
tuple
):
first_cell
=
openpyxl
.
utils
.
coordinate_to_tuple
(
first_cell
)
if
last_cell
and
not
isinstance
(
last_cell
,
tuple
):
last_cell
=
openpyxl
.
utils
.
coordinate_to_tuple
(
last_cell
)
data
=
[]
# sheet.rows() is a generator that requires islice to slice it
for
row
in
itertools
.
islice
(
sheet
.
rows
(),
first_cell
[
0
]
-
1
,
last_cell
[
0
]
if
last_cell
else
None
):
data
.
append
([
errors
.
get
(
cell
.
v
,
cell
.
v
)
for
cell
in
row
]
[
first_cell
[
1
]
-
1
:
last_cell
[
1
]
if
last_cell
else
None
])
return
data
else
:
raise
TypeError
(
f
"Couldn't handle sheet of type {type(sheet)}"
)
def
write
(
sheet
,
values
,
first_cell
=
'A1'
,
date_format
=
None
):
"""Write a 2-dimensional list to an Excel range.
Parameters
----------
sheet : object
An openpyxl, xlsxwriter or xlwt sheet object. openpyxl's
write_only=True mode is not supported.
values : list
A 2-dimensional list of values
first_cell : str or tuple, optional
Top-left corner of the Excel range where you want to write out
the DataFrame. Can be a string like 'A1' or a row/col tuple
like (1, 1), default is 'A1'.
date_format : str, optional
Only accepted if sheet is an openppyxl or xlwt sheet. By default,
formats dates in the following format: 'mm/dd/yy'. For xlsxwriter,
set the format when you instantiate a Workbook by providing:
options={'default_date_format': 'mm/dd/yy'}
"""
# OpenPyXL
if
openpyxl
and
isinstance
(
sheet
,
(
openpyxl
.
worksheet
.
worksheet
.
Worksheet
)):
if
date_format
is
None
:
date_format
=
'mm/dd/yy'
if
not
isinstance
(
first_cell
,
tuple
):
first_cell
=
openpyxl
.
utils
.
coordinate_to_tuple
(
first_cell
)
for
i
,
row
in
enumerate
(
values
):
for
j
,
value
in
enumerate
(
row
):
cell
=
sheet
.
cell
(
row
=
first_cell
[
0
]
+
i
,
column
=
first_cell
[
1
]
+
j
)
cell
.
value
=
value
if
date_format
and
isinstance
(
value
,
(
dt
.
datetime
,
dt
.
date
)):
cell
.
number_format
=
date_format
# XlsxWriter
elif
xlsxwriter
and
isinstance
(
sheet
,
xlsxwriter
.
worksheet
.
Worksheet
):
if
date_format
is
not
None
:
raise
ValueError
(
'date_format must be set as Workbook option'
)
if
isinstance
(
first_cell
,
tuple
):
first_cell
=
first_cell
[
0
]
-
1
,
first_cell
[
1
]
-
1
else
:
first_cell
=
xl_cell_to_rowcol
(
first_cell
)
for
r
,
row_data
in
enumerate
(
values
):
sheet
.
write_row
(
first_cell
[
0
]
+
r
,
first_cell
[
1
],
row_data
)
# xlwt
elif
xlwt
and
isinstance
(
sheet
,
xlwt
.
Worksheet
):
if
date_format
is
None
:
date_format
=
'mm/dd/yy'
date_format
=
xlwt
.
easyxf
(
num_format_str
=
date_format
)
if
isinstance
(
first_cell
,
tuple
):
first_cell
=
(
first_cell
[
0
]
-
1
,
first_cell
[
1
]
-
1
)
else
:
first_cell
=
xlwt
.
Utils
.
cell_to_rowcol2
(
first_cell
)
for
i
,
row
in
enumerate
(
values
):
for
j
,
cell
in
enumerate
(
row
):
if
isinstance
(
cell
,
(
dt
.
datetime
,
dt
.
date
)):
sheet
.
write
(
i
+
first_cell
[
0
],
j
+
first_cell
[
1
],
cell
,
date_format
)
else
:
sheet
.
write
(
i
+
first_cell
[
0
],
j
+
first_cell
[
1
],
cell
)
else
:
raise
TypeError
(
f
"Couldn't handle sheet of type {type(sheet)}"
)
This section shows three modules that allow you to read the sheets of big Excel files in parallel. The modules make use of the multiprocessing
package that is part of Python’s standard library and parallelize the reading process for the following packages: pandas, OpenPyXL and xlrd. The first two modules only work with the xlsx
format, while the last module only works with the xls
format. To see the first module in action, see “Reading sheets in parallel”.
import
multiprocessing
from
itertools
import
repeat
import
pandas
as
pd
import
openpyxl
def
_read_sheet
(
filename
,
sheet_name
):
# The leading underscore in the function name is used by convention
# to mark it as "private", i.e. it shoulddn't be used directly outside
# of this module.
df
=
pd
.
read_excel
(
filename
,
sheet_name
=
sheet_name
,
engine
=
'openpyxl'
)
return
sheet_name
,
df
def
read_excel
(
filename
,
sheet_name
=
None
):
if
sheet_name
is
None
:
book
=
openpyxl
.
load_workbook
(
filename
,
read_only
=
True
,
data_only
=
True
)
sheet_name
=
book
.
sheetnames
book
.
close
()
with
multiprocessing
.
Pool
()
as
pool
:
# By default, Pool spawns as many processes as there are CPU cores.
# startmap maps a tuple of arguments to a function. The zip expression
# produces a list with tuples of the following form:
# [('filename.xlsx', 'Sheet1'), ('filename.xlsx', 'Sheet2)]
data
=
pool
.
starmap
(
_read_sheet
,
zip
(
repeat
(
filename
),
sheet_name
))
return
{
i
[
0
]:
i
[
1
]
for
i
in
data
}
import
multiprocessing
from
itertools
import
repeat
import
openpyxl
import
excel
def
_read_sheet
(
filename
,
sheetname
):
book
=
openpyxl
.
load_workbook
(
filename
,
read_only
=
True
,
data_only
=
True
)
sheet
=
book
[
sheetname
]
data
=
excel
.
read
(
sheet
)
book
.
close
()
return
sheet
.
title
,
data
def
load_workbook
(
filename
,
sheetnames
=
None
):
if
sheetnames
is
None
:
book
=
openpyxl
.
load_workbook
(
filename
,
read_only
=
True
,
data_only
=
True
)
sheetnames
=
book
.
sheetnames
book
.
close
()
with
multiprocessing
.
Pool
()
as
pool
:
data
=
pool
.
starmap
(
_read_sheet
,
zip
(
repeat
(
filename
),
sheetnames
))
return
{
i
[
0
]:
i
[
1
]
for
i
in
data
}
import
multiprocessing
from
itertools
import
repeat
import
xlrd
import
excel
def
_read_sheet
(
filename
,
sheetname
):
with
xlrd
.
open_workbook
(
filename
,
on_demand
=
True
)
as
book
:
sheet
=
book
.
sheet_by_name
(
sheetname
)
data
=
excel
.
read
(
sheet
)
return
sheet
.
name
,
data
def
open_workbook
(
filename
,
sheetnames
=
None
):
if
sheetnames
is
None
:
with
xlrd
.
open_workbook
(
filename
,
on_demand
=
True
)
as
book
:
sheetnames
=
book
.
sheet_names
()
with
multiprocessing
.
Pool
()
as
pool
:
data
=
pool
.
starmap
(
_read_sheet
,
zip
(
repeat
(
filename
),
sheetnames
))
return
{
i
[
0
]:
i
[
1
]
for
i
in
data
}
This section contains two versions of a script that produces the Excel report as shown in Figure 5-8. The first version is using pandas with OpenPyXL while the second version is using pandas with XlsxWriter.
from
pathlib
import
Path
import
pandas
as
pd
from
openpyxl.styles
import
Font
,
Alignment
from
openpyxl.formatting.rule
import
CellIsRule
from
openpyxl.chart
import
BarChart
,
Reference
from
openpyxl.chart.shapes
import
GraphicalProperties
from
openpyxl.drawing.line
import
LineProperties
# Directory of this file
this_dir
=
Path
(
__file__
)
.
parent
# Read in all files
parts
=
[]
for
path
in
(
this_dir
/
'sales_data'
)
.
rglob
(
'*.xls*'
):
part
=
pd
.
read_excel
(
path
)
parts
.
append
(
part
)
# Combine the DataFrames from each file into a single DataFrame
df
=
pd
.
concat
(
parts
)
# Pivot each store into a column and sum up all transaction per date
pivot
=
pd
.
pivot_table
(
df
,
index
=
'transaction_date'
,
columns
=
'store'
,
values
=
'amount'
,
aggfunc
=
'sum'
)
# Resample to end of month and assign an index name
summary
=
pivot
.
resample
(
'M'
)
.
sum
()
summary
.
index
.
name
=
'Month'
# Sort by total revenues per store
summary
=
summary
.
loc
[:,
summary
.
sum
()
.
sort_values
()
.
index
]
# Add row and column totals: Using 'append' together with 'rename'
# is a convenient way to add a row to the bottom of a DataFrame
summary
.
loc
[:,
'Total'
]
=
summary
.
sum
(
axis
=
1
)
summary
=
summary
.
append
(
summary
.
sum
(
axis
=
0
)
.
rename
(
'Total'
))
# DataFrame position and number of rows/columns
# openpxyl uses 1-based indices
startrow
=
3
startcol
=
2
nrows
,
ncols
=
summary
.
shape
with
pd
.
ExcelWriter
(
this_dir
/
'sales_report_openpyxl.xlsx'
,
engine
=
'openpyxl'
,
write_only
=
True
)
as
writer
:
# pandas uses 0-based indices
summary
.
to_excel
(
writer
,
sheet_name
=
'Sheet1'
,
startrow
=
startrow
-
1
,
startcol
=
startcol
-
1
)
# Get openpyxl book and sheet object
book
=
writer
.
book
sheet
=
writer
.
sheets
[
'Sheet1'
]
# Set title
sheet
.
cell
(
row
=
1
,
column
=
startcol
,
value
=
'Sales Report'
)
sheet
.
cell
(
row
=
1
,
column
=
startcol
)
.
font
=
Font
(
size
=
24
,
bold
=
True
)
# Sheet formatting
sheet
.
sheet_view
.
showGridLines
=
False
# Format the DataFrame with
# - number format
# - column width
# - conditional formatting
for
row
in
range
(
startrow
+
1
,
startrow
+
nrows
+
1
):
for
col
in
range
(
startcol
+
1
,
startcol
+
ncols
+
1
):
cell
=
sheet
.
cell
(
row
=
row
,
column
=
col
)
cell
.
number_format
=
'#,##0'
cell
.
alignment
=
Alignment
(
horizontal
=
'center'
)
for
cell
in
sheet
[
'B'
]:
cell
.
number_format
=
'mmm yy'
for
col
in
range
(
startcol
,
startcol
+
ncols
+
1
):
cell
=
sheet
.
cell
(
row
=
startrow
,
column
=
col
)
sheet
.
column_dimensions
[
cell
.
column_letter
]
.
width
=
14
first_cell
=
sheet
.
cell
(
row
=
startrow
+
1
,
column
=
startcol
+
1
)
last_cell
=
sheet
.
cell
(
row
=
startrow
+
nrows
,
column
=
startcol
+
ncols
)
range_address
=
f
'{first_cell.coordinate}:{last_cell.coordinate}'
sheet
.
conditional_formatting
.
add
(
range_address
,
CellIsRule
(
operator
=
'lessThan'
,
formula
=
[
'20000'
],
stopIfTrue
=
True
,
font
=
Font
(
color
=
'E93423'
)))
# Chart
chart
=
BarChart
()
chart
.
type
=
"col"
chart
.
title
=
"Sales per Month and Store"
chart
.
height
=
11.5
chart
.
width
=
20.5
# Add each column as a series, ignoring total row and col
data
=
Reference
(
sheet
,
min_col
=
startcol
+
1
,
min_row
=
startrow
,
max_row
=
startrow
+
nrows
-
1
,
max_col
=
startcol
+
ncols
-
1
)
categories
=
Reference
(
sheet
,
min_col
=
startcol
,
min_row
=
startrow
+
1
,
max_row
=
startrow
+
nrows
-
1
)
chart
.
add_data
(
data
,
titles_from_data
=
True
)
chart
.
set_categories
(
categories
)
cell
=
sheet
.
cell
(
row
=
startrow
+
nrows
+
2
,
column
=
startcol
)
sheet
.
add_chart
(
chart
=
chart
,
anchor
=
cell
.
coordinate
)
# Chart formatting
chart
.
y_axis
.
title
=
'Sales'
chart
.
x_axis
.
title
=
summary
.
index
.
name
# Hide y-axis line: spPR stands for ShapeProperties
chart
.
y_axis
.
spPr
=
GraphicalProperties
(
ln
=
LineProperties
(
noFill
=
True
))
from
pathlib
import
Path
import
pandas
as
pd
# Directory of this file
this_dir
=
Path
(
__file__
)
.
parent
# Read in all files
parts
=
[]
for
path
in
(
this_dir
/
'sales_data'
)
.
rglob
(
'*.xls*'
):
part
=
pd
.
read_excel
(
path
)
parts
.
append
(
part
)
# Combine the DataFrames from each file into a single DataFrame
df
=
pd
.
concat
(
parts
)
# Pivot each store into a column and sum up all transaction per date
pivot
=
pd
.
pivot_table
(
df
,
index
=
'transaction_date'
,
columns
=
'store'
,
values
=
'amount'
,
aggfunc
=
'sum'
)
# Resample to end of month and assign an index name
summary
=
pivot
.
resample
(
'M'
)
.
sum
()
summary
.
index
.
name
=
'Month'
# Sort by total revenues per store
summary
=
summary
.
loc
[:,
summary
.
sum
()
.
sort_values
()
.
index
]
# Add row and column totals: Using 'append' together with 'rename'
# is a convenient way to add a row to the bottom of a DataFrame
summary
.
loc
[:,
'Total'
]
=
summary
.
sum
(
axis
=
1
)
summary
=
summary
.
append
(
summary
.
sum
(
axis
=
0
)
.
rename
(
'Total'
))
# DataFrame position and number of rows/columns
# xlsxwriter uses 0-based indices
startrow
=
2
startcol
=
1
nrows
,
ncols
=
summary
.
shape
with
pd
.
ExcelWriter
(
this_dir
/
'sales_report_xlsxwriter.xlsx'
,
engine
=
'xlsxwriter'
,
datetime_format
=
'mmm yy'
)
as
writer
:
summary
.
to_excel
(
writer
,
sheet_name
=
'Sheet1'
,
startrow
=
startrow
,
startcol
=
startcol
)
# Get xlsxwriter book and sheet object
book
=
writer
.
book
sheet
=
writer
.
sheets
[
'Sheet1'
]
# Set title
title_format
=
book
.
add_format
({
'bold'
:
True
,
'size'
:
24
})
sheet
.
write
(
0
,
startcol
,
'Sales Report'
,
title_format
)
# Sheet formatting
# 2: hides on screen and when printing
sheet
.
hide_gridlines
(
2
)
# Format the DataFrame with
# - number format
# - column width
# - conditional formatting
number_format
=
book
.
add_format
({
'num_format'
:
'#,##0'
,
'align'
:
'center'
})
below_target_format
=
book
.
add_format
({
'font_color'
:
'#E93423'
})
sheet
.
set_column
(
first_col
=
startcol
,
last_col
=
startcol
+
ncols
,
width
=
14
,
cell_format
=
number_format
)
sheet
.
conditional_format
(
first_row
=
startrow
+
1
,
first_col
=
startcol
+
1
,
last_row
=
startrow
+
nrows
,
last_col
=
startcol
+
ncols
,
options
=
{
'type'
:
'cell'
,
'criteria'
:
'<='
,
'value'
:
20000
,
'format'
:
below_target_format
})
# Chart
chart
=
book
.
add_chart
({
'type'
:
'column'
})
chart
.
set_title
({
'name'
:
'Sales per Month and Store'
})
chart
.
set_size
({
'width'
:
830
,
'height'
:
450
})
# Add each column as a series, ignoring total row and col
for
col
in
range
(
1
,
ncols
):
chart
.
add_series
({
# [sheetname, first_row, first_col, last_row, last_col]
'name'
:
[
'Sheet1'
,
startrow
,
startcol
+
col
],
'categories'
:
[
'Sheet1'
,
startrow
+
1
,
startcol
,
startrow
+
nrows
-
1
,
startcol
],
'values'
:
[
'Sheet1'
,
startrow
+
1
,
startcol
+
col
,
startrow
+
nrows
-
1
,
startcol
+
col
],
})
# Chart formatting
chart
.
set_x_axis
({
'name'
:
summary
.
index
.
name
,
'major_tick_mark'
:
'none'
})
chart
.
set_y_axis
({
'name'
:
'Sales'
,
'line'
:
{
'none'
:
True
},
'major_gridlines'
:
{
'visible'
:
True
},
'major_tick_mark'
:
'none'
})
# Add the chart to the sheet
sheet
.
insert_chart
(
startrow
+
nrows
+
2
,
startcol
,
chart
)
3.137.174.216