I'm not sure you actually need Pandas to process the data. If not, this solution uses OpenPyXL to read the Excel files (workbooks), iterate their sheets, and pull values from cells.
To start with, I made this set-up script to mock your input data, which meant:
- create one Excel file with two sheets
- each sheet has a header row, then 60 rows by 80 columns of integers (I don't know your actual types)
- the file was then written to a Pro and a Ref master Excel file
- the two master Excel files were copied 35 times each
- the 35 copies for Pro and Ref were zipped up into two master ZIP files
- the master ZIP files were copied 19 times each
I think that's the setup your comment laid out.
import glob
import os
import shutil
from zipfile import ZipFile

from openpyxl import Workbook

# Create local temp dir, and then chdir to it
t_dir = "temp"
try:
    shutil.rmtree(t_dir)
except FileNotFoundError:
    pass
finally:
    os.mkdir(t_dir)
os.chdir(t_dir)

# Create single Excel file
wb = Workbook()
ws1 = wb.active
ws1.title = "Sheet1"
ws2 = wb.create_sheet("Sheet2")

header = [f"Col{x}" for x in range(1, 81)]
ws1.append(header)
ws2.append(header)

row = list(range(1, 81))              # 80 columns
for x in range(1, 61):                # by 60 rows
    ws1.append([x * y for y in row])  # per sheet
    ws2.append([x * y for y in row])

# Save single Excel file to both Pro and Ref master files
# (the leading dot keeps the masters out of the "pro*"/"ref*" globs below)
pro_xlsx = ".pro.xlsx"
ref_xlsx = ".ref.xlsx"
wb.save(pro_xlsx)
wb.save(ref_xlsx)

# Copy each master Excel file 35 times
for i in range(1, 36):
    shutil.copy(pro_xlsx, f"pro_{i:02}.xlsx")
    shutil.copy(ref_xlsx, f"ref_{i:02}.xlsx")

# Create a master ZIP with each set of 35 Excel files
pro_zip = ".pro.zip"
ref_zip = ".ref.zip"
with ZipFile(pro_zip, "w") as f_zip:
    for f_xlsx in glob.glob("pro*.xlsx"):
        f_zip.write(f_xlsx)
with ZipFile(ref_zip, "w") as f_zip:
    for f_xlsx in glob.glob("ref*.xlsx"):
        f_zip.write(f_xlsx)

# Copy each master ZIP file 19 times
for i in range(1, 20):
    shutil.copy(pro_zip, f"pro_{i:02}.zip")
    shutil.copy(ref_zip, f"ref_{i:02}.zip")
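As a quick sanity check (my addition, nothing the timing depends on), you can confirm the fixture came out right; the dot-prefixed master files are deliberately invisible to these globs:

import glob

# run from inside the temp dir the script chdir'd into
assert len(glob.glob("pro_*.xlsx")) == 35
assert len(glob.glob("ref_*.xlsx")) == 35
assert len(glob.glob("pro_*.zip")) == 19
assert len(glob.glob("ref_*.zip")) == 19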
That got me to a point where I could run your code and get an accurate timing:
import glob
import os
from zipfile import ZipFile

import pandas as pd

os.chdir("temp")

pro_zip_files = sorted(glob.glob("pro*.zip"))
ref_zip_files = sorted(glob.glob("ref*.zip"))

for pro_zip_file, ref_zip_file in zip(pro_zip_files, ref_zip_files):
    with (  # parenthesized multi-with needs Python 3.10+
        ZipFile(pro_zip_file) as zip_file_pro,
        ZipFile(ref_zip_file) as zip_file_ref,
    ):
        for member_pro, member_ref in zip(zip_file_pro.namelist(), zip_file_ref.namelist()):
            xlfile_pro = zip_file_pro.open(member_pro)
            xlfile_ref = zip_file_ref.open(member_ref)

            sheet1_pro = pd.read_excel(xlfile_pro, sheet_name="Sheet1")
            sheet2_pro = pd.read_excel(xlfile_pro, sheet_name="Sheet2")
            sheet1_ref = pd.read_excel(xlfile_ref, sheet_name="Sheet1")
            sheet2_ref = pd.read_excel(xlfile_ref, sheet_name="Sheet2")

            for sheet in [sheet1_pro, sheet2_pro, sheet1_ref, sheet2_ref]:
                assert sheet["Col80"].sum() == 146400

            xlfile_pro.close()
            xlfile_ref.close()

    print(f"done with ZIPs {pro_zip_file} and {ref_zip_file}")
and now with the OpenPyXL alternative:
...
from openpyxl import load_workbook
...
...
...
...
...
            pro_wb = load_workbook(xlfile_pro, read_only=True)
            ref_wb = load_workbook(xlfile_ref, read_only=True)

            sheet1_pro = pro_wb["Sheet1"]
            sheet2_pro = pro_wb["Sheet2"]
            sheet1_ref = ref_wb["Sheet1"]
            sheet2_ref = ref_wb["Sheet2"]

            for sheet in [sheet1_pro, sheet2_pro, sheet1_ref, sheet2_ref]:
                rows = iter(sheet.rows)  # turn generator into iterator, so we can call next(rows)
                next(rows)               # skip header
                summed = sum(row[79].value for row in rows)
                assert summed == 146400

            pro_wb.close()
            ref_wb.close()
            xlfile_pro.close()
            xlfile_ref.close()
...
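If you'd rather not manage the header skip by hand, OpenPyXL's iter_rows can slice straight to the last column; this is a sketch of the same sum using only standard iter_rows arguments:

# values_only=True yields plain values instead of Cell objects;
# min_row=2 skips the header, min_col/max_col restrict to column 80
summed = sum(
    row[0]
    for row in sheet.iter_rows(min_row=2, min_col=80, max_col=80, values_only=True)
)
assert summed == 146400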
As far as I can tell, those two do the same trivial work: summing the integer values in the last column.
Here's how they performed on my M1 MacBook Air (with the 2 TB, split-channel SSD):
| Process  | Time (s) | Memory (MB) |
|----------|----------|-------------|
| OpenPyXL | 38       | 35          |
| Pandas   | 51       | 60          |
So, OpenPyXL runs in about 75% of the time Pandas takes, and uses a little less memory. Note that OpenPyXL only beats Pandas with the read_only=True option set; without it, it runs slightly slower than Pandas.
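For reference, a rough harness like this is enough to reproduce the comparison (a sketch, not the exact tooling I used; tracemalloc only sees Python-level allocations, so its numbers won't line up exactly with the table):

import time
import tracemalloc

tracemalloc.start()
start = time.perf_counter()
run_comparison()  # hypothetical stand-in for either of the two loops above
elapsed = time.perf_counter() - start
_, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"{elapsed:.1f}s, peak {peak / 1e6:.0f}MB (Python allocations only)")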
No matter your machine or setup, reading the XLSX file is the bottleneck.
If the data had started out as CSV, with two CSVs per "workbook" to account for the two sheets, Pandas runs in 3 seconds and Python's csv module runs in 0.3 seconds.
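For completeness, here's a minimal sketch of that csv-module version, assuming a hypothetical layout where each ZIP holds one CSV per sheet (the member names in the comment are made up):

import csv
import glob
import io
from zipfile import ZipFile

for zip_file in sorted(glob.glob("pro*.zip")):
    with ZipFile(zip_file) as f_zip:
        for member in f_zip.namelist():  # e.g. pro_01_sheet1.csv, pro_01_sheet2.csv
            with f_zip.open(member) as f:
                # ZIP members read as bytes; wrap in a text stream for csv
                reader = csv.reader(io.TextIOWrapper(f, encoding="utf-8"))
                next(reader)  # skip header
                assert sum(int(row[79]) for row in reader) == 146400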