So, I am using tabula to scrape a ton of PDF reports. For anonymity's sake, let's assume these reports are about shoes.
- I have a root folder where each shoe report has its own folder, named SHR-<some random number> (see the sketch just after this list).
  - Inside each of those folders there is a PDF file, which is the actual report.
    - The report contains tables, some of which I must extract, compile into one big summary table, and write to a CSV file.
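To make that layout concrete, here is how the folders map to a single glob pattern (just a sketch; the printed names are illustrative only):

from pathlib import Path

root = Path("//networkdrive/folder/folder1/folder2")  # the reports root (f_public in the script below)
for pdf in root.glob("SHR-*/*.pdf"):  # one report PDF inside each SHR-* folder
    print(pdf.parent.name, "->", pdf.name)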
I put together the script below, which gets the job done, thanks to Stack Overflow; I am not a software developer by profession. Currently it takes a little over one minute to process one PDF file.
How can I make it go faster (if possible)? I have added a comment explaining what each line does.
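My guess is that the tb.read_pdf call eats most of that minute, but I have not actually measured it yet. This is roughly how I intend to time it (a sketch; the path is a made-up placeholder):

import time

import tabula as tb

pdffile = "SHR-12345/report.pdf"  # placeholder path, just for timing one file
t0 = time.perf_counter()
tables = tb.read_pdf(pdffile, pages="all", pandas_options={'header': None})
print(f"read_pdf took {time.perf_counter() - t0:.1f} s")

The full script is below.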
import tabula as tb
import pandas as pd
from pathlib import Path
import glob
import os
from datetime import datetime

dt = datetime.now()  # get the current timestamp
timestamp = dt.strftime("%d-%b-%Y (%H-%M)")  # format the timestamp string

resultfolder = r"C:/Users/xxxxxxxxxxxxxxxxxxxxxxxxxx/Analysis_results"  # root directory that stores all result folders and files
csvpath = resultfolder + '/' + dt.strftime('%Y-%m-%d_%H-%M-%S')  # path for a new folder, created with a timestamp in its name every time this script runs
os.makedirs(csvpath)  # make the new folder
path = Path(csvpath)  # path object for the new result subfolder
fpath = (path / timestamp).with_suffix('.csv')  # file path for a CSV file with the timestamp in its name

f_public = r"//networkdrive/folder/folder1/folder2"  # root folder of all my PDF reports
os.chdir(f_public)  # change to the PDF reports root folder

dbase = pd.DataFrame()  # dataframe used to store the extracted rows
dbase = dbase.assign(ColA=['Shoe manufacturer', 'Shoe name', 'Shoe type', 'Shoe type', 'Shoe size', 'Date code', 'ShoeCode', 'ShoeLink'])  # define the first column and name it ColA
unreadcount = 0  # counter for PDF files that raise an exception while being read
unreadlist = []  # list of unreadable PDF files
ShoeCode = []  # list of shoe report codes
ShoeFileName = []  # list of shoe report file names

for bfr in glob.glob('SHR-*'):  # go through the folders starting with SHR- in this root folder, since all shoe reports are named SHR-something
    try:
        ShoeCode.append(bfr)  # append the shoe report code to the list
        pdfpath = f_public + '/' + bfr  # path to this shoe report's folder, which holds the actual report PDF
        os.chdir(pdfpath)  # go inside the report folder
        file = [f for f in os.listdir('.') if os.path.isfile(f) and f.endswith('.pdf')]  # find the files with a .pdf extension in this folder
        ShoeFileName.append(file[0])  # add the file name to the list
        pdffile = pdfpath + '/' + file[0]  # full path of the PDF file
        tables = tb.read_pdf(pdffile, pages="all", pandas_options={'header': None})  # extract all tables in the PDF file as a list of dataframes
        for tbl in tables:
            try:
                if ('Shoe name' in tbl.values) and ('Package size' in tbl.values):  # sometimes there are tables with just the shoe name, so make sure this table has both of these fields
                    tbl.columns = ['ColA', 'ColB']  # the tables always have exactly two columns; name them ColA and ColB
                    tbl.loc[tbl.index.max() + 1] = ['ShoeCode', bfr]  # ShoeCode is not mentioned in the report's tables, so add it from the folder name
                    tbl.loc[tbl.index.max() + 1] = ['ShoeLink', pdffile]  # the same way, store the file path
                    dbase = dbase.merge(tbl, how='left', on='ColA')  # once the right table is found, merge it into the dbase dataframe
                    print(bfr)  # these prints are only for watching progress during development; I will remove them later
                    print('\n')
                    print('Final BFR Database: \n', dbase)
            except Exception:
                continue
    except Exception:
        print("Couldn't read the PDF in", bfr, '\n')  # if reading the PDF fails, take note, print a message, and continue
        unreadcount += 1
        unreadlist.append(bfr)
        continue

dbase = dbase.transpose()  # rotate the dataframe so that all the values are now rows
dbase.to_csv(fpath, index=False, header=False)  # write the result to a CSV file
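Since each report folder is independent, one idea I had is to spread the folders over several processes. A minimal sketch of what I mean, using concurrent.futures (process_folder is a hypothetical stand-in for the per-folder body of my loop, reduced here to just the tabula read):

import concurrent.futures
import glob

import tabula as tb

def process_folder(bfr):
    # hypothetical worker: the per-folder logic from the script above would go here;
    # this stub just reads the first PDF in the folder and returns its tables
    pdfs = glob.glob(bfr + '/*.pdf')
    if not pdfs:
        return None
    return tb.read_pdf(pdfs[0], pages="all", pandas_options={'header': None})

if __name__ == '__main__':
    folders = glob.glob('SHR-*')  # assumes the working directory is the reports root
    with concurrent.futures.ProcessPoolExecutor() as pool:  # one worker per CPU core by default
        results = list(pool.map(process_folder, folders))

I am unsure how well tabula's Java subprocess copes with several workers running at once, so whether this actually helps is part of my question.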