
So, I am using tabula to scrape a ton of PDF reports. For anonymity's sake, let's assume these reports are about shoes.

- I have a root folder where each shoe report has a folder named SHR-<some random number>.
  - Inside there will be a PDF file that is the actual report.
    - This report contains tables, some of which I must extract and compile into one big summary table and write to a CSV file.
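
So the layout looks roughly like this (the folder and file names here are made up for illustration):

//networkdrive/folder/folder1/folder2/
    SHR-1234/
        SHR-1234_report.pdf
    SHR-5678/
        SHR-5678_report.pdf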

I put together the script below, which just about does the job, thanks to Stack Overflow. I am not a software developer by profession. Currently it takes a little over one minute to process one PDF file.

How can I make it go faster (if possible)? I have written a comment explaining what each line does.

import tabula as tb
import pandas as pd
from pathlib import Path
import glob
import os
from datetime import datetime

dt=datetime.now() #get current timestamp
timestamp=dt.strftime("%d-%b-%Y (%H-%M)") #format the timestamp string
resultfolder=r"C:/Users/xxxxxxxxxxxxxxxxxxxxxxxxxx/Analysis_results" #this is the root directory to store all result folders and files

csvpath= resultfolder + '/' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S') #define a path for a new folder with a timestamp in its name, created every time this script runs
os.makedirs(csvpath) #make the new folder
path = Path(csvpath) #make the new result subfolder path
fpath= (path / timestamp ).with_suffix('.csv') #make a filepath for a csv file with timestamp in its name
f_public = r"//networkdrive/folder/folder1/folder2" #this is the root folder for all my pdf reports
os.chdir(f_public) #change to the target pdf reports root folder
dbase=pd.DataFrame() # I use this dataframe to store the extracted rows
dbase= dbase.assign(ColA=['Shoe manufacturer','Shoe name','Shoe type','Shoe type','Shoe size','Date code', 'ShoeCode', 'ShoeLink']) # define the first column and name it ColA
unreadcount=0 #counter for pdf files throwing exception while reading
unreadlist=[] #list of unreadable pdf files
ShoeCode=[] #list of current shoe reportcode
ShoeFileName=[] # list of current shoe report file name

for bfr in glob.glob('SHR-*'): # go through folders starting with SHR- in this root folder, since all shoe reports are named SHR-something
    try:
        ShoeCode.append(bfr) #append the shoe report code to list
        pdfpath = f_public + '/'+ bfr # figure out the path to this shoe report's folder - which will have the actual shoe report pdf file inside
        os.chdir(pdfpath) #go inside the report folder
        file = [f for f in os.listdir('.') if os.path.isfile(f) and f.endswith('.pdf')] #find the files with a pdf extension in this folder
        ShoeFileName.append(file[0]) # add the file name to the list
        pdffile=pdfpath + '/'+ file[0] # figure out the pdf file path
        tables = tb.read_pdf(pdffile, pages="all", pandas_options={'header': None}) # extract all tables in pdf file as a list of dataframes
        for tbl in tables:
            try:
                if(('Shoe name' in tbl.values) and ('Package size' in tbl.values)): #sometimes there are tables with just the shoe name, so I make sure the table has both these fields
                    tbl.columns= ['ColA', 'ColB'] # the tables will always have just two columns. I name them ColA and ColB
                    tbl.loc[tbl.index.max() + 1] = ['ShoeCode',  bfr] #ShoeCode is not mentioned in the tables in the report - I add it to dataframe from the folder name
                    tbl.loc[tbl.index.max() + 1] = ['ShoeLink',  pdffile] #sameway i store the filepath 
                    dbase = dbase.merge(tbl,how='left', on='ColA') # once the right table is found, merge it to the dbase dataframe
                    print(bfr) #these prints are only to see what's happening during development - I will remove them later
                    print('\n')
                    print('Final BFR Database: \n', dbase)
            except:
                continue
    except:
        print('Could not read ', bfr, '\n') #if reading this report fails - take note, print a message and continue (file may not be set yet at this point, so only the folder name is recorded)
        unreadcount+=1
        unreadlist.append(bfr)
        continue
dbase=dbase.transpose() #rotate the dataframe so that all the values are now rows
dbase.to_csv(fpath, index=False, header = False) #write this to a csv file
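
One idea I had was to run the report folders in parallel, since each PDF is independent, but I am not sure if that is the right direction. Below is a rough, untested sketch of what I mean - process_report is my own placeholder name wrapping the per-folder work from the loop above, and the merging into dbase would still happen afterwards the same way as before.

import glob
import os
from concurrent.futures import ProcessPoolExecutor

import tabula as tb

f_public = r"//networkdrive/folder/folder1/folder2" # same pdf reports root folder as above

def process_report(bfr): # do the per-folder work for one report; return a two-column table or None
    pdfpath = f_public + '/' + bfr
    pdfs = [f for f in os.listdir(pdfpath) if f.endswith('.pdf')] # find the pdf in this report folder
    if not pdfs:
        return None
    pdffile = pdfpath + '/' + pdfs[0]
    try:
        tables = tb.read_pdf(pdffile, pages="all", pandas_options={'header': None})
    except Exception:
        return None # unreadable pdf - skip it, like the except branch above
    for tbl in tables:
        try:
            if ('Shoe name' in tbl.values) and ('Package size' in tbl.values):
                tbl.columns = ['ColA', 'ColB']
                tbl.loc[tbl.index.max() + 1] = ['ShoeCode', bfr]
                tbl.loc[tbl.index.max() + 1] = ['ShoeLink', pdffile]
                return tbl
        except Exception:
            continue
    return None

if __name__ == '__main__': # this guard is required for ProcessPoolExecutor on Windows
    codes = [os.path.basename(p) for p in glob.glob(f_public + '/SHR-*')] # the SHR-... folder names
    with ProcessPoolExecutor() as pool: # several tabula calls run at once, one per worker
        results = list(pool.map(process_report, codes))
    found = [t for t in results if t is not None] # tables ready to be merged into dbase as above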
Comment from C_Z_ (Jul 25 '22): A good first step would be to run a profiler on your code (https://docs.python.org/3/library/profile.html) to find out which lines are taking the longest.
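
A minimal sketch of that suggestion, assuming the script above is saved as shoe_scraper.py (a placeholder name):

import cProfile
import pstats

cProfile.run('exec(open("shoe_scraper.py").read())', 'shoe_scraper.prof') # run the script under the profiler and save the stats
stats = pstats.Stats('shoe_scraper.prof')
stats.sort_stats('cumulative').print_stats(15) # show the 15 call paths with the largest cumulative time
# or, equivalently, from the command line: python -m cProfile -s cumulative shoe_scraper.py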
