I am trying to extract specific information from every PDF file in a folder into a single CSV file. Each PDF has the information spread across multiple pages. However, something is wrong with my loop or how it is implemented, and I am not quite sure why. The output CSV contains all six headers, but only has information for two of the columns. The information for these two columns is found on the first page of each PDF, whereas the information for the other four missing columns is found entirely after the first page. For some reason I cannot get it to loop through each page of each PDF.
Edit: the regexes look off (identical) because I had to anonymize them before posting as I could not post the actual text used here.
I have tried a few different iterations of the following code with no change in output:
import os
import csv
import re
import fitz
# Regular-expression patterns for the six fields extracted from each PDF.
# NOTE(review): several patterns look identical only because the real text
# was anonymized before posting (see the question's edit).
I_T_Pattern = r'Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4}) \(Lorem ipsum dolor sit amet\)'  # group 1: "Month D, YYYY" date
R_P_pattern = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'  # group 1: integer
R_T_pattern = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'  # group 1: integer
NON_R_pattern = r'Lorem ipsum dolor sit amet (\d+)'  # group 1: integer
E_D_pattern = r'Lorem ipsum dolor sit amet \(“Lorem ipsum dolor sit amet”\), Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4})'  # group 1: "Month D, YYYY" date
L_pattern = r'“Lorem ipsum dolor sit amet”\)\.(\s+)(\w+[\s\w]*),'  # group 2 is the value used; group 1 is only whitespace
def extract_contract_data(pdf_file):
    """Extract the six contract fields from every page of *pdf_file*.

    Opens the PDF with PyMuPDF, scans each page's text against the
    module-level regex patterns, and keeps the first match found for each
    field across the whole document.

    Args:
        pdf_file: Path to the PDF file to scan.

    Returns:
        Dict keyed by the CSV column names ("L", "E D", "I T", "R P",
        "R T", "NON R"); a value is None when its pattern never matched
        on any page.
    """
    # BUG FIX: the original had `if I_T_matchand not I_T:` (and the same
    # missing space on the R_P and R_T checks), which is a SyntaxError —
    # `I_T_matchand` is read as one undefined name. With the space restored,
    # every field is checked on every page, not just the first.
    #
    # Map each output column to its (pattern, capture group) so the page
    # loop can treat all six fields uniformly.  L_pattern's useful text is
    # in group 2 (group 1 is only whitespace).
    fields = {
        "L": (L_pattern, 2),
        "E D": (E_D_pattern, 1),
        "I T": (I_T_Pattern, 1),
        "R P": (R_P_pattern, 1),
        "R T": (R_T_pattern, 1),
        "NON R": (NON_R_pattern, 1),
    }
    results = {name: None for name in fields}
    doc = fitz.open(pdf_file)
    try:
        for page_num in range(doc.page_count):
            text = doc.load_page(page_num).get_text()
            for name, (pattern, group) in fields.items():
                # Keep only the first match seen for each field.
                if results[name] is None:
                    match = re.search(pattern, text)
                    if match:
                        results[name] = match.group(group)
    finally:
        # Close the document even if a page fails to parse.
        doc.close()
    return results
def main():
    """Extract contract data from every PDF in a folder into one CSV file."""
    # Directory containing the input PDFs.
    pdf_dir = r'C:\\path'
    # Collect the PDF files (case-insensitive extension match, so
    # "FILE.PDF" is not silently skipped).
    pdf_files = [
        os.path.join(pdf_dir, f)
        for f in os.listdir(pdf_dir)
        if f.lower().endswith(".pdf")
    ]
    # One result dict per PDF.
    data = [extract_contract_data(pdf_file) for pdf_file in pdf_files]
    # NOTE(review): this must be the path of the output .csv *file* — the
    # original reused the same anonymized string as pdf_dir above; confirm
    # it points at a file, not the input directory.
    output_file = r'C:\\path'
    fieldnames = ["L", "E D", "I T", "R P", "R T", "NON R"]
    # newline='' is required by the csv module; utf-8 avoids platform-
    # dependent encoding of the extracted text.
    with open(output_file, "w", newline='', encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f"output: {output_file}")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()