PyMuPdf extract pdf information into a csv file, from multiple files. Why is this code only extracting data from the first page of each PDF?

Question

I am trying to extract specific information from every PDF file in a folder into a single CSV file. Each PDF has the information spread across multiple pages. However, something is wrong with my loop or how it is implemented, and I am not quite sure what. The output CSV contains all six headers, but only has information for two of the columns. The information in these two columns is found on the first page of each PDF, whereas the information for the other four (empty) columns is found entirely after the first page. For some reason I cannot get the code to loop through each page of each PDF.

Edit: the regexes look off (identical) because I had to anonymize them before posting as I could not post the actual text used here.

I have tried a few different iterations of the following code with no change in output:

import os
import csv
import re
import fitz

# Define regular expressions patterns to match the desired information.
# The literal text was anonymised before posting, so only the capture groups
# are meaningful here.
# Group 1: "<word> <1-2 digits>, <4 digits>" followed by a parenthesised phrase.
I_T_Pattern = r'Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4}) \(Lorem ipsum dolor sit amet\)'
# Group 1: a run of digits between two phrases.
# NOTE: as posted, R_P_pattern and R_T_pattern are the same string, so they
# will always capture the same value from any given text.
R_P_pattern = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'
R_T_pattern  = r'Lorem ipsum dolor sit amet (\d+) Lorem ipsum dolor sit amet'
# Group 1: a run of digits after a phrase. As posted this is a prefix of the
# two patterns above, so it matches wherever they match.
NON_R_pattern  = r'Lorem ipsum dolor sit amet (\d+)'
# Group 1: "<word> <1-2 digits>, <4 digits>" after a quoted, parenthesised phrase.
E_D_pattern = r'Lorem ipsum dolor sit amet \(“Lorem ipsum dolor sit amet”\), Lorem ipsum dolor sit amet (\w+ \d{1,2}, \d{4})'
# Group 1: the whitespace after the closing '”).'; group 2: the word characters
# and whitespace that follow, up to a comma.
L_pattern = r'“Lorem ipsum dolor sit amet”\)\.(\s+)(\w+[\s\w]*),'

def extract_contract_data(pdf_file):
    """Extract the six contract fields from one PDF.

    Every page is searched with the module-level patterns. For each field the
    first match found (in page order) is kept; later pages never overwrite a
    value that has already been captured.

    Args:
        pdf_file: Path of the PDF to open with ``fitz.open``.

    Returns:
        A dict with the keys ``"L"``, ``"E D"``, ``"I T"``, ``"R P"``,
        ``"R T"`` and ``"NON R"``. A value is the captured string, or ``None``
        when the corresponding pattern matched on no page.
    """
    # Output column -> (pattern, capture group to keep). L_pattern keeps
    # group 2 because its group 1 only captures the leading whitespace.
    targets = {
        "L": (L_pattern, 2),
        "E D": (E_D_pattern, 1),
        "I T": (I_T_Pattern, 1),
        "R P": (R_P_pattern, 1),
        "R T": (R_T_pattern, 1),
        "NON R": (NON_R_pattern, 1),
    }
    found = dict.fromkeys(targets)

    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text = page.get_text()
            # NOTE(review): the patterns contain literal single spaces. If the
            # extracted text breaks a phrase across lines, the newline will
            # not match a space -- confirm against the real page text.

            for key, (pattern, group) in targets.items():
                if found[key] is not None:
                    continue  # already captured on an earlier page
                match = re.search(pattern, text)
                if match:
                    found[key] = match.group(group)

            # Stop reading pages once every field has a value.
            if None not in found.values():
                break

    return found


def main():
    """Extract data from every PDF in ``pdf_dir`` and write it to one CSV file.

    One row is written per PDF, in sorted file-name order, under the headers
    ``L``, ``E D``, ``I T``, ``R P``, ``R T`` and ``NON R``. Fields that were
    not found are written as empty cells (``None`` values).
    """
    # Get the path to the directory that contains the PDFs.
    pdf_dir = r'C:\\path'

    # Collect the PDF files. The extension check is case-insensitive so that
    # "FILE.PDF" is not skipped, and the listing is sorted because
    # os.listdir() returns entries in arbitrary order.
    pdf_files = [
        os.path.join(pdf_dir, f)
        for f in sorted(os.listdir(pdf_dir))
        if f.lower().endswith(".pdf")
    ]

    # Extract contract data from each PDF.
    data = [extract_contract_data(pdf_file) for pdf_file in pdf_files]

    # Define the output file path.
    # NOTE(review): as posted this placeholder is the same string as pdf_dir;
    # it must point at a file, not at the input directory.
    output_file = r'C:\\path'

    fieldnames = ["L", "E D", "I T", "R P", "R T", "NON R"]
    # Write UTF-8 explicitly: the extracted text may contain characters (such
    # as the curly quotes used in the patterns) that the platform's default
    # encoding cannot represent.
    with open(output_file, "w", newline='', encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"output: {output_file}")

# Run the extraction only when this file is executed as a script, so that
# importing it (e.g. to reuse extract_contract_data) has no side effects.
if __name__ == "__main__":
    main()

You may also want to share your example PDF, your expected output and your observed output. That being said, `R_P_pattern` and `R_T_pattern` look identical, and `NON_R_pattern` looks like a sub-pattern thereof. Thus, you shouldn't be surprised at all when `R_P` and `R_T` turn out identical, and `NON_R` also has a good chance to be identical. — mkl, Jun 21 '23 at 17:14
@mkl, I expect OP's regexes are off, too (see my answer below). — Zach Young, Jun 21 '23 at 17:29

score 0 · Answer 1 · answered Jun 21 '23 at 17:21

I think your regexes are off.

I used your logic with this PDF and this code which looks for two literal strings on Pg 1 and one literal string on Pg 4:

import csv
import os
import re

import fitz

# Regular expressions for the literal strings to look for: two that appear on
# page 1 of the sample PDF and one that appears on page 4.
Pg1_Op_Manual = r"OPERATOR’S MANUAL"
Pg1_Model = r"BTS20R-1"
# The trailing full stop is escaped so it matches a literal "." only, rather
# than any character.
Pg4_Explosive = r"NEVER USE IN AN EXPLOSIVE ATMOSPHERE\."


def extract_contract_data(pdf_file):
    """Search every page of one PDF for the three sample strings.

    For each field the first match found (in page order) is kept; later pages
    never overwrite a value that has already been captured.

    Args:
        pdf_file: Path of the PDF to open with ``fitz.open``.

    Returns:
        A dict with the keys ``"Op_Manual"``, ``"Model"`` and ``"Explosive"``.
        A value is the matched text, or ``None`` when the corresponding
        pattern matched on no page.
    """
    # Output column -> pattern searched for on each page.
    patterns = {
        "Op_Manual": Pg1_Op_Manual,
        "Model": Pg1_Model,
        "Explosive": Pg4_Explosive,
    }
    found = dict.fromkeys(patterns)

    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text = page.get_text()

            for key, pattern in patterns.items():
                if found[key] is not None:
                    continue  # already captured on an earlier page
                match = re.search(pattern, text)
                if match:
                    found[key] = match.group(0)

            # Stop reading pages once every field has a value.
            if None not in found.values():
                break

    return found


def main():
    """Run the extraction over the PDFs in the current directory.

    Writes one CSV row per PDF to ``output.csv`` under the headers
    ``Op_Manual``, ``Model`` and ``Explosive``, then prints the output path.
    """
    source_dir = r"."
    columns = ["Op_Manual", "Model", "Explosive"]

    # Full paths of the directory entries whose names end in ".pdf".
    pdf_paths = []
    for entry in os.listdir(source_dir):
        if entry.endswith(".pdf"):
            pdf_paths.append(os.path.join(source_dir, entry))

    # One result dict per PDF, in listing order.
    rows = [extract_contract_data(path) for path in pdf_paths]

    output_file = r"output.csv"
    with open(output_file, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(rows)

    print(f"output: {output_file}")


# Run the extraction only when this file is executed as a script, so that
# importing it (e.g. to reuse extract_contract_data) has no side effects.
if __name__ == "__main__":
    main()

and I get the expected CSV:

Op_Manual,Model,Explosive
OPERATOR’S MANUAL,BTS20R-1,NEVER USE IN AN EXPLOSIVE ATMOSPHERE.

PyMuPdf extract pdf information into a csv file, from multiple files. Why is this code only extracting data from the first page of each PDF?

1 Answer