1

I have a large pdf file with very specific formatting, a bunch of reports if you will, all in one big pdf document. I'm using pdfplumber to extract specific text within a bounding box on each page. I've called this variable scene_text. The value of scene_text changes throughout the document, but many pages contain the same value for scene_text. I want to separate the large pdf into multiple smaller pdf files named according to their scene_text value with each pdf file containing all of the pages with matching scene_text. I'm terribly stuck, any help would be appreciated.

import pdfplumber
from PyPDF2 import PdfFileWriter, PdfFileReader
import os

file = 'report.pdf'

with pdfplumber.open(file) as pdf:
    for i, page in enumerate(pdf.pages):
        # get scene text for current page
        bounding_box = (880, 137, 1048, 180)
        scene_text = page.within_bbox(bounding_box, relative=True).extract_text()
        previous_page_text = pdf.pages[i-1].within_bbox(bounding_box, relative=True).extract_text()

        inputpdf = PdfFileReader(open(file, "rb"))
        output = PdfFileWriter()
        for x, page in enumerate(pdf.pages):
            st2 = page.within_bbox(bounding_box, relative=True).extract_text()
            if st2 != previous_page_text:
                output.addPage(inputpdf.getPage(i))
            if st2 == scene_text:
                if st2 == pdf.pages[x+1].within_bbox(bounding_box, relative=True).extract_text():
                    previous_page_text = st2
            
            with open("page_export/" + scene_text + ".pdf", "wb") as output_stream:
                    output.write(output_stream)
John
  • 11
  • 1

0 Answers0