I have a large pdf file with very specific formatting, a bunch of reports if you will, all in one big pdf document. I'm using pdfplumber to extract specific text within a bounding box on each page. I've called this variable scene_text. The value of scene_text changes throughout the document, but many pages contain the same value for scene_text. I want to separate the large pdf into multiple smaller pdf files named according to their scene_text value with each pdf file containing all of the pages with matching scene_text. I'm terribly stuck, any help would be appreciated.
import pdfplumber
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
file = 'report.pdf'
with pdfplumber.open(file) as pdf:
for i, page in enumerate(pdf.pages):
# get scene text for current page
bounding_box = (880, 137, 1048, 180)
scene_text = page.within_bbox(bounding_box, relative=True).extract_text()
previous_page_text = pdf.pages[i-1].within_bbox(bounding_box, relative=True).extract_text()
inputpdf = PdfFileReader(open(file, "rb"))
output = PdfFileWriter()
for x, page in enumerate(pdf.pages):
st2 = page.within_bbox(bounding_box, relative=True).extract_text()
if st2 != previous_page_text:
output.addPage(inputpdf.getPage(i))
if st2 == scene_text:
if st2 == pdf.pages[x+1].within_bbox(bounding_box, relative=True).extract_text():
previous_page_text = st2
with open("page_export/" + scene_text + ".pdf", "wb") as output_stream:
output.write(output_stream)