Crop PDF content with Python, not just the cropbox

Question

I am trying to create a script that crops parts of a PDF, merges them into a single page, and saves the result to another PDF file. The problem is that when I change the crop box and merge the page, it keeps the cropped data and just hides it. This is a problem because I want to process the output PDF with a parser that extracts the text from the page, rather than using OCR.

Does anyone know how to crop the page and delete the data outside the bounding box?

For example: In this PDF file, I want to crop the two blue boxes and merge them into a single page output file. But when I do this and later try to select the text, it still includes the hidden text.

from fitz import Document, Page, Rect

# Regions to lift out of the input PDF. Every entry names the source page
# (zero-based) and the area on that page as a Rect.
boxes = [
    {'page_number': 0, 'rect': Rect(0, 54, 595, 189)},
    {'page_number': 0, 'rect': Rect(0, 317, 595, 459)},
]

# The output page must be as wide as the widest box and as tall as all boxes
# stacked on top of each other; one extra unit is added in each direction.
widest_box = max(box['rect'].width for box in boxes)
stacked_height = sum(box['rect'].height for box in boxes)
new_page_rect = Rect(0, 0, widest_box + 1, stacked_height + 1)

# Both documents are closed automatically when the with-block ends
with Document(r"lorem_ipsum.pdf") as input_document, Document() as output_document:

    # Single output page that receives every box
    new_page: Page = output_document.new_page(
        width=new_page_rect.width,
        height=new_page_rect.height,
    )

    # Top edge (on the output page) at which the next box will be placed
    last_y_coord = 0

    for box in boxes:
        source_rect = box['rect']

        # Append a copy of the source page to the input document and crop it
        # to the box. NOTE: the crop box only limits what is visible; the
        # content outside of it stays in the page, which is the problem this
        # script is meant to demonstrate.
        input_document.copy_page(box['page_number'])
        page = input_document[-1]
        page.set_cropbox(source_rect)

        # Where the box lands on the output page: flush left, directly below
        # the previously placed box
        top = last_y_coord
        bottom = top + source_rect.height
        rect = Rect(0, top, source_rect.width, bottom)

        # Leave a gap of one unit before the next box
        last_y_coord = rect.y1 + 1

        # Render the cropped copy into its slot on the output page
        new_page.show_pdf_page(rect, input_document, page.number)

    # Write the result
    output_document.save(filename=r"output_PyMuPDF.pdf", garbage=3, deflate=True, pretty=True)

I thought the problem could be with the PyMuPDF library, but I tried the equivalent code with PyPDF2 and got the same problem.

import io
import PyPDF2
from PyPDF2 import Transformation
from copy import copy

# Regions to extract from the input PDF.
# Each entry holds the zero-based page number and the box as
# (left, top, right, bottom), with y measured downwards from the top of the
# page (the loop below converts this to PDF's bottom-up y axis).
boxes = [
    {
        'page_number': 0,
        'rect': (0, 54, 595, 189)
    },
    {
        'page_number': 0,
        'rect': (0, 317, 595, 459)
    }
]

# Width of the new page: the widest box, plus one unit
new_page_width = max(box['rect'][2] - box['rect'][0] for box in boxes) + 1

# Height of the new page: all boxes stacked, plus one unit
new_page_height = sum(box['rect'][3] - box['rect'][1] for box in boxes) + 1

# Open the input PDF file and create the output PDF file
with open(r"lorem_ipsum.pdf", "rb") as input_file, open(r"output_PyPDF2.pdf", "wb") as output_file:

    # PdfReader / PdfWriter are the current names of the reader and writer
    # classes; the old PdfFileReader / PdfFileWriter names are deprecated and
    # were removed in PyPDF2 3.0.
    reader = PyPDF2.PdfReader(input_file)
    writer = PyPDF2.PdfWriter()

    # Blank page that receives every box
    new_page = PyPDF2.PageObject.create_blank_page(
        pdf=None,
        width=new_page_width,
        height=new_page_height
    )

    # y-coordinate (PDF space, origin at the bottom) of the top edge at which
    # the next box will be placed; starts at the top of the new page
    last_y_coord = new_page_height

    for box in boxes:
        # Work on a copy so the page held by the reader is not modified
        page = copy(reader.pages[box['page_number']])

        # Height of the source page, needed to flip the y axis
        # NOTE(review): assumes the media box starts at y = 0 - confirm for
        # PDFs whose media box has a non-zero lower-left corner.
        page_height = page.mediabox.upper_right[1]

        # Box corners in PDF coordinates (origin bottom-left):
        # (x0, y0) is the lower-left corner, (x1, y1) the upper-right corner
        x0 = box['rect'][0]
        y0 = page_height - box['rect'][3]
        x1 = box['rect'][2]
        y1 = page_height - box['rect'][1]

        # Translation that moves the box flush left and puts its top edge at
        # last_y_coord on the new page
        tx = -x0
        ty = last_y_coord - y1

        transformation = Transformation().translate(
            tx=tx,
            ty=ty
        )

        # Move the page content
        page.add_transformation(transformation)

        # Move the crop box by the same translation as the content. Both
        # offsets must be applied; shifting only y would leave the crop box
        # in the wrong place for any box whose left edge is not at x = 0.
        # NOTE: the crop box only clips what is drawn; content outside of it
        # is still part of the merged page.
        page.cropbox.lower_left = (x0 + tx, y0 + ty)
        page.cropbox.upper_right = (x1 + tx, y1 + ty)

        # Merge the transformed page onto the new page
        new_page.merge_page(page)

        # Next box goes below this one, separated by a gap of one unit
        last_y_coord -= (y1 - y0 + 1)

    # After all boxes have been processed, add the new page to the writer
    writer.add_page(new_page)

    # Write the output PDF file
    writer.write(output_file)

An alternative approach: take images of the marked boxes and run OCR on those images to get the text. As an extra check, convert the PDF to text using pdftext and compare the OCR output with the PDF text. — Gunesh Shanbhag, Dec 30 '22 at 16:51

score 1 · Answer 1 · answered Dec 31 '22 at 08:49

With PyMuPDF you have the option to (temporarily) remove stuff outside the areas of interest on the input page before you use that page in method show_pdf_page() of the target page. This works by using redaction annotations. "Temporarily" means you would be modifying the source page and afterwards would not save the source PDF.

If you have a list, include, of rectangles on the same source page whose content should be included, you would have to compute a list, exclude, of the rectangles of the source page that you do not want, i.e. the complement areas. Using redaction annotations, remove the content of all rectangles in exclude before using that page in show_pdf_page(). Snippet:

# Mark every unwanted (complement) rectangle of the source page for redaction.
# This modifies the source page, so the source PDF should not be saved afterwards.
for rect in exclude:
    source_page.add_redact_annot(rect)
source_page.apply_redactions()  # clears areas to exclude
# Show each wanted rectangle on the target page.
# <target_rect> is a placeholder for the destination rectangle of that piece.
for rect in include:
    target_page.show_pdf_page(<target_rect>, source, source_page.number, clip=rect)

score 0 · Accepted Answer · answered Dec 31 '22 at 19:54

As @Jorj McKie mentioned, I can achieve what I want using the page.add_redact_annot() and page.apply_redactions() functions. In my specific case, I need to create boxes around the box that I want to show in order to cover the rest of the page, like this:

    for box in boxes:
        # Area of the source page that must survive
        source_rect = box['rect']

        # Duplicate the source page with fullcopy_page() instead of
        # copy_page(): copy_page() only adds another reference to the same
        # page object, so redacting one "copy" would also erase the content
        # that a later box on the same source page still needs.
        input_document.fullcopy_page(box['page_number'])
        # Get the duplicated page (it is appended at the end of the document)
        page = input_document[-1]

        # Bounds of the still uncropped page. The redactions are added before
        # the crop box is changed, because changing the crop box moves the
        # origin of the page's coordinate system.
        # NOTE(review): assumes box['rect'] uses the same coordinates as
        # page.rect, i.e. the source page has no pre-existing crop box offset
        # - confirm for such PDFs.
        page_bounds = page.rect

        # Cover everything around source_rect with up to four redaction
        # rectangles (above, below, left, right). They are derived from the
        # SOURCE box, not from the position on the output page.
        if source_rect.y0 > page_bounds.y0:
            page.add_redact_annot(
                Rect(page_bounds.x0, page_bounds.y0, page_bounds.x1, source_rect.y0)
            )
        if source_rect.y1 < page_bounds.y1:
            page.add_redact_annot(
                Rect(page_bounds.x0, source_rect.y1, page_bounds.x1, page_bounds.y1)
            )
        if source_rect.x0 > page_bounds.x0:
            page.add_redact_annot(
                Rect(page_bounds.x0, source_rect.y0, source_rect.x0, source_rect.y1)
            )
        if source_rect.x1 < page_bounds.x1:
            page.add_redact_annot(
                Rect(source_rect.x1, source_rect.y0, page_bounds.x1, source_rect.y1)
            )

        # Physically remove the content inside the redaction rectangles
        # NOTE(review): text that straddles the edge of the box may be removed
        # as well - verify with boxes that cut through a line of text.
        page.apply_redactions()

        # Now restrict the visible area of the duplicated page to the box
        page.set_cropbox(source_rect)

        # Calculate the rectangle coordinates for the current box on the new page
        rect = Rect(
            0,
            last_y_coord,
            source_rect.width,
            last_y_coord + source_rect.height,
        )

        # Update the y-coordinate for the top of the next box to the bottom of the current box
        last_y_coord = rect.y1 + 1

        # Draw the redacted, cropped page onto the new page using the calculated rectangle coordinates
        new_page.show_pdf_page(rect, input_document, page.number)

Crop PDF content with Python, not just the cropbox

2 Answers