Convert very large PDF to images with python

Question

I have an extremely large PDF containing scans that are approximately 30,000 px wide (wtf!). I have a Python script that works well for normal-sized PDFs, but when confronted with this large PDF it outputs only 1-pixel-wide white squares as images.

The problem occurs at the convert_from_path step, since the images received by save_img are already a single pixel.

from PIL import Image
from pdf2image import pdfinfo_from_path, convert_from_path
from pathlib import Path

# Setting MAX_IMAGE_PIXELS to None removes Pillow's pixel-count limit (its
# "decompression bomb" safeguard) so that very large page images can be
# processed. NOTE(review): only appropriate for trusted input files.
Image.MAX_IMAGE_PIXELS = None

def pdf_to_img(pdf_path, dpi=500, size=None):
    """
    Convert the PDF file to JPEG images

    Pages are rendered in batches of two with ``convert_from_path`` and each
    rendered page is handed to ``save_img`` under the name
    ``<pdf name>_<page number>.jpg`` (page number zero-padded to 4 digits).

    :param pdf_path: path of the PDF file, as a ``str`` or a ``Path``
    :param dpi: rendering resolution forwarded to ``convert_from_path``
    :param size: optional target size forwarded unchanged to
        ``convert_from_path``; ``None`` (default) keeps the previous
        behaviour. Bounding the size at render time avoids asking the
        renderer for a full-resolution bitmap of a very large page, which is
        only shrunk afterwards by ``save_img`` anyway.

    Errors raised while rendering or saving are printed, not re-raised.
    Errors from ``pdfinfo_from_path`` propagate to the caller.
    """
    # Path.stem keeps dotted names intact ("scan.v2.pdf" -> "scan.v2") and
    # works with any path separator, unlike splitting on "/" and ".".
    pdf_name = Path(pdf_path).stem
    pdf_file = str(pdf_path)
    pdf_info = pdfinfo_from_path(pdf_file, userpw=None, poppler_path=None)
    page_nb = pdf_info["Pages"]
    # Render only a couple of pages at a time to keep memory usage bounded.
    batch_size = 2
    try:
        for first_page in range(1, page_nb + 1, batch_size):
            batch_pages = convert_from_path(
                pdf_file,
                dpi=dpi,
                size=size,
                first_page=first_page,
                last_page=min(first_page + batch_size - 1, page_nb),
            )
            # Number the pages of the batch from the batch's first page.
            for page_number, page in enumerate(batch_pages, start=first_page):
                save_img(page, f"{pdf_name}_{page_number:04d}.jpg")
    except Exception as e:
        print(f"[pdf_to_img] Failed to convert {pdf_name}.pdf to images:\n{e} ({e.__class__.__name__})")


def save_img(
    img,
    img_filename,
    img_path=Path("./output"),
    error_msg="Failed to save img",
    max_dim=2500,
    img_format="JPEG",
):
    """
    Save an image as ``img_path / img_filename``, shrinking it first if needed

    :param img: image object exposing ``width``, ``height``, ``thumbnail``
        and ``save`` (a Pillow image in this script)
    :param img_filename: file name of the saved image
    :param img_path: destination directory (``Path`` or ``str``); it is
        created, including parents, when it does not exist yet
    :param error_msg: message printed when saving fails
    :param max_dim: maximum width and height, in pixels, of the saved image
    :param img_format: format name passed to ``img.save``
    :return: ``True`` when the image was saved, ``False`` otherwise (the
        error is printed, never raised)

    Note: when the image is larger than ``max_dim`` it is resized in place,
    so the caller's ``img`` object is modified.
    """
    try:
        # Accept plain strings too; "/" below needs a Path on the left side.
        target_dir = Path(img_path)
        # Saving fails if the directory is missing, so create it up front.
        target_dir.mkdir(parents=True, exist_ok=True)
        if img.width > max_dim or img.height > max_dim:
            img.thumbnail(
                (max_dim, max_dim), Image.Resampling.LANCZOS
            )
        img.save(target_dir / img_filename, format=img_format)
        return True
    except Exception as e:
        print(f"[save_img] {error_msg}:\n{e} ({e.__class__.__name__})")
    return False

Do you know what I can do or how to improve my code? I have tried several libraries (wand, reportlab) with no success. Thank you so much for your help!

You may want to look at PyMuPDF. It can directly render pages, and requires no extra packages. — Jorj McKie, Jul 11 '23 at 09:11

score 1 · Answer 1 · answered Jul 11 '23 at 15:28

I managed to convert the PDF into images using poppler-utils and the subprocess module:

import subprocess

def convert_large_pdf(pdf_path):
    """
    Convert a PDF into JPEG images by calling poppler's ``pdftoppm``

    The images are written to ``./output`` (created if missing) using the
    PDF's base name as file prefix and ``_`` as separator before the page
    number. Pages are rendered at 300 dpi and scaled so that their longest
    side is 2500 pixels.

    :param pdf_path: path of the PDF file, as a ``str`` or a ``Path``
    :raises subprocess.CalledProcessError: if ``pdftoppm`` exits with a
        non-zero status
    :raises FileNotFoundError: if the ``pdftoppm`` executable is not found
    """
    pdf_path = Path(pdf_path)
    output_folder = Path("./output")
    # pdftoppm only writes files; it needs the target directory to exist.
    output_folder.mkdir(parents=True, exist_ok=True)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and rules out command injection via pdf_path.
    command = [
        "pdftoppm",
        "-jpeg",
        "-r", "300",
        "-scale-to", "2500",
        "-sep", "_",
        str(pdf_path),
        str(output_folder / pdf_path.stem),
    ]
    subprocess.run(command, check=True)

Convert very large PDF to images with python

1 Answer