0

I have an extremely large PDF containing scans that are approximately 30,000 px wide (!). I have a Python script that works well for normal-sized PDFs, but when confronted with this large PDF it outputs only 1-pixel-wide white squares as images.

The problem occurs at the convert_from_path step, since the images passed to save_img are already a single pixel.

from PIL import Image
from pdf2image import pdfinfo_from_path, convert_from_path
from pathlib import Path

Image.MAX_IMAGE_PIXELS = None

def pdf_to_img(pdf_path, dpi=500, size=None):
    """
    Convert the PDF file to JPEG images

    The document is rendered in batches of two pages so that only a couple of
    page bitmaps are held in memory at once. Every rendered page is handed to
    save_img under the name "<pdf name>_<page number on 4 digits>.jpg".

    Any exception raised while rendering is caught and printed: the function
    is best-effort and does not re-raise (errors from pdfinfo_from_path, which
    runs before the loop, still propagate).

    :param pdf_path: location of the PDF, as a str or a path-like object
    :param dpi: rendering resolution forwarded to convert_from_path
    :param size: optional target size forwarded unchanged to the ``size``
        argument of convert_from_path (None keeps the previous behaviour).
        For very large pages, asking the renderer for a bounded size avoids
        building a gigantic bitmap at full ``dpi`` -- see the pdf2image
        documentation for the accepted values.
    """
    # Path.stem keeps dotted names intact ("scan.v2.pdf" -> "scan.v2") and
    # works for str and Path inputs alike, unlike manual "/" and "." splitting.
    pdf_name = Path(pdf_path).stem
    pdf_info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)
    page_nb = pdf_info["Pages"]
    step = 2
    try:
        for first_page in range(1, page_nb + 1, step):
            # Clamp the batch so the last one never runs past the final page.
            last_page = min(first_page + step - 1, page_nb)
            batch_pages = convert_from_path(
                pdf_path,
                dpi=dpi,
                first_page=first_page,
                last_page=last_page,
                size=size,
            )
            # Number pages from the batch start instead of mutating the
            # outer loop variable.
            for img_nb, page in enumerate(batch_pages, start=first_page):
                save_img(page, f"{pdf_name}_{img_nb:04d}.jpg")
    except Exception as e:
        print(f"[pdf_to_img] Failed to convert {pdf_name}.pdf to images:\n{e} ({e.__class__.__name__})")


def save_img(
    img,
    img_filename,
    img_path=Path("./output"),
    error_msg="Failed to save img",
    max_dim=2500,
    img_format="JPEG",
):
    """
    Write an image to disk, shrinking it first if it exceeds max_dim

    When the width or the height is larger than ``max_dim`` the image is
    resized in place with ``img.thumbnail`` (LANCZOS resampling) to fit a
    ``max_dim`` x ``max_dim`` box, then saved as ``img_path / img_filename``.

    :param img: image object exposing width, height, thumbnail() and save()
    :param img_filename: file name of the output image
    :param img_path: directory the file is written to
    :param error_msg: prefix of the message printed when saving fails
    :param max_dim: largest allowed width/height, in pixels
    :param img_format: format name passed to ``img.save``
    :return: True on success, False if any exception occurred (the exception
        is printed, not re-raised)
    """
    try:
        fits = img.width <= max_dim and img.height <= max_dim
        if not fits:
            box = (max_dim, max_dim)
            img.thumbnail(box, Image.Resampling.LANCZOS)
        destination = img_path / img_filename
        img.save(destination, format=img_format)
    except Exception as e:
        print(f"[save_img] {error_msg}:\n{e} ({e.__class__.__name__})")
        return False
    return True

Do you know what I can do or how to improve my code? I have tried several libraries (wand, reportlab) with no success. Thank you so much for your help!

Seglinglin
  • 447
  • 1
  • 4
  • 17

1 Answer

1

I managed to convert the PDF into images using poppler-utils and the subprocess module:

import subprocess

def convert_large_pdf(pdf_path):
    """
    Convert a (very large) PDF to JPEG images with poppler's pdftoppm

    pdftoppm is run at 300 dpi with ``-scale-to 2500`` and the images are
    written to ./output, named after the PDF with "_" as the separator before
    the page number.

    :param pdf_path: location of the PDF, as a str or a path-like object
    :raises subprocess.CalledProcessError: if pdftoppm exits with a non-zero
        status (``check=True``)
    """
    # Path.stem keeps dotted names intact ("scan.v2.pdf" -> "scan.v2").
    pdf_name = Path(pdf_path).stem
    output_folder = Path("./output")
    # pdftoppm is given an output prefix inside this folder, so make sure
    # the folder exists before running it.
    output_folder.mkdir(parents=True, exist_ok=True)
    # Passing an argument list (no shell) keeps paths containing spaces or
    # shell metacharacters intact and rules out command injection.
    command = [
        "pdftoppm",
        "-jpeg",
        "-r", "300",
        "-scale-to", "2500",
        "-sep", "_",
        str(pdf_path),
        str(output_folder / pdf_name),
    ]
    subprocess.run(command, check=True)
Seglinglin
  • 447
  • 1
  • 4
  • 17