I have an extremely large PDF containing scans that are approximately 30,000 px wide (!). I have a Python script that works well for normal-sized PDFs, but when given this large PDF it outputs only 1-pixel-wide white squares as images.
The problem occurs at the `convert_from_path` step, since the images received by `save_img` are already a single pixel.
from PIL import Image
from pdf2image import pdfinfo_from_path, convert_from_path
from pathlib import Path
# Lift Pillow's pixel-count limit so that very large page scans are not
# rejected when loaded (only appropriate for trusted input files).
Image.MAX_IMAGE_PIXELS = None
def pdf_to_img(pdf_path, dpi=500, size=None):
    """
    Convert the PDF file to JPEG images

    Pages are rasterised in batches of two with ``convert_from_path`` and each
    page is handed to ``save_img`` under the name
    ``<pdf stem>_<page number on 4 digits>.jpg``.

    :param pdf_path: location of the PDF (string or path-like object)
    :param dpi: resolution passed to ``convert_from_path``
    :param size: optional value forwarded unchanged to the ``size`` argument
        of ``convert_from_path``; the default ``None`` keeps the previous
        behaviour (rendering driven by ``dpi`` only). Passing a bound here is
        intended for PDFs whose pages are too large to rasterise at full
        resolution -- see the pdf2image documentation for accepted values.
    :return: None; conversion errors are printed, not raised. Errors raised
        while reading the PDF metadata still propagate to the caller.
    """
    # Path.stem keeps dotted names intact ("a.b.pdf" -> "a.b") and copes with
    # path-like inputs, unlike manual splitting on "/" and ".".
    pdf_name = Path(pdf_path).stem
    pdf_info = pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None)
    page_nb = pdf_info["Pages"]
    # Only a couple of pages are held in memory at any one time.
    batch_size = 2

    try:
        for first_page in range(1, page_nb + 1, batch_size):
            batch_pages = convert_from_path(
                pdf_path,
                dpi=dpi,
                first_page=first_page,
                last_page=min(first_page + batch_size - 1, page_nb),
                size=size,
            )
            # Number pages from the first page of the batch so file names
            # follow the page order of the document.
            for page_number, page in enumerate(batch_pages, start=first_page):
                save_img(page, f"{pdf_name}_{page_number:04d}.jpg")
    except Exception as e:
        # Best-effort conversion: report the failure and stop processing.
        print(f"[pdf_to_img] Failed to convert {pdf_name}.pdf to images:\n{e} ({e.__class__.__name__})")
def save_img(
    img,
    img_filename,
    img_path=Path("./output"),
    error_msg="Failed to save img",
    max_dim=2500,
    img_format="JPEG",
):
    """
    Save an image to disk, shrinking it first when it exceeds ``max_dim``

    :param img: image object exposing ``width``, ``height``, ``thumbnail``
        and ``save`` (a PIL image in this script)
    :param img_filename: name of the file to write inside ``img_path``
    :param img_path: output directory (string or ``Path``); it is created,
        including missing parents, when it does not exist yet
    :param error_msg: text printed in front of the error details on failure
    :param max_dim: largest allowed width/height in pixels before the image
        is downscaled
    :param img_format: format name passed to ``img.save``
    :return: True when the file was written, False when any step failed
    """
    try:
        # Accept plain strings as well as Path objects for the directory.
        target_dir = Path(img_path)
        # Without this, every save fails until the folder is created by hand.
        target_dir.mkdir(parents=True, exist_ok=True)

        if img.width > max_dim or img.height > max_dim:
            # NOTE: thumbnail() resizes the caller's image object in place.
            img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)

        img.save(target_dir / img_filename, format=img_format)
        return True
    except Exception as e:
        # Best-effort: report the problem and let the caller carry on.
        print(f"[save_img] {error_msg}:\n{e} ({e.__class__.__name__})")
        return False
Do you know what I can do, or how I can improve my code? I have tried several other libraries (`wand`, `reportlab`) with no success. Thank you so much for your help!