12

I have some source PDFs and am trying to write code that extracts certain pages from them and creates new PDF files. I have a list that looks like this:

information = [(filename1,startpage1,endpage1), (filename2, startpage2, endpage2), ...,(filename19,startpage19,endpage19)].

This is my code.

from PyPDF2 import PdfFileReader, PdfFileWriter

# Open the source PDF that every output file is cut from.
reader = PdfFileReader("example.pdf")

# NOTE(review): the loop bound is the page count of example.pdf minus one,
# yet `page` is used below as an index into `information`.
for page in range(reader.getNumPages() - 1):
    writer = PdfFileWriter()
    start = information[page][1]
    end = information[page][2]
    # NOTE(review): `start < end` stops before `end`, so a range with
    # start == end adds no pages at all.
    while start < end:
        # NOTE(review): presumably getPage() takes a 0-based index while the
        # numbers in `information` are 1-based -- confirm.
        writer.addPage(reader.getPage(start))
        start += 1
        # output_filename is (re)assigned only inside the while body, so it
        # is not set for this entry when the loop body never runs.
        output_filename = "{}_{}_page_{}.pdf".format(
            information[page][0], information[page][1], information[page][2]
        )
    with open(output_filename, "wb") as out:
        writer.write(out)

But the output is weird: some files have nothing inside and some have just one page. How can I correct this?

Martin Thoma
  • 124,992
  • 159
  • 614
  • 958
SSS
  • 621
  • 2
  • 7
  • 25

4 Answers

11

I have fixed the issue. It was the missing equal sign: the loop condition must be `start <= end` so that the last page is included.

# Write one PDF per (filename, first_page, last_page) entry in `information`.
# Page numbers in `information` are 1-based and inclusive, while getPage()
# is indexed from 0 (hence the `start - 1` below).

# Create the output folder once, up front; exist_ok avoids a separate
# existence check on every iteration.
os.makedirs(savepath, exist_ok=True)

for filename, first_page, last_page in information:
    pdf_writer = PyPDF2.PdfFileWriter()  # fresh writer for each output file
    start = first_page
    while start <= last_page:  # `<=` so that the last page is included
        pdf_writer.addPage(pdfReader.getPage(start - 1))
        start += 1
    # Save into `savepath` so the folder created above is actually used.
    output_filename = os.path.join(
        savepath,
        '{}_{}_page_{}.pdf'.format(filename, first_page, last_page),
    )
    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)
SSS
  • 621
  • 2
  • 7
  • 25
8

Here is the full code: I modified SSS's answer to be portable, flexible, and concurrent across multiple source PDFs. I couldn't test the performance difference between ThreadPoolExecutor and ProcessPoolExecutor, but I assumed that the extraction process is bound by the reading and writing of the PDFs rather than by getPage and addPage.

import concurrent.futures
from multiprocessing import freeze_support
from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter


def pdf_extract(pdf, segments):
    """
    Write each requested page range of ``pdf`` to its own PDF file.

    Every output file is created in the same directory as the source and is
    named ``<stem>_pages_<start>-<end><suffix>``.

    pdf: str | Path
    segments: [(start, end), {'start': int, 'end': int}]
        Page numbers are 1-based and ``end`` is inclusive.
    """
    source = Path(pdf)
    with open(source, 'rb') as read_stream:
        pdf_reader = PdfFileReader(read_stream)
        for segment in segments:
            # support {'start': 3, 'end': 3} or (start, end)
            try:
                start_page, end_page = segment['start'], segment['end']
            except TypeError:
                # Sequences reject string indices, so unpack positionally.
                start_page, end_page = segment
            pdf_writer = PdfFileWriter()
            # getPage() is 0-based: shift the start down by one and let
            # range() stop right after the (inclusive) end page.
            for page_num in range(start_page - 1, end_page):
                pdf_writer.addPage(pdf_reader.getPage(page_num))
            # with_name() replaces only the final path component, so the
            # output stays beside the source for relative and absolute paths
            # alike (joining parent with a full path duplicated the directory
            # for relative inputs).
            output = source.with_name(
                f'{source.stem}_pages_{start_page}-{end_page}{source.suffix}'
            )
            with open(output, 'wb') as out:
                pdf_writer.write(out)


def __pdf_extract(pair):
    """Adapter for executor.map: spread one ``pair`` into pdf_extract's arguments."""
    outcome = pdf_extract(*pair)
    return outcome


def pdf_extract_batch(pdfs, workers=20):
    """
    Run pdf_extract for several source PDFs on a thread pool.

    pdfs = {pdf_name: [(1, 1), ...], ...}
    workers: maximum number of threads in the pool.

    Raises the first exception (in input order) raised by any extraction.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # map() re-raises a worker's exception only when its result is
        # retrieved, so drain the iterator; otherwise failures pass silently.
        for _ in executor.map(__pdf_extract, pdfs.items()):
            pass


if __name__ == '__main__':
    # freeze_support is imported from multiprocessing at the top of the file.
    # NOTE(review): the batch helper in this file uses a thread pool, so
    # confirm whether this call is still needed.
    freeze_support()
    # Example input: one source PDF and the page ranges to cut out of it,
    # showing both accepted segment forms (tuple and dict).
    pdf_name = r'C:\Users\maste\Documents\long.pdf'
    segments = [(1, 1), {'start': 3, 'end': 5}]
    # Single
    pdf_extract(pdf_name, segments)
    # Batched (Concurrent)
    pdfs = {pdf_name: segments}
    # pdf_extract_batch(pdfs)
Elijah
  • 1,814
  • 21
  • 27
  • If you encounter a `xref table not zero-indexed`, check out [this](https://stackoverflow.com/questions/49939085/xref-table-not-zero-indexed-id-numbers-for-objects-will-be-corrected-wont-con) question. – bariod Apr 30 '22 at 15:53
  • `p.with_stem` - "not available". You can use the following function: ```def change_stem(fn, new_stem): stem = fn.split("/")[-1]; ext = stem.split(".")[-1]; stem_noext = stem[0:-len(ext)-1]; return stem_noext+new_stem+"."+ext; ``` – pds Jul 26 '22 at 16:16
1

The older answers are good, but the API changed in PyPDF2 version 3.0.0. Here's an updated alternative (basically, the reader and writer classes were renamed, addPage() became add_page(), and getPage(n) is now pages[n]). If you only need one fragment, simply remove the outer loop.

Important note: the same code works unchanged with pypdf, which is preferable because PyPDF2 appears to be deprecated.

from pypdf import PdfReader, PdfWriter


# Split the source PDF into one output file per (first, last) page range.
# Ranges are 1-based and inclusive; pdf_reader.pages is indexed from 0.
pdf_reader = PdfReader(source_pdf_file_path)
pages = [(1, 3), (2, 6)]
for first_page, last_page in pages:
    # Start each output document from an empty writer.
    pdf_writer = PdfWriter()
    zero_based_indices = range(first_page - 1, last_page)
    for page_index in zero_based_indices:
        pdf_writer.add_page(pdf_reader.pages[page_index])
    # The output name carries only the first page of the range.
    output_filename = f"{out_folder}/{source_pdf_file_path.stem}_{first_page}.pdf"
    with open(output_filename, "wb") as out:
        pdf_writer.write(out)
Pablo
  • 1,373
  • 16
  • 36
0

My quick solution:

    from PyPDF2 import PdfReader, PdfWriter
    # Copy a fixed set of pages out of one PDF into '<base>_subset.pdf'.
    pdf_file_path = 'file.pdf'
    # Strip only a trailing '.pdf'; str.replace would also remove any
    # '.pdf' that occurs earlier in the path.
    if pdf_file_path.endswith('.pdf'):
        file_base_name = pdf_file_path[:-len('.pdf')]
    else:
        file_base_name = pdf_file_path

    pdf = PdfReader(pdf_file_path)
    print('ca passe avant le decoupage')
    pages = [0, 2, 4] # page 1, 3, 5
    pdfWriter = PdfWriter()

    for page_num in pages:
        pdfWriter.add_page(pdf.pages[page_num])

    # The with-block closes the file on exit, so no explicit close() is needed.
    with open('{0}_subset.pdf'.format(file_base_name), 'wb') as f:
        pdfWriter.write(f)
Aymen Azoui
  • 369
  • 2
  • 4