8

I am trying to strip out only the first page of multiple PDF files and combine into one file. (I receive 150 PDF files a day, the first page is the invoice which I need, the following three to 12 pages are just backup which I do not need) So the input is 150 PDF files of varying size and the output I want is 1 PDF file containing only the first page of each of the 150 files.

What I seem to have done is to have merged all the pages EXCEPT the first page (which is the only one I need).

# Get all PDF documents in current directory
import os

pdf_files = []
for filename in os.listdir("."):
    # Case-sensitive match: only names ending in lowercase ".pdf" are kept.
    if filename.endswith(".pdf"):
        pdf_files.append(filename)
# Sort case-insensitively so the processing order does not depend on capitalisation.
pdf_files.sort(key=str.lower)

# Take first page from each PDF
from PyPDF2 import PdfFileWriter, PdfFileReader

# NOTE(review): this loop only rebinds `reader` on every iteration and adds
# nothing to a writer. Once it finishes, `reader` refers to the LAST file in
# pdf_files only (and is undefined if pdf_files is empty).
for filename in pdf_files:
    reader = PdfFileReader(filename)

writer = PdfFileWriter()
# NOTE(review): range() starts at 1, so page index 0 is never requested.
# The pages copied are indices 1 .. numPages-1 of the last reader, i.e.
# every page except the first one of that single file.
for pageNum in range(1, reader.numPages):
    page = reader.getPage(pageNum)
    writer.addPage(page)

# Write whatever pages were collected to the combined output file.
with open("CombinedFirstPages.pdf", "wb") as fp:
    writer.write(fp)
Martin Thoma
  • 124,992
  • 159
  • 614
  • 958
mike horan
  • 81
  • 1
  • 2

3 Answers3

2

Try this:

# Get all PDF documents in the target folder (searched recursively)
import os

your_target_folder = "."
output_name = "CombinedFirstPages.pdf"
# Normalised absolute path of the output file. The output is written into the
# current directory, which may be inside the folder being scanned, so it must
# be recognisable among the candidates and left out; otherwise a second run
# would add the first page of the previous combined file as well.
output_path = os.path.normcase(os.path.abspath(output_name))

pdf_files = []
for dirpath, _, filenames in os.walk(your_target_folder):
    for items in filenames:
        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        if not file_full_path.lower().endswith(".pdf"):
            continue
        if os.path.normcase(file_full_path) == output_path:
            continue
        pdf_files.append(file_full_path)
# Case-insensitive ordering of the full paths decides the page order.
pdf_files.sort(key=str.lower)

# Take first page from each PDF
from PyPDF2 import PdfFileReader, PdfFileWriter

writer = PdfFileWriter()

for file_path in pdf_files:
    reader = PdfFileReader(file_path)
    # A document without pages has no first page; asking for page 0 would raise.
    if reader.numPages == 0:
        continue
    page = reader.getPage(0)
    writer.addPage(page)

with open(output_name, "wb") as output:
    writer.write(output)
Martin Thoma
  • 124,992
  • 159
  • 614
  • 958
DRPK
  • 2,023
  • 1
  • 14
  • 27
  • Unfortunately, the above script gives me an empty document with no pages. – mike horan Nov 11 '17 at 16:29
  • @mikehoran: see https://ufile.io/hh2v4 – this works with my code! I think something is wrong with your path. What is your target folder? Did you put your target folder in your_target_folder = ""? – DRPK Nov 14 '17 at 11:01
0

I made some changes. The following piece of code worked for me.

import os
from PyPDF2 import PdfWriter, PdfReader

# Name of the combined file. It is created in the directory being scanned,
# so it has to be excluded from the inputs or a second run would pick up the
# previous result as one more invoice.
OUTPUT_NAME = "CombinedFirstPages.pdf"

# Get all PDF documents in current directory. The extension check is
# case-insensitive so that files named "*.PDF" are not silently ignored.
pdf_files = [
    filename
    for filename in os.listdir(".")
    if filename.lower().endswith(".pdf") and filename != OUTPUT_NAME
]
# Case-insensitive ordering decides the page order in the combined file.
pdf_files.sort(key=str.lower)

# Take first page from each PDF

pdf_writer = PdfWriter()

for filename in pdf_files:
    reader = PdfReader(filename)
    # A document without pages has no first page; indexing would raise.
    if len(reader.pages) == 0:
        continue
    pdf_writer.add_page(reader.pages[0])


with open(OUTPUT_NAME, "wb") as fp:
    pdf_writer.write(fp)
Jordan
  • 1
  • 2
-1

This script takes all the PDF files in the current working directory and converts the first page of each one to PNG.

# Requires the third-party package: pip install pdf2image
import os
import tempfile
from pdf2image import convert_from_path
# The current working directory is used both as the place to look for PDFs
# and as the destination for the generated PNG files.
output_folder=os.getcwd() #current work directory

def pdf_to_png(pdf_name,source,destino):
    """Render one PDF to a single PNG file via pdf2image.

    Args:
        pdf_name: File name of the PDF, including its extension.
        source: Directory that contains ``pdf_name``.
        destino: Directory the PNG is written to.

    The image is rendered at 100 dpi and named after the PDF without its
    extension. ``single_file=True`` asks pdf2image for one output file only
    (the first page, per the pdf2image documentation).
    """
    # The original wrapped this call in a TemporaryDirectory that was never
    # used; the output goes straight to ``destino``, so no scratch directory
    # is needed.
    convert_from_path(
        pdf_path=os.path.join(source, pdf_name),
        dpi=100,
        output_folder=destino,
        fmt="png",
        # splitext removes whatever extension is present instead of assuming
        # it is exactly four characters long.
        output_file=os.path.splitext(pdf_name)[0],
        single_file=True,
    )
            
# Convert every ".pdf" entry of the working directory, writing each PNG next
# to its source file, then report completion.
pdf_names = [entry for entry in os.listdir(output_folder) if entry.endswith(".pdf")]
for pdf_name in pdf_names:
    pdf_to_png(pdf_name, output_folder, output_folder)

print("ok!")
ivansaul
  • 179
  • 2
  • 6