convert from pdf to text: lines and words are broken

Question

I want to convert a PDF file to text with PyPDF2, but the converted text looks different from the PDF file. Specifically, one line in the PDF is broken into multiple lines in the text, and words may be broken as well. Attached are the PDF and the text file I got with the code below. Could anyone help me fix this issue?

from PyPDF2 import PdfFileReader


def extract_pdf_text(file_path=""):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Pages are visited in the order ``reader.pages`` yields them, and the
    text extracted from each page is concatenated with no separator.
    """
    pdf = PdfFileReader(file_path)
    # Gather each page's text first, then glue the pieces together once.
    page_texts = [pg.extractText() for pg in pdf.pages]
    return "".join(page_texts)

# Example call: replace the placeholder string with the path to a real PDF.
pdf_text = extract_pdf_text("PDF file path")

pdf file

converted text

PyPDF2 text extraction has several issues, but it got a lot of improvements since 2019. — Martin Thoma, May 01 '22 at 14:52

ASH · Accepted Answer · 2021-08-05T13:37:14.250

This is how I would do it.

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Extract the text of a PDF file with pdfminer and return it as a string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When omitted
            or empty, an empty set is passed to ``PDFPage.get_pages``,
            which pdfminer treats as "no restriction" (all pages).

    Returns:
        The text that ``TextConverter`` wrote for the processed pages.
    """
    pagenums = set(pages) if pages else set()

    # The file imports the class via ``from io import StringIO``, so it has
    # to be referenced as ``StringIO``; ``io.StringIO`` raises NameError.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # ``with`` closes the PDF even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the enclosing ``with`` closes it.
        return output.getvalue()

#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in ``pdfDir`` to a text file in ``txtDir``.

    Args:
        pdfDir: Directory to scan for PDF files. An empty string means the
            current working directory.
        txtDir: Directory that receives the output. A file ``name.pdf`` is
            written as ``name.pdf.txt``.
    """
    if pdfDir == "":
        pdfDir = os.getcwd()  # if no pdfDir passed in
    for pdf in os.listdir(pdfDir):  # iterate through entries in pdf directory
        # splitext only matches a real ".pdf" suffix (a file literally named
        # "pdf" is skipped); lower() also accepts ".PDF".
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue
        # os.path.join works whether or not the directory ends in a separator.
        pdfFilename = os.path.join(pdfDir, pdf)
        text = convert(pdfFilename)  # get string of text content of pdf
        textFilename = os.path.join(txtDir, pdf + ".txt")
        # Explicit UTF-8 avoids UnicodeEncodeError on platforms whose default
        # encoding cannot represent characters found in the PDF; ``with``
        # guarantees the file is flushed and closed.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)

# Set paths accordingly: pdfDir is the directory scanned for PDFs and txtDir
# is the directory that receives the .txt files. Replace both placeholders
# and keep the trailing slash on each path.
pdfDir = "C://your_path_here/"
txtDir = "C://your_path_here/"
convertMultiple(pdfDir, txtDir)

convert from pdf to text: lines and words are broken

1 Answer

Linked