1

I am trying to extract the text in some PDF files using Textract. However, when I print the text at the end of the code, it just prints out a lot of empty spaces. Can anyone point me in the direction of what is going on? (By the way, text is not equal to "".)

import os
import codecs
import PyPDF2 
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Extract the text of every PDF in the 'Harbour PDF' directory, falling back
# to OCR (textract + tesseract) when the embedded text layer yields nothing.
for filename in os.listdir('Harbour PDF'):
    # Skip the '.DS_Store' entry, which is not a PDF.
    if filename == '.DS_Store':
        continue
    filename = 'Harbour PDF/' + filename
    print(filename)

    # The context manager closes the handle even if PyPDF2 raises.
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        text = "".join(
            pdfReader.getPage(page_index).extractText()
            for page_index in range(pdfReader.numPages)
        )

    # extractText() may return only whitespace (e.g. for scanned PDFs), which
    # is not equal to "" but still carries no content; strip before testing so
    # the OCR fallback actually runs in that case.
    if not text.strip():
        # textract.process expects the path of the file, not an open file
        # object, so pass the filename.
        text = textract.process(filename, method='tesseract', language='eng')
        # NOTE(review): textract is expected to return bytes here; decode so
        # that `text` is a str on both code paths — confirm against the
        # installed textract version.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

    print(text)
Alderven
  • 7,569
  • 5
  • 26
  • 38

1 Answer

0

Here are two functions I use via Python (the second one needs tesseract). I actually prefer the tesseract one over pdfminer, but they effectively do the same thing. I'm not sure what's wrong with your code, but I believe these are equivalent alternatives.

from PIL import Image
import pytesseract
import cv2
import os
import subprocess

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

def convert(fname, pages=None):
    """Convert a PDF file and return its text content as a string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. It is turned
            into a set and handed to ``PDFPage.get_pages``; when falsy an
            empty set is passed instead (presumably meaning "no page
            filter" — confirm against the pdfminer documentation).

    Returns:
        The text that ``TextConverter`` wrote for the processed pages.
    """
    # Imported locally because the surrounding file does not import these
    # names, so the function would otherwise fail with NameError.
    from io import StringIO
    from pdfminer.layout import LAParams

    pagenums = set(pages) if pages else set()

    # `with` guarantees the buffer is really closed (the previous bare
    # `output.close` only referenced the method without calling it).
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # Close the input file even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the `with` block closes it.
        return output.getvalue()

def to_txt(pdf_path, output_dir, name=None):
    """Convert a PDF to a text file by running the ``pdftotext`` command.

    Args:
        pdf_path: Path of the PDF file to convert. Both ``/`` and ``\\`` are
            accepted as directory separators when deriving the default
            output name.
        output_dir: Directory the text file is written to; it is created if
            it does not exist. A trailing separator is optional.
        name: Optional output file name. ``.txt`` is appended when missing.
            When omitted, the PDF's base name is used with its extension
            replaced by ``.txt`` and spaces replaced by underscores.

    Returns:
        None. A status line is printed after ``pdftotext`` finishes.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if name is None:
        # Normalise separators so the base name is found for Windows- and
        # POSIX-style paths alike.
        base = pdf_path.replace('\\', '/').rsplit('/', 1)[-1]
        # splitext drops whatever extension is present instead of blindly
        # cutting the last four characters.
        fname = os.path.splitext(base)[0] + '.txt'
        fname = fname.replace(" ", "_")
    else:
        fname = name if name.endswith('.txt') else name + '.txt'

    # os.path.join works whether or not output_dir ends with a separator.
    end = os.path.join(output_dir, fname)

    returncode = subprocess.call(['pdftotext', pdf_path, end])
    # Only report success when the command actually succeeded.
    if returncode == 0:
        print('Converted')
    else:
        print('pdftotext failed with exit code {}'.format(returncode))

Evan Mata
  • 500
  • 1
  • 6
  • 19