I am trying to iterate through many PDF files, extract the text of each one, and place it into an Excel file. pdfminer3 lets me do this for a single PDF file, but I am having trouble iterating over many PDF files.
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io
import os
import pandas as pd
# Extract the text of every PDF in `directory` and collect it in `pm`,
# one flattened string per file, in the order os.listdir() yields them.
pm = []
directory = 'location of folder with PDF files'

for file in os.listdir(directory):
    if not file.endswith(".pdf"):
        continue

    # Build a fresh extraction pipeline for each file. Sharing a single
    # StringIO across files makes getvalue() return the text of every
    # previously processed PDF as well, and closing a shared buffer after
    # the first file breaks all later iterations.
    # NOTE(review): the resource manager is also recreated per file so that
    # nothing it caches for one document is reused for another -- confirm
    # against the pdfminer3 docs whether sharing it would be safe.
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    try:
        with open(os.path.join(directory, file), 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        text = fake_file_handle.getvalue()
    finally:
        # Release this file's handles even if a page fails to parse.
        converter.close()
        fake_file_handle.close()

    # Flatten the extracted text onto a single line.
    new = text.replace("\n", " ")
    # The search string here was empty in the original, which inserts a
    # space between every character. It was presumably the form feed that
    # the text converter emits at page breaks, lost in copy/paste -- verify.
    new_text = new.replace("\x0c", " ")
    pm.append(new_text)
# close open handles