I want to gather all PDF files from my computer and extract the text from each one. Both functions that I have currently do that, however, some PDF files are giving me this error:
raise PDFPasswordIncorrect
pdfminer.pdfdocument.PDFPasswordIncorrect
I raised the error in the function that open and reads the PDF files, and that seemed to work in terms of ignoring the error but now its ignoring all the PDF files including the good ones that were not an issue before.
How can I make it so it only ignores the PDF files that give me this error and not every single PDF?
def pdfparser(x):
try:
raise PDFPasswordIncorrect(pdfminer.pdfdocument.PDFPasswordIncorrect)
fp = open(x, 'rb')
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
except (RuntimeError, TypeError, NameError,ValueError,IOError,IndexError,PermissionError):
print("Error processing {}".format(name))
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
data = retstr.getvalue()
return(data)
def pdfs(files):
for name in files:
try:
IP_list = (pdfparser(name))
keyword = re.findall(inp,IP_list)
file_dict['keyword'].append(keyword)
file_dict['name'].append(name.name[0:])
file_dict['created'].append(time.ctime(name.stat().st_ctime))
file_dict['modified'].append(time.ctime(name.stat().st_mtime))
file_dict['path'].append(name)
file_dict["content"].append(IP_list)
except (RuntimeError, TypeError, NameError,ValueError,IOError,IndexError,PermissionError):
print("Error processing {}".format(name))
#print(file_dict)
return(file_dict)
pdfs(files)