I am writing a script to take scanned pdf files and convert them into lines of text to enter into a database. I use re.findall to get matches from a list of regular expressions to get certain values from the tesseract extracted strings. I am having trouble when a regular expression can't find a match I want it to return "Error." So I can see that there is a problem.
I have tried a handful of if/else statements but I can't seem to get any to notice the None value.
from wand.image import Image as Img
import ghostscript
from PIL import Image
import pytesseract
import re
import os
def get_text_from_pdf(pendingpdf,pendingimg):
with Img(filename=pendingpdf, resolution=300) as img:
img.compression_quality = 99
img.save(filename=pendingimg)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
extractedtext = pytesseract.image_to_string(Image.open(pendingimg))
os.unlink(pendingimg)
return extractedtext
def get_results(vendor,extracted_string,results):
for v in vendor:
pattern = re.compile(v)
for match in re.findall(pattern,extracted_string):
if type(match) is str:
results.append(match)
else:
results.append("Error")
return results
pendingpdf = r'J:\TBHscan07022019090315001.pdf'
pendingimg = 'Test1.jpg'
aggind = ["^(\w+)(?:.+)\n+3600",
"Ticket: (nonsensewordstothrowerror)",
"Ticket: \d+\s([0-9|/]+)",
"Product: (\w+.+)\n",
"Quantity: ([\d\.]+)",
"Truck (\w+)"]
vendor = aggind
extracted_string = get_text_from_pdf(pendingpdf,pendingimg)
results = []
print(get_results(vendor,get_text_from_pdf(pendingpdf,pendingimg),results))