I made a script to batch-process PDF scans into text with the help of tesseract and pyocr. The code is below. The problem is that when processing many files (20 or more), at some point the script runs out of memory and fails with an OSError. I have made it able to resume smoothly from where it crashed after a manual restart, but these manual restarts are tedious.
Since pyocr is basically a black box to me, I tried wrapping the script in other Python scripts that would restart it on a crash, but they all seem to go down with that error, and the memory is only freed when every related script terminates.
The only other solution I can think of is to make a completely external wrapper that would check whether the script is running and restart it if it is not AND there are still unprocessed files.
But maybe there is a better solution? Or maybe I wrote lame code that can be improved to avoid these memory crashes? (Other than that, I know it is lame, but it works well enough :) ).
from io import BytesIO
from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io
import os
import os.path
import ast
def daemon_ocr(tool, img, lang):
    """Run OCR on one encoded page image and return the recognised text.

    Args:
        tool: OCR backend exposing ``image_to_string`` (presumably one of the
            objects returned by ``pyocr.get_available_tools()``).
        img: The encoded image as a bytes blob (the caller passes JPEG data).
        lang: Language identifier forwarded to the OCR backend.

    Returns:
        Whatever ``tool.image_to_string`` returns for a ``TextBuilder``.
    """
    with BytesIO(img) as buffer:
        page = PI.open(buffer)
        try:
            return tool.image_to_string(
                page,
                lang=lang,
                builder=pyocr.builders.TextBuilder(),
            )
        finally:
            # Release the decoded pixel data right away instead of waiting for
            # the garbage collector; this function is called once per page.
            page.close()
# Words expected somewhere in a correctly oriented document. OCR output that
# contains none of them is treated as a sign that the scan must be rotated.
_ORIENTATION_KEYWORDS = (
    'работ', 'фактура', 'Аренда', 'Сумма', 'аренде', 'товара',
)
# Value of the ``iteration`` counter at which no further retry is made.
_LAST_ITERATION = 5


def daemon_wrap(image_pdf, tool, lang, iteration):
    """OCR every page of *image_pdf*, rotating by 90 degrees until it reads.

    Each attempt converts the document to JPEG, OCRs the pages one by one and
    joins the page texts (each followed by ``'\\n '``).  If none of the
    expected keywords is found and the attempt counter is still below the
    limit, the document is rotated by 90 degrees and the process repeats.

    Args:
        image_pdf: A ``wand.image.Image`` holding the document.  It is rotated
            in place when a retry is needed.
        tool: OCR backend passed through to ``daemon_ocr``.
        lang: Language identifier passed through to ``daemon_ocr``.
        iteration: Number of the first attempt; printed before each attempt.

    Returns:
        The text of the first attempt that contains a keyword, or the text of
        the last attempt if none does.
    """
    # A loop replaces the former recursion: a recursive retry kept the JPEG
    # copy and all page blobs of every earlier attempt alive on the stack.
    while True:
        print(iteration)
        page_texts = []
        # The converted copy and the per-page clones own native ImageMagick
        # memory, so they are destroyed as soon as they are no longer needed.
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    blob = img_page.make_blob('jpeg')
                page_texts.append(daemon_ocr(tool, blob, lang) + '\n ')
        final_text = ''.join(page_texts)

        recognised = any(word in final_text for word in _ORIENTATION_KEYWORDS)
        if recognised or iteration >= _LAST_ITERATION:
            return final_text

        iteration += 1
        # Rotates the caller's image object itself (wand documents rotate()
        # as an in-place operation), so the next attempt sees the new angle.
        image_pdf.rotate(90)
def daemon_pyocr(food):
    """Render the document *food* at 300 dpi and return its OCR text.

    Uses the first OCR tool reported by ``pyocr.get_available_tools()`` and
    the first language that tool reports.

    Args:
        food: File name or path of the document to process.

    Returns:
        The text produced by ``daemon_wrap`` for the document.

    Raises:
        IndexError: If pyocr reports no tool, or the tool reports no language.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[0]
    # The rendered document is by far the largest allocation of the script.
    # Closing it explicitly returns the native memory before the next file is
    # opened instead of letting it pile up across the whole batch.
    with Image(filename=str(food), resolution=300) as image_pdf:
        return daemon_wrap(image_pdf, tool, lang, 1)
# Batch driver: OCR every regular file located next to this script, append
# each result to the ``output`` file and move the processed document into
# ``done/`` so that a restarted run continues with the remaining files.
#
# All paths are resolved against the script directory, so the listing, the
# results file and the move into ``done/`` always refer to the same place.
path = os.path.dirname(os.path.abspath(__file__))
output_path = os.path.join(path, 'output')
done_dir = os.path.join(path, 'done')

files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
print(files)

# Never feed the script itself or the results file to the OCR engine.
skipped = {'ocr.py', 'output', os.path.basename(__file__)}

# A missing or empty results file holds no entry yet, so the first entry
# written must not be preceded by the ", " separator.
first = not os.path.exists(output_path) or os.path.getsize(output_path) == 0

# Without this directory every move below would fail after the text had
# already been written, and the file would be processed again on restart.
os.makedirs(done_dir, exist_ok=True)

# NOTE(review): the file is opened with the platform default encoding, as
# before; confirm that this encoding can represent the recognised text.
with open(output_path, 'a') as text_file:
    for f in files:
        if f in skipped:
            continue

        try:
            output = {f: daemon_pyocr(os.path.join(path, f))}
        except OSError:
            print('{f} failed: not enough memory.'.format(f=f))
            continue
        print('{f} done'.format(f=f))

        # Entries are stored as the inside of a dict literal: 'name': 'text'.
        entry = str(output)[1:-1]
        text_file.write(entry if first else ', {d}'.format(d=entry))
        first = False
        # Push the entry to disk now so it survives a later crash.
        text_file.flush()

        try:
            os.rename(os.path.join(path, f), os.path.join(done_dir, f))
        except OSError as err:
            # Reported separately so that a failed move is not mistaken for
            # an out-of-memory failure of the OCR step.
            print('{f} could not be moved to done: {e}'.format(f=f, e=err))