I want to use a preprocessed image as the input to `infer_text`,
so that it returns the annotations for that image.
How can I do this? What should I pass to the `infer_text` function?
# Folder handed to preprocess_image_folder_ when this file is run as a script;
# it is searched recursively for "*.pdf" files.
image_folder = Path("/home/Tasks/NM_spanish/Invoices_pdf")
def get_preprocessed_image(image_path: pathlib.Path) -> PIL.Image.Image:
    """Load the file at *image_path*, preprocess it and open it as an image.

    The raw bytes of the file are passed through ``pyabbyy.preprocess`` and
    the result is opened with ``PIL.Image.open`` from an in-memory buffer.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The preprocessed image. If a ``RuntimeError`` or ``AttributeError``
        is raised while reading or preprocessing, the path is printed and
        ``None`` is returned instead, so callers must handle ``None``.
    """
    try:
        with image_path.open("rb") as source:
            raw_bytes = source.read()
            cleaned_bytes = pyabbyy.preprocess(raw_bytes)
            return PIL.Image.open(io.BytesIO(cleaned_bytes))
    except (RuntimeError, AttributeError):
        # Best-effort: report the offending file and fall through to None.
        print(image_path)
    return None
def preprocess_image_folder_(image_folder: pathlib.Path) -> None:
    """Preprocess every ``*.pdf`` under *image_folder*, in place.

    Each matching file (searched recursively) is run through
    ``get_preprocessed_image`` and the result is saved back to the same
    path, overwriting the original file.

    Files that cannot be preprocessed are skipped: ``get_preprocessed_image``
    prints their path and returns ``None``. Files whose save fails with
    ``RuntimeError`` or ``AttributeError`` have their path printed here.

    Args:
        image_folder: Root directory to search for ``*.pdf`` files.
    """
    # Take a snapshot of the listing before any file is rewritten; a list
    # also has a length, which tqdm uses for the progress total.
    pdf_paths = list(image_folder.rglob("*.pdf"))
    for image_path in tqdm.tqdm(pdf_paths):
        try:
            image = get_preprocessed_image(image_path)
            if image is None:
                # Preprocessing failed and the path was already printed by
                # get_preprocessed_image; do not rely on None.save() raising
                # AttributeError, which would report the same file twice.
                continue
            image.save(image_path)
        except (RuntimeError, AttributeError):
            print(image_path)
def infer_text(
    image_path: pathlib.Path, *, preprocess: bool = False
) -> graphanno.GraphAnnotation:
    """Read the words in the file at *image_path* and wrap them in an annotation.

    The file's bytes are passed to ``pyabbyy.read_text``. Every returned word
    becomes a ``graphanno.Node`` holding its text and a bounding box derived
    from the word's ``origin_x``/``origin_y``/``max_x``/``max_y`` entries.
    The three adjacency matrices are all-zero, i.e. no edges are set.

    To run this on an already preprocessed image, pass the path of a file
    that has been rewritten by ``preprocess_image_folder_`` and leave
    *preprocess* at ``False``.

    Args:
        image_path: Path of the file to read (opened in binary mode).
        preprocess: Forwarded to ``pyabbyy.read_text(..., preprocess=...)``.
            Defaults to ``False``, the value that was previously hard-coded.
            NOTE(review): presumably ``True`` makes pyabbyy preprocess the
            bytes itself before reading - confirm against pyabbyy.

    Returns:
        A ``graphanno.GraphAnnotation`` with one node per recognised word and
        three zero-filled ``(num_nodes, num_nodes)`` adjacency matrices.
    """
    with image_path.open("rb") as img:
        words = pyabbyy.read_text(img.read(), preprocess=preprocess)

    nodes = tuple(
        graphanno.Node(
            text=word["text"],
            box=geometric.Box(
                origin_x=word["origin_x"],
                origin_y=word["origin_y"],
                # Width/height are the extent between origin and max corner.
                width=word["max_x"] - word["origin_x"],
                height=word["max_y"] - word["origin_y"],
            ),
        )
        for word in words
    )

    num_nodes = len(nodes)
    # Each adjacency gets its own zero matrix so they never share storage.
    return graphanno.GraphAnnotation(
        nodes,
        graphanno.Adjacency(np.zeros((num_nodes, num_nodes))),
        graphanno.Adjacency(np.zeros((num_nodes, num_nodes))),
        graphanno.Adjacency(np.zeros((num_nodes, num_nodes))),
    )
# Script entry point: preprocess the PDFs under the configured folder.
if __name__ == "__main__":
    preprocess_image_folder_(image_folder)