I am extracting text from a PDF using Python and libraries such as fitz, pdfreader and so on. However, my PDF contains some schematics, and they carry words that I do not need.
Here is an example.
When extracting the text, the words inside the schematics are also included, but I do not want those words to appear, because even when the image itself can be extracted, the text inside the image is not meaningful.
I could not come up with a strategy to delete these useless words from the pdf.
import fitz
from io import BytesIO
class DeleteGarbage:
    """Redact text that sits on vector drawings (schematics) of a PDF page.

    For every line ("l") and rectangle ("re") item reported by
    ``page.get_drawings()``, a white redaction annotation is placed over the
    item's bounding box (grown by ``margin``) unless that box lies inside one
    of the known table regions. Applying the redactions then removes the text
    under those boxes, so a later text extraction no longer returns it.
    Other drawing item kinds are left untouched.
    """

    # Class-level defaults keep instances usable even when a subclass
    # overrides __init__ without setting these attributes.
    margin = 10
    page_numbers = (2,)

    def __init__(self, max_table_area=1.5, margin=10, page_numbers=(2,)):
        """Configure the filter.

        Args:
            max_table_area: Divisor applied to the page area; a drawn
                rectangle is only redacted when its area is smaller than
                ``page_area / max_table_area``.
            margin: Distance (in page units) by which redaction boxes are
                grown and by which table regions are grown for the
                containment test.
            page_numbers: Zero-based page indexes to process, or ``None`` to
                process every page. The default keeps the historical
                behavior of touching only the page at index 2.
        """
        self.max_table_area = max_table_area
        self.margin = margin
        self.page_numbers = page_numbers

    def process(self, context):
        """Redact drawing-covered text on the selected pages.

        Args:
            context: Mapping with the opened document under ``"fitz"`` and
                per-page table data under
                ``context["content"]["pages"][i]["tables"]``.

        Returns:
            The same ``context`` object; the pages of ``context["fitz"]`` are
            modified in place.
        """
        white = (1, 1, 1)
        for page_number, page in enumerate(context["fitz"]):
            if self.page_numbers is not None and page_number not in self.page_numbers:
                continue
            page_area = page.rect.width * page.rect.height
            for path in page.get_drawings():  # existing vector drawings
                for item in path["items"]:
                    box = self._drawing_box(item, page_area)
                    if box is None:
                        continue
                    # NOTE(review): the table data is read from index
                    # ``page_number - 1``; for page 0 this selects the last
                    # entry. Kept as in the original - confirm how
                    # context["content"]["pages"] is indexed.
                    tables = context["content"]["pages"][page_number - 1]["tables"]
                    if not self.check_if_not_table(box, page_number, tables):
                        continue
                    page.add_redact_annot(
                        self._padded(box),
                        "",
                        align=fitz.TEXT_ALIGN_CENTER,
                        fill=white,
                        text_color=white,
                    )
            # Apply once per page, after all annotations have been queued.
            page.apply_redactions()
        return context

    def check_if_not_table(self, rect, page_number, tables):
        """Return ``False`` when ``rect`` lies inside a known table region.

        Args:
            rect: Indexable ``(x0, y0, x1, y1)`` box to test.
            page_number: Unused; kept for interface compatibility.
            tables: Mapping whose ``"coordination"`` entry is an iterable of
                indexable ``(x0, y0, x1, y1)`` table boxes.

        Returns:
            ``True`` if no table region (grown by ``margin`` on every side)
            strictly contains ``rect``, otherwise ``False``.
        """
        pad = self.margin
        for table in tables["coordination"]:
            inside = (
                table[0] - pad < rect[0]
                and table[1] - pad < rect[1]
                and table[2] + pad > rect[2]
                and table[3] + pad > rect[3]
            )
            if inside:
                return False
        return True

    def _drawing_box(self, item, page_area):
        """Return the box to consider for a drawing item, or ``None`` to skip it."""
        kind = item[0]
        if kind == "l":  # line: item[1] and item[2] are its end points
            return self._line_bbox(item[1], item[2])
        if kind == "re":  # rectangle: item[1] is the rectangle itself
            rect = item[1]
            # Very large rectangles are skipped so page-sized frames are not redacted.
            if rect.get_area() < page_area / self.max_table_area:
                return rect
        return None

    def _padded(self, box):
        """Return ``box`` grown by ``margin`` on every side as a plain list."""
        pad = self.margin
        return [box[0] - pad, box[1] - pad, box[2] + pad, box[3] + pad]

    @staticmethod
    def _line_bbox(start, end):
        """Return the normalized ``[x0, y0, x1, y1]`` box spanned by two points.

        Lines may be drawn in any direction; taking min/max keeps the box
        valid (x0 <= x1, y0 <= y1) so padding and containment tests behave
        the same regardless of drawing direction.
        """
        return [
            min(start[0], end[0]),
            min(start[1], end[1]),
            max(start[0], end[0]),
            max(start[1], end[1]),
        ]