[image: enter image description here]
I need help extracting text from a scanned PDF. I have tried to extract it using PyMuPDF, Pillow, and pytesseract, but I am not getting correct results: some of the text is returned incorrectly. I tried increasing the sharpness and brightness, but I still did not get a good result.
I have already checked many answers using OpenCV, but I am fairly new to OpenCV. Please help.
def pdf_to_text(pdf_file,text_file_name,rotate_pdf=False,adj_sharpness=False,adj_contract=False,adj_brightness=False):
    """Render every page of a PDF to a PNG, enhance it, OCR it, and save the text.

    Each page is rasterised at 300 dpi into ``<cwd>/images/page-<n>.png``,
    passed through the sharpness, brightness and contrast helpers, and written
    back to the same file.  Every image is then handed to ``image_to_string``
    and the resulting text is passed to ``make_textfile``.

    Args:
        pdf_file: Path of the PDF to open.
        text_file_name: Forwarded unchanged to ``make_textfile`` for each page.
        rotate_pdf: When true, each page's rotation is set to -90 before
            it is rendered.
        adj_sharpness: Accepted for interface compatibility; not consulted.
        adj_contract: Accepted for interface compatibility; not consulted.
        adj_brightness: Accepted for interface compatibility; not consulted.
            The three enhancement steps are always applied with fixed factors.

    Returns:
        None. On ``FileNotFoundError`` a message is printed instead of raising.
    """
    try:
        files = []
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_file) as doc:
            # Create the output folder once instead of re-checking per page.
            folder = os.path.join(os.getcwd(), "images")
            os.makedirs(folder, exist_ok=True)

            for n in range(doc.page_count):
                page = doc.load_page(n)
                if rotate_pdf:
                    page.set_rotation(-90)

                # Only ``dpi`` is passed: PyMuPDF ignores ``matrix`` whenever
                # ``dpi`` is given, so the former 2.5x zoom matrix had no effect.
                pix = page.get_pixmap(alpha=False, dpi=300)
                fname = os.path.join(folder, f"page-{n}.png")
                pix.save(fname)

                # Close the source image handle once the enhanced copy is saved.
                with Image.open(fname) as raw:
                    im = adjust_sharpness(raw, 2.5)
                    im = adjust_brightness(im, 1.1)
                    im = adjust_contrast(im, 2.8)
                    im.save(fname)
                files.append(fname)

        print("Extracting Images Completed")
        print("Now Extracting data from image file")
        for file in files:
            text = image_to_string(file, lang_code="eng")
            make_textfile(text, text_file_name)
        print("Extracting and saving text files completed")
    except FileNotFoundError:
        # NOTE(review): some PyMuPDF versions raise their own
        # FileNotFoundError type for a missing file - confirm that this
        # handler actually catches it.
        print(f"File not available {pdf_file}")
        return None
# Open the image file and run Tesseract OCR on it (the result is not captured here).
page_image = Image.open(image_name)
pytesseract.image_to_string(image=page_image)
The image: