I am currently working on OCR for PDF files. Here is my pipeline:
- I first extract the images from the PDF (since my PDFs contain scanned documents) and convert them to numpy arrays
- then I read them with Tesseract
It works pretty well on most of my images, but there are several PDFs from which I can't extract the image inside. I have given an example below: I can't find the scanned image containing the written part (the one needed for OCR). It drives me crazy (where has it gone??).
Perhaps you could help me retrieve that image and understand why my approach does not let me retrieve this "fantôme" (ghost) image?
NB: I noticed that those problematic images inside the PDF are in "jpx" format.
Edit: Since I cannot find the image in the PDF, I tried a horrible trick (while waiting for a clever explanation :) ): converting the whole PDF page to a pixmap (PyMuPDF lets you do that) and then writing the pixmap to disk in different formats (PNG, TIFF). The quality is too degraded compared with the original PDF (so we can forget about a reasonable reading with Tesseract).
Here is the example PDF file (if you know a simpler way to host it, I am curious): https://www.filehosting.org/file/details/906817/IB00058815877D0000000.pdf
Here are the 2 images I extract from the file (the second one should contain text instead of garbage):
Here is my code to extract images:
import fitz
import os
import logging
import cv2
from PIL import Image
from .utils import lazyproperty,showpdf
from .imhelpers import show
from ..config import myconfig
from impocr import logger
import pytesseract
# Point pytesseract at the Tesseract executable path taken from the project
# configuration (myconfig.TESSERACT_CMD); this runs once at import time.
pytesseract.pytesseract.tesseract_cmd = myconfig.TESSERACT_CMD
class InvalidImage(Exception):
    """Signal that no usable image could be obtained."""
class PDFParser():
    """Open one page of a PDF and expose its text and images.

    The document is opened with PyMuPDF (``fitz``). The page text, the
    pixmaps of the images listed for the page, and a grayscale rendering of
    the whole page are computed lazily on first access and converted to
    ``numpy`` arrays on demand.
    """

    def __init__(self, filepath, page_num=0):
        """Open ``filepath`` and select the page at index ``page_num``.

        Args:
            filepath: path of the PDF file to open.
            page_num: index of the page in the document (0 is the first page).

        Raises:
            Exception: whatever ``fitz`` raises while opening the file or
                indexing the page; the error is printed, then re-raised.
        """
        self.filepath = filepath
        # Base name with everything from the first '.pdf' onwards removed.
        self.filename = os.path.basename(self.filepath).split('.pdf')[0]
        try:
            self._doc = fitz.open(filepath)
            self.page_num = page_num
            self._page = self._doc[page_num]
        except Exception as e:
            print("Lecture PDF impossible. {}".format(e))
            raise

    @lazyproperty
    def text(self):
        """Text of the selected page, as returned by ``page.getText()``."""
        return self._page.getText()

    @lazyproperty
    def _pixs(self):
        """List of ``fitz.Pixmap``, one per image listed for the page."""
        # The first item of each image entry is the xref used to build
        # the pixmap.
        return [
            fitz.Pixmap(self._doc, entry[0])
            for entry in self._doc.getPageImageList(self.page_num)
        ]

    @lazyproperty
    def _pixpage(self):
        """Grayscale pixmap of the whole rendered page."""
        return self._page.getPixmap(colorspace=fitz.csGRAY)

    @property
    def img(self):
        """First image of the page as an array (IndexError if there is none)."""
        return self.imgs[0]

    @property
    def pageimg(self):
        """Whole rendered page converted to an array by ``pix2np``."""
        return self.pix2np(self._pixpage)

    @lazyproperty
    def imgs(self):
        """All images of the page, each converted to an array by ``pix2np``."""
        return [self.pix2np(pix) for pix in self._pixs]

    def find_first_valid_image(self):
        """Return the first image of the page that Tesseract OSD accepts.

        Each image is passed to ``pytesseract.image_to_osd``; an image is
        considered valid when that call does not raise ``TesseractError``.

        Raises:
            InvalidImage: if the page has no image or every image is
                rejected. A warning with the same message is logged first.
        """
        for img in self.imgs:
            try:
                # Only the success/failure of the call matters here; the
                # OSD result itself is not used.
                pytesseract.image_to_osd(img)
            except pytesseract.TesseractError:
                continue
            return img

        msg = 'No readable image in page {} of the document {}'.format(self.page_num, self.filename)
        logger.warning(msg)
        raise InvalidImage(msg)

    def write(self, outputdir, fullpage=False):
        """Write the page's images (or the rendered page) as PNG files.

        Args:
            outputdir: destination directory; created if it does not exist.
            fullpage: when true, write only the rendered page as
                ``<filename>_p<page>.png``; otherwise write every image of
                the page as ``<filename>_p<page>_i<index>.png``.
        """
        try:
            os.makedirs(outputdir)
        except FileExistsError:
            pass
        else:
            logger.info("Directory {} is created".format(outputdir))

        def _writepix(pix, filepath):
            """Write ``pix`` as PNG, converting to RGB if the direct write fails."""
            try:
                # This is GRAY or RGB
                pix.writePNG(filepath)
            except Exception:
                # CMYK: convert to RGB first
                rgb_pix = fitz.Pixmap(fitz.csRGB, pix)
                rgb_pix.writePNG(filepath)

        if fullpage:
            filepath = os.path.join(outputdir, '{}_p{}.png'.format(self.filename, self.page_num))
            _writepix(self._pixpage, filepath)
            return

        for i, pix in enumerate(self._pixs):
            filepath = os.path.join(outputdir, '{}_p{}_i{}.png'.format(self.filename, self.page_num, i))
            _writepix(pix, filepath)

    def pix2np(self, pix):
        """Convert a pixmap to a contiguous 3-channel ``np.ndarray``.

        The samples are viewed as ``(pix.h, pix.w, pix.n)`` ``uint8`` and the
        first three channels are reversed (RGB to BGR); any further channel
        is dropped. A pixmap with fewer than three channels goes through
        ``cv2.cvtColor(..., COLOR_GRAY2RGB)`` first.

        https://stackoverflow.com/questions/53059007/python-opencv

        Args:
            pix: pixmap exposing ``samples``, ``h``, ``w`` and ``n``.

        Returns:
            The image as a ``uint8`` array of shape ``(h, w, 3)``.
        """
        import numpy as np

        # https://stackoverflow.com/questions/22236749/numpy-what-is-the-difference-between-frombuffer-and-fromstring
        im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        # NOTE(review): a 4-channel CMYK pixmap would be read as if it were
        # RGB(A) here - confirm callers only pass GRAY / RGB(A) pixmaps.
        try:
            im = np.ascontiguousarray(im[..., [2, 1, 0]])  # rgb to bgr
        except IndexError:
            # Fewer than three channels: expand gray to three channels first.
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2RGB)
            im = np.ascontiguousarray(im[..., [2, 1, 0]])
        return im
if __name__ == "__main__":
    # Demo: parse the first page (index 0) of a sample document.
    filepath = r'data\inputs\test\impot_textpdf_with_one_logoimage.pdf'
    pdf = PDFParser(filepath, 0)

    # Accessing the lazy attributes triggers text and image extraction.
    text = pdf.text
    imgs = pdf.imgs

    # Display the first two images found on the page.
    for index in (0, 1):
        show(pdf.imgs[index])
############### other functions ####################
class lazyproperty:
    """Descriptor that computes an attribute once and stores it on the instance.

    On first access through an instance, the wrapped function is called with
    that instance and the result is stored on the instance under the
    function's name with ``setattr``. Because this descriptor defines no
    ``__set__``, later lookups find the stored instance attribute and the
    function is not called again.
    """

    def __init__(self, func):
        """Wrap ``func``, a one-argument function taking the instance."""
        self.func = func
        # Keep the wrapped function's docstring so introspection of the
        # attribute on the class shows the property's own documentation.
        self.__doc__ = func.__doc__

    def __get__(self, instance, cls):
        """Return the descriptor itself on class access, else the cached value."""
        if instance is None:
            return self
        value = self.func(instance)
        setattr(instance, self.func.__name__, value)
        return value
def show(image):
    """Display ``image`` on a single matplotlib axes and show the figure."""
    import matplotlib.pyplot as plt

    _, axes = plt.subplots(1)
    axes.imshow(image)
    plt.show()