3

I am currently working on OCR on pdf files. Here is my pipeline:

  • I first extract the images from the PDF (since my PDFs contain scanned documents) and convert them to NumPy arrays
  • then I read them with Tesseract

It works pretty well on most of my images, but there are several PDFs from which I can't extract the embedded image. I give an example below in which I can't find the scanned image containing the written part (the one needed for OCR). It drives me crazy (where has it gone?).

Perhaps you could help me retrieve that image and understand why my approach does not let me retrieve this "fantôme" (ghost) image?

NB: I noticed that those problematic images inside the PDF are in "jpx" format.

Edit: Since I could not find the image in the PDF, I tried a horrible trick (while waiting for a cleverer explanation :) ): converting the whole PDF page to a pixmap (PyMuPDF allows that) and then writing the pixmap to disk in different formats (PNG, TIFF). The quality is too degraded compared with the original PDF (so we can forget about a reasonable reading with Tesseract).

Here is the example PDF file (if you know a simpler way to host it, I am curious): https://www.filehosting.org/file/details/906817/IB00058815877D0000000.pdf

Here are the 2 images I extract from the file (the second one should contain text instead of garbage) image2:logo image2: an error here...

Here is my code to extract images:

import fitz
import os
import logging
import cv2
from PIL import Image
from .utils import lazyproperty,showpdf
from .imhelpers import show
from ..config import myconfig
from impocr import logger
import pytesseract

# Tell pytesseract which Tesseract executable to run; the location comes from
# the project configuration (myconfig.TESSERACT_CMD).
pytesseract.pytesseract.tesseract_cmd = myconfig.TESSERACT_CMD

class InvalidImage(Exception):
    """Signal that a PDF page holds no image that could be read by Tesseract."""


class PDFParser():
    """Access the text and the images of a single page of a PDF document.

    The page is opened with PyMuPDF (``fitz``). Embedded images and a
    rendering of the whole page are exposed both as PyMuPDF pixmaps and as
    BGR ``numpy`` arrays, and can be written to disk as PNG files.

    Attributes:
        filepath: path of the PDF file, as given to the constructor.
        filename: base name of the file, cut at the first ``.pdf``.
        page_num: 0-based index of the parsed page.
        zoom: scale factor used when rendering the whole page.
    """

    def __init__(self, filepath, page_num=0, zoom=1.0):
        """Open ``filepath`` and select page ``page_num``.

        Args:
            filepath: path of the PDF file.
            page_num: 0-based index of the page to parse.
            zoom: scale factor applied on both axes when the whole page is
                rendered (``pageimg`` / ``write(fullpage=True)``). The
                default of 1.0 keeps PyMuPDF's default resolution; a larger
                value renders more pixels, which usually helps OCR.

        Raises:
            Exception: whatever PyMuPDF raises when the document or the page
                cannot be opened (re-raised after printing a message).
        """
        self.filepath = filepath
        self.filename = os.path.basename(self.filepath).split('.pdf')[0]
        self.zoom = zoom
        try:
            self._doc = fitz.open(filepath)
            self.page_num = page_num
            self._page = self._doc[page_num]
        except Exception as e:
            print("Lecture PDF impossible. {}".format(e))
            raise

    @lazyproperty
    def text(self):
        """Text of the page as returned by PyMuPDF (computed once)."""
        return self._page.getText()

    @lazyproperty
    def _pixs(self):
        """Pixmaps of every image referenced by the page (computed once)."""
        # The first item of each image entry is the xref of the image object.
        return [
            fitz.Pixmap(self._doc, entry[0])
            for entry in self._doc.getPageImageList(self.page_num)
        ]

    @lazyproperty
    def _pixpage(self):
        """Grayscale pixmap of the whole rendered page (computed once)."""
        # With zoom == 1.0 this is the identity matrix, i.e. the default render.
        matrix = fitz.Matrix(self.zoom, self.zoom)
        return self._page.getPixmap(matrix=matrix, colorspace=fitz.csGRAY)

    @property
    def img(self):
        """First embedded image of the page as a BGR array.

        Raises:
            IndexError: if the page references no image.
        """
        return self.imgs[0]

    @property
    def pageimg(self):
        """Rendering of the whole page as a BGR array."""
        return self.pix2np(self._pixpage)

    @lazyproperty
    def imgs(self):
        """All embedded images of the page as BGR arrays (computed once)."""
        return [self.pix2np(pix) for pix in self._pixs]

    def find_first_valid_image(self):
        """Return the first embedded image accepted by Tesseract.

        An image counts as valid when ``pytesseract.image_to_osd`` runs on it
        without raising ``TesseractError``.

        Returns:
            The first valid image, as a BGR array.

        Raises:
            InvalidImage: if no embedded image is valid (a warning is logged
                first).
        """
        for img in self.imgs:
            try:
                pytesseract.image_to_osd(img)
            except pytesseract.TesseractError:
                continue
            return img

        message = 'No readable image in page {} of the document {}'.format(self.page_num, self.filename)
        logger.warning(message)
        raise InvalidImage(message)

    def write(self, outputdir, fullpage=False):
        """Write the page images to ``outputdir`` as PNG files.

        The directory is created when missing.

        Args:
            outputdir: destination directory.
            fullpage: when True, write one PNG with the rendering of the
                whole page; otherwise write one PNG per embedded image.

        Returns:
            The path of the written file when ``fullpage`` is True, otherwise
            the list of written paths (one per embedded image, possibly
            empty).
        """
        try:
            os.makedirs(outputdir)
            logger.info("Directory {} is created".format(outputdir))
        except FileExistsError:
            pass

        def _writepix(pix, filepath):
            # PNG can only hold GRAY or RGB data: a pixmap with four or more
            # colour components (CMYK) is converted to RGB first. Checking the
            # colour space explicitly, rather than catching every exception,
            # lets genuine write errors surface.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(filepath)

        if fullpage:
            filepath = os.path.join(outputdir,'{}_p{}.png'.format(self.filename,self.page_num))
            _writepix(self._pixpage, filepath)
            return filepath

        written = []
        for i, pix in enumerate(self._pixs):
            filepath = os.path.join(outputdir,'{}_p{}_i{}.png'.format(self.filename,self.page_num,i))
            _writepix(pix, filepath)
            written.append(filepath)
        return written

    def pix2np(self, pix):
        """Convert a PyMuPDF pixmap to a BGR ``numpy`` image.

        https://stackoverflow.com/questions/53059007/python-opencv

        Args:
            pix: the pixmap to convert (GRAY, RGB or CMYK, with or without
                an alpha channel).

        Returns:
            A C-contiguous ``uint8`` array of shape ``(pix.h, pix.w, 3)`` in
            BGR channel order. Any alpha channel is dropped.
        """
        import numpy as np

        # CMYK samples cannot be read as BGR by reordering channels: let
        # PyMuPDF convert them to RGB first.
        if pix.n - pix.alpha >= 4:
            pix = fitz.Pixmap(fitz.csRGB, pix)

        # https://stackoverflow.com/questions/22236749/numpy-what-is-the-difference-between-frombuffer-and-fromstring
        im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.alpha:
            # The alpha value is the last component of every pixel.
            im = im[..., :-1]

        if im.shape[2] == 1:
            # Gray: replicate the single channel to obtain a 3-channel image.
            im = np.repeat(im, 3, axis=2)
        else:
            # RGB -> BGR
            im = im[..., ::-1]
        return np.ascontiguousarray(im)



        
if __name__ == "__main__":
    # Manual check: parse the first page (index 0) of a sample PDF, read its
    # text and display its first two embedded images.
    filepath = r'data\inputs\test\impot_textpdf_with_one_logoimage.pdf'
    pdf = PDFParser(filepath, page_num=0)
    text = pdf.text
    imgs = pdf.imgs
    for index in (0, 1):
        show(pdf.imgs[index])

############### other functions ####################
class lazyproperty:
    """Decorator turning a method into an attribute that is computed once.

    On the first access through an instance, the wrapped function is called
    and its result is stored on the instance under the function's own name,
    so later look-ups find that stored value directly.
    """

    def __init__(self, func):
        self.func = func

    def __get__(self, instance, cls):
        # Accessed on the class itself: there is nothing to compute.
        if instance is None:
            return self
        name = self.func.__name__
        computed = self.func(instance)
        setattr(instance, name, computed)
        return computed

def show(image):
    """Display ``image`` in a matplotlib figure and show that figure."""
    from matplotlib import pyplot

    _, axes = pyplot.subplots(1)
    axes.imshow(image)
    pyplot.show()

curious
  • 201
  • 1
  • 10

1 Answers1

1

My solution is not so good (I am still waiting for better ideas), but here are my 2 cents: I first write the full page to disk and read it back with OpenCV (I changed the method find_first_valid_image, as you can see below).

import os
from tempfile import TemporaryDirectory


def find_first_valid_image(self):
    """Return the first image Tesseract accepts, else the rendered full page.

    Every embedded image is tried with ``pytesseract.image_to_osd``; the
    first one that does not raise ``TesseractError`` is returned. When none
    qualifies, the whole page is written as a PNG into a temporary directory
    and read back with OpenCV.

    Returns:
        The first valid embedded image, or the full-page image as loaded by
        ``cv2.imread``.

    Raises:
        InvalidImage: if the full-page PNG cannot be read back.
    """
    for img in self.imgs:
        try:
            pytesseract.image_to_osd(img)
        except pytesseract.TesseractError:
            continue
        return img

    logger.warning('No readable image in page {} of the document {}. Tried the fullpage.'.format(self.page_num, self.filename))
    with TemporaryDirectory() as tmpdirname:
        written = self.write(tmpdirname, fullpage=True)
        # write() as shown in the question returns None, so fall back to the
        # file name it builds for a full-page export.
        filepath = written or os.path.join(
            tmpdirname, '{}_p{}.png'.format(self.filename, self.page_num))
        # The image must be loaded before the temporary directory is removed.
        img_fullpage = cv2.imread(filepath)

    # cv2.imread reports an unreadable file by returning None instead of raising.
    if img_fullpage is None:
        raise InvalidImage('No readable image in page {} of the document {}'.format(self.page_num, self.filename))
    return img_fullpage

I think it degrades the quality of my original image, so when applying Tesseract to the image I get a bad OCR result, as you can see below.

"""DIRECTION GÉNÉRALE DE6 FNANCES PUBLIQUES\n\nAVIS D'IMPÔT 2017\nIMPÔT SUR LES REVENUS\nd Fannée 2016\n\n \n\nPour vos _ démarches,\npas  besoin doiginal —\nMc d furir un —\nphotocopie, vérifiable sur —\nTmpots gouv vn\n\nVotre situation\n\n \n\nVos rétérences.\n\nPour accéder à votre espace partculior MONTANT À PAYER\nNuméro fiscal | | A us ario 15/00/2017 (41)\n\nN* daccès en ligne voirvouo déciaration | | Détail du montant à payer\nRevenu fiscal d référence Montart de vtr impôt su e revors\n# | Rétéronce de 'avis <VRRRRS | Versemens sur 1er acompte\nVersomontssur 26 acompto\n\nNuméro F —\n\nNuméro de rôle 016 A\nDate c'étaissement 2m0762017|\nDate de mise en recouvrement 3vo7æ2017|\n\n \n\n \n\n \n\n3899,00 €\n3893006\n\n \n\n \n\nLa somme que vous davez payer est supérieure à 2 000 €\nLa loirend obligatoie le paiement de cette somme par un des moyens suivants, à votre choix :\n\nur impots.gouv.fr: payez en igne ou adhérez au prélèvement à léchéance en vous connectant à vore\nspaco pariclor, pislissoz-vous guider\n\npartéléphone, courrir où couriel pour adhérer au prélèvement à échéanco (aux coordonnéesindiquées\ndansle cadre - Vos démarches »\n\nPour 2018,vous pourrez achérerau prélèvement mensue\n\x0c"""
curious
  • 201
  • 1
  • 10