Table extraction Using pdfPlumber

Question

Traceback (most recent call last):
  File "/Users/noelsjacob/Desktop/Projects/py_pdf_stm-master/TableExtractor.py", line 734, in <module>
    tables = pdf_interpreter.parse_page(1)
  File "/Users/noelsjacob/Desktop/Projects/py_pdf_stm-master/TableExtractor.py", line 641, in parse_page
    p_im = page.to_image(resolution=100)
  File "/Users/noelsjacob/anaconda3/lib/python3.10/site-packages/pdfplumber/page.py", line 431, in to_image
    return PageImage(self, resolution=resolution or DEFAULT_RESOLUTION)
  File "/Users/noelsjacob/anaconda3/lib/python3.10/site-packages/pdfplumber/display.py", line 95, in __init__
    self.original = get_page_image(
  File "/Users/noelsjacob/anaconda3/lib/python3.10/site-packages/pdfplumber/display.py", line 56, in get_page_image
    with WandImage(
  File "/Users/noelsjacob/anaconda3/lib/python3.10/site-packages/wand/image.py", line 9365, in __init__
    self.read(filename=filename)
  File "/Users/noelsjacob/anaconda3/lib/python3.10/site-packages/wand/image.py", line 10120, in read
    self.raise_exception()
  File "/Users/noelsjacob/anaconda3/lib/python3.10/site-packages/wand/resource.py", line 225, in raise_exception
    raise e
wand.exceptions.MissingDelegateError: no decode delegate for this image format `' @ error/constitute.c/ReadImage/746

I'm getting that error while running the following Python code:

def parse_page(self, page_n):
    """Find the tables on one page of ``self.pdf`` and rebuild them as ``Table`` objects.

    For every table reported by ``TableFinder`` the cell rectangles are
    converted into ``Point``/``Line``/``Cell`` objects, the lines are
    filtered, a skeleton is built from them and turned into a 2D layout, and
    a ``Table`` is built from the result.

    When ``self.debug`` is truthy, progress messages are printed.  When
    ``self.draw`` is truthy, debug images are saved under relative file names
    of the form ``page-<page_n + 1>-...png``.

    Args:
        page_n: Index into ``self.pdf.pages``.  Saved file names use
            ``page_n + 1``.

    Returns:
        A list of the ``Table`` objects that were built.  The list is empty
        when more than 5 tables are detected on the page; tables for which
        ``build_skeleton`` yields no skeleton points are left out.
    """
    if self.debug:
        print('Parsing page', page_n)
    page = self.pdf.pages[page_n]
    # NOTE(review): nothing is rendered between this message and the next one;
    # the page image is only created further down, and only when self.draw is set.
    if self.debug:
        print('Rendering page')

    if self.debug:
        print('Finding tables')
    tables = TableFinder(page, {'snap_tolerance': 3, 'join_tolerance': 3})
    if self.debug:
        print('Found', len(tables.tables), 'tables')
    beaut_tables = []
    if self.draw:
        # NOTE(review): this is the call the traceback in the question points at
        # (to_image -> PageImage -> WandImage.read -> MissingDelegateError).
        # Presumably ImageMagick has no delegate able to decode the PDF
        # (e.g. Ghostscript is missing) -- confirm on the affected machine.
        p_im = page.to_image(resolution=100)
        p_im.draw_lines(page.lines)
        p_im.save('page-{}-lines.png'.format(page_n + 1))
    # Pages with more than 5 detected tables are reported as having none.
    # NOTE(review): presumably a guard against false-positive detections -- confirm.
    if len(tables.tables) > 5:
        return []
    for n, table in enumerate(tables.tables):
        if self.draw:
            # Start each table from a clean page image and a blank white canvas.
            p_im.reset()
            # NOTE(review): the size is taken straight from page.width/page.height;
            # if those are not integers Image.new may reject them -- confirm.
            im = Image.new('RGB', (page.width, page.height), (255,) * 3)
            canvas = ImageDraw.ImageDraw(im)
        # Raw extraction of the table, handed to Table() below.
        ugly_table = table.extract()
        lines = []  # type: List[Line]
        cells = []  # type: List[Cell]
        for cell in tqdm(table.cells, desc='Parsing cells', unit='cells'):
            # p_im.draw_rect(cell)
            # Each entry of table.cells unpacks into the two opposite corners
            # of the cell rectangle.
            x1, y1, x2, y2 = cell
            # Build the four corner points.  The flags set on each corner name
            # the directions of its two neighbouring corners (assuming x grows
            # to the right and y grows downward -- TODO confirm).
            p1 = Point(x1, y1)
            p1.right = True
            p1.down = True
            p2 = Point(x2, y1)
            p2.left = True
            p2.down = True
            p3 = Point(x2, y2)
            p3.up = True
            p3.left = True
            p4 = Point(x1, y2)
            p4.up = True
            p4.right = True
            # The four edges of the rectangle, going p1 -> p2 -> p3 -> p4 -> p1.
            line1 = Line(p1, p2)
            line2 = Line(p2, p3)
            line3 = Line(p3, p4)
            line4 = Line(p4, p1)
            lines.append(line1)
            lines.append(line2)
            lines.append(line3)
            lines.append(line4)
            # The loop variable is rebound here from the raw tuple to a Cell object.
            cell = Cell(p1, p2, p3, p4)
            cells.append(cell)

        # for line in lines:
        #     p_im.draw_line(line.as_tuple)
        lines = self.filter_lines(lines)
        # for line in lines:
        #     line.draw(canvas, color='green')
        if self.draw:
            # Saved before the skeleton is built: the same file names are written
            # again further down, so these versions only remain on disk when the
            # `continue` below skips the table.
            p_im.save('page-{}-{}_im.png'.format(page_n + 1, n))
            im.save('page-{}-{}.png'.format(page_n + 1, n))
        # build_skeleton receives a shallow copy, so the `lines` list itself is
        # not modified by it.
        skeleton_points, skeleton = self.build_skeleton(lines.copy())
        if not skeleton_points:
            # No skeleton points for this table: skip it.
            continue
        skeleton = self.skeleton_to_2d_table(skeleton)

        # for p in points:
        #     p.draw(canvas)

        beaut_table = Table(cells, skeleton, ugly_table, page.extract_words())
        beaut_table.build_table()
        if self.draw:
            for cell in beaut_table.cells:
                cell.draw(canvas)
        if self.debug:
            print('Saving rendered table')
        if self.draw:
            # Overwrites the files saved before the skeleton step, now with the
            # table's cells drawn on the canvas.
            p_im.save('page-{}-{}_im.png'.format(page_n + 1, n))
            im.save('page-{}-{}.png'.format(page_n + 1, n))
        if self.draw:
            canvas.rectangle((0,0,page.width,page.height),fill='white') #cleaning canvas
            # Draw the skeleton grid, labelling every cell with "<row>-<column>".
            # NOTE(review): this overwrites cell.text on the skeleton cells after
            # beaut_table was built; if those objects are shared with beaut_table
            # the labels replace its text -- confirm.
            for row_id, row in enumerate(skeleton):
                for cell_id, cell in enumerate(row):
                    cell.text = '{}-{}'.format(row_id, cell_id)
                    cell.draw(canvas, color='green',text_color='red')
            im.save('page-{}-{}-skeleton.png'.format(page_n + 1, n))
        beaut_tables.append(beaut_table)

    return beaut_tables

I tried to install the missing delegate library: I installed libpng, and I have the latest version of Wand, but I'm still facing this issue. I also have a proper installation of ImageMagick and all the necessary libraries.

The libraries I'm using are: xlsxwriter, pdfplumber, pdfminer.six, pyparsing, Pillow, certifi==2018.8.13, chardet==3.0.4, idna==2.7, PyPDF3==0.0.4, requests==2.20.0, tqdm==4.25.0, urllib3==1.24.2

This code is used to convert the PDF into images and thus extract the merged columns and rows.

It is very difficult to answer your question without seeing any of your data nor any of the solution you have written which produces your problem. Please edit your question to show a minimal reproducible set consisting of sample input, expected output, actual output, and only the relevant code necessary to reproduce the problem. See [Minimal Reproducible Example](https://stackoverflow.com/help/minimal-reproducible-example "Minimal Reproducible Example") for details on how to best help us help you. — itprorh66, Jun 13 '23 at 12:47

Table extraction Using pdfPlumber

0 Answers