This is my code:
import fitz
import pandas as pd
import pytesseract
from PIL import Image
# Extract every image embedded in the PDF, OCR each one with Tesseract, and
# collect the recognized text (one row per image) in a DataFrame.
#
# NOTE(review): a "KeyError: 'PNG'" raised from PIL's Image.save while
# pytesseract writes its temporary file usually means Pillow was
# installed/upgraded inside an already-running session, leaving the loaded
# module out of sync with the files on disk. Presumably restarting the
# runtime/kernel after the install clears it -- confirm in your environment.
text_list = []

# Open the PDF file using PyMuPDF; the context manager guarantees the
# document is closed even if OCR fails part-way through.
with fitz.open("file") as pdf_file:
    # Iterate through all the pages in the PDF
    for page_number in range(pdf_file.page_count):
        # Iterate through all the images in the page
        for img in pdf_file.get_page_images(page_number):
            xref = img[0]
            pix = fitz.Pixmap(pdf_file, xref)

            # More than 3 colour components (e.g. CMYK): convert to RGB so
            # the sample buffer matches a mode PIL can be told about below.
            if pix.n - pix.alpha > 3:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            # Drop the alpha channel: "RGB"/"L" expect exactly 3/1 bytes per
            # pixel, so an RGBA or gray+alpha buffer would make
            # Image.frombytes raise because of the size mismatch.
            if pix.alpha:
                pix = fitz.Pixmap(pix, 0)

            # After normalisation the pixmap is either RGB (n == 3) or
            # grayscale (n == 1).
            mode = "RGB" if pix.n >= 3 else "L"
            image = Image.frombytes(mode, (pix.width, pix.height), pix.samples)

            text = pytesseract.image_to_string(image)
            text_list.append(text)

df = pd.DataFrame(text_list)
print(df)
This is the error I get:
KeyError Traceback (most recent call last)
<ipython-input-103-a9f3c12268ea> in <module>
25 image = Image.frombytes("L", [pix.width, pix.height], pix.samples)
26
---> 27 text = pytesseract.image_to_string(image)
28 text_list.append(text)
29
5 frames
/usr/local/lib/python3.8/dist-packages/PIL/Image.py in save(self, fp, format, **params)
2121 """
2122
-> 2123 if resample is None:
2124 type_special = ";" in self.mode
2125 resample = Resampling.NEAREST if type_special else Resampling.BICUBIC
KeyError: 'PNG'
For more info, this is the table I'm trying to read (in the PDF file): https://drive.google.com/file/d/1pH93Ax4fsiPlq7KNtAAOpKuuGcoupHQ3/view?usp=share_link