I am trying to extract images from PDF files using code I found on StackOverflow. It works fine for some of the PDFs but not for all of them. I noticed a pattern: for PDFs with more than about 8-10 pages, nothing is extracted. I think I am missing something small here. Please help me figure out what is wrong. This is the code I am using, and here is the link to the PDF resources:
import PyPDF2
import sys
from PIL import Image
import os
import glob
from PyPDF2 import PdfFileReader
def ExtractImages(filename, first_page=2):
    """Save the images embedded in a PDF to the current working directory.

    Every page from ``first_page`` onward is inspected for image XObjects
    listed directly in that page's ``/Resources``. Images without a filter,
    or with ``/FlateDecode``, are rebuilt with Pillow and saved as PNG;
    ``/DCTDecode``, ``/JPXDecode`` and ``/CCITTFaxDecode`` streams are written
    verbatim as ``.jpg``, ``.jp2`` and ``.tiff``. Images using any other
    filter are reported and skipped.

    Output files are named ``<pdf name>_page<index>_<image name><ext>``.
    The PDF name and page index are part of the name because image resource
    names (e.g. ``Im0``) are commonly reused from page to page and from file
    to file, so naming by resource name alone lets later images overwrite
    earlier ones.

    Args:
        filename: Path of the PDF file to read.
        first_page: Zero-based index of the first page to inspect. The
            default of 2 keeps the historical behaviour of skipping the
            first two pages; pass 0 to process the whole document.

    Returns:
        None. Progress is printed and images are written to disk.
    """
    print("\n---------------------------------------")
    print("This is the pdf processing",filename)

    # Prefix that keeps output from different PDFs apart.
    pdf_stem = os.path.splitext(os.path.basename(filename))[0]

    # Filters whose stream data is written to disk unchanged, and the file
    # extension used for each.
    # NOTE(review): raw CCITT fax data usually needs a TIFF header before
    # viewers will open it -- confirm the ".tiff" output is usable.
    raw_extensions = (
        ('/DCTDecode', ".jpg"),
        ('/JPXDecode', ".jp2"),
        ('/CCITTFaxDecode', ".tiff"),
    )

    # All page/object access happens inside this block so the reader always
    # has an open stream, and the handle is closed even if extraction fails.
    with open(filename, "rb") as pdf_stream:
        fileObject = PyPDF2.PdfFileReader(pdf_stream)
        print(fileObject)
        pages = fileObject.getNumPages()
        print("Total number of Pages is.....",pages)

        for page_index in range(first_page, pages):
            page = fileObject.getPage(page_index)

            # A page may have no resources at all, or resources without
            # any XObjects; both mean there is nothing to extract here.
            if '/Resources' not in page or '/XObject' not in page['/Resources']:
                print("No image found for file.",filename)
                continue

            xObject = page['/Resources']['/XObject'].getObject()
            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue

                size = (image['/Width'], image['/Height'])
                data = image.getData()

                # Not every image carries /ColorSpace; treat a missing entry
                # like any other non-RGB colour space instead of raising.
                if '/ColorSpace' in image and image['/ColorSpace'] == '/DeviceRGB':
                    mode = "RGB"
                else:
                    mode = "P"

                # obj looks like "/Im0"; drop the leading slash.
                out_base = "%s_page%d_%s" % (pdf_stem, page_index, obj[1:])

                if '/Filter' not in image:
                    Image.frombytes(mode, size, data).save(out_base + ".png")
                    continue

                pdf_filter = image['/Filter']
                # /Filter may be a single name or an array of names; unwrap
                # the one-element array so it is handled like a plain name.
                if isinstance(pdf_filter, list) and len(pdf_filter) == 1:
                    pdf_filter = pdf_filter[0]

                if pdf_filter == '/FlateDecode':
                    Image.frombytes(mode, size, data).save(out_base + ".png")
                    continue

                extension = None
                for filter_name, candidate in raw_extensions:
                    if pdf_filter == filter_name:
                        extension = candidate
                        break

                if extension is None:
                    # Make the skip visible rather than silently dropping
                    # the image.
                    print("Skipping image", obj, "on page", page_index,
                          "- unsupported filter", pdf_filter)
                    continue

                with open(out_base + extension, "wb") as out_file:
                    out_file.write(data)
# Run the extractor over every PDF found in the current working directory.
pdf_paths = glob.glob('./*.pdf')
for pdf_path in pdf_paths:
    ExtractImages(pdf_path)