I am trying the following code to extract the destination or uri of hyperlinks in PDF file via PyPDF2, but I encountered some encoded destinations. I tried to decompress them by filter.decodeStreamData()
; however, my result is still unreadable :(
I am not able to solve the problem, please help.
import PyPDF2
import zlib
from PyPDF2 import filters
PDFFile = open()
PDF = PyPDF2.PdfFileReader(PDFFile)
pages = PDF.getNumPages()
annotatn = '/Annots'
hyperlink = '/A'
ob = {'/URI', '/D'}
for page in range(pages):
print("Current Page: {}".format(page+1))
pageSliced = PDF.getPage(page)
pageObject = pageSliced.getObject()
if annotatn in pageObject.keys():
i = 0
for annot in pageObject[annotatn]:
i = i + 1
print("Link {}".format(i))
ann = annot.getObject()
if '/Dest' in ann.keys():
dest = ann['/Dest'][0].getObject()
data = dest['/Contents'].getObject()
s = data._data
contnt = filters.decodeStreamData(data)
if hyperlink in ann.keys():
links = ann[hyperlink]
for link in links:
if link in ob:
print(ann[hyperlink][link])