import fitz # PyMuPDF library
# Load the PDF document
pdf_path = "./FAQ.pdf"
pdf_document = fitz.open(pdf_path)
# Initialize a dictionary to store text and hyperlink pairs
text_with_links = {}
# Iterate through the pages of the PDF document
for page_num in range(pdf_document.page_count):
page = pdf_document.load_page(page_num)
# Extract text annotations from the page
text = page.get_text("text")
links = page.get_links()
for link in links:
if "uri" in link:
if text not in text_with_links:
text_with_links[text] = []
text_with_links[text].append(link["uri"])
# Create a new PDF with updated links next to text
new_pdf_path = "document.pdf"
new_pdf_document = fitz.open()
for text, links in text_with_links.items():
text_with_links_text = f"{text}\n\nLinks: {', '.join(links)}"
page = new_pdf_document.new_page()
page.insert_text((100, 100), text_with_links_text)
# Save the new PDF document
new_pdf_document.save(new_pdf_path)
new_pdf_document.close()
# Close the original PDF document
pdf_document.close()
print("New PDF with updated links has been created.")
What I want is Get in touch/("http://get_intouch.com/)Feedback ("http//feedback.com"/) the hyperlinks to be extracted and populated next to the word which has link , why I need this is because I am doing it as a hack to pass it to my model , please help me my code sucks to add link and whole pdf is distorted