I'm trying to extract every single link from a PDF. I'm able to get every single hyperlink using this code:
folder = "test_folder"
folder_data = [os.path.join(dp, f) for dp, dn, filenames in os.walk(folder) for f in filenames if os.path.splitext(f)[1] == '.pdf']
data = [loc.replace("\\", "/") for loc in folder_data]
for loc in data:
doc = fitz.open(loc)
#color_check(doc, count)
file_name = loc.split("/")[-1]
print (f"INFO: Crawling over file {file_name}, number {count} of {len(data)}")
count += 1
for page in doc:
links = page.getLinks()
print(links)
for link in links:
uri_rect = []
uri_rect.append([round(link['from'][0], 2), round(link['from'][1], 2), round(link['from'][2], 2), round(link['from'][3], 2)])
words_in_document = page.getTextWords()
#print(links)
for word in words_in_document:
word_rect = []
word_rect.append([round(word[0], 2), round(word[1], 2), round(word[2], 2), round(word[3], 2)])
rect_dif_percentage = len(set(uri_rect[0])&set(word_rect[0])) / float(len(set(uri_rect[0]) | set(word_rect[0]))) * 100
if rect_dif_percentage >= 60:
#If link links to a file
try:
referenced_file_name = link['file'].split("/")[1]
referenced_file_path = link['file'].split("/")[0]
for file_loc in range(len(data)):
if referenced_file_name in data[file_loc]:
referenced_file_path = data[file_loc]
output.append([loc, word[4], referenced_file_path])
#If link links to a website
except:
referenced_file_name = "N/A"
referenced_file_path = link['uri']
output.append([loc, word[4], referenced_file_path])
with open("output.csv", "a", newline="") as f:
writer = csv.writer(f)
writer.writerows(output)
print("INFO: Crawling completed, you can close this window and check output.csv")
The issue is the following. If an hyperlink has more than one word, I won't be able to get the second word since I'm using the rectangle found in page.getLinks(), and this method only finds the first word of an hyperlink.
So for example, the following hyperlink: Click me!
My code would only be able to get the ''Click'' string.
What can I do to solve this problem? I'm stuck and I can't think of anything. Also, if you've another solution without using PyMuPDF, they're welcome!