How this project works:
- Searches external docx / OCR data for a keyword
- Builds a context of 100 words surrounding the keyword
- Builds a docx to store the passage with a hyperlink posted under each completed search
What is missing: a way to link each passage back to its location in the external source document in Word, so that you can simply follow a hyperlink to it. The problem is that the OCR docx files being read have no headings against which to bookmark a run, and I could not create them for long OCR output, so it is not manageable to go into the docx files one by one, reading what is at times gibberish.
So Word needs to be able to store the solution in the new document where the passage is printed. This hyperlink code works, but I need something more than what I have here to find each passage's location in its source, unless MS Word does not support such a specific function as finding the indexed word position of a passage. Can I build a macro and call it from Python to make a link and jump to the passage's position using the index?
Hyperlinking/bookmark code post ref:
def add_hyperlink(paragraph, text, url):
    """Append an external hyperlink displaying ``text`` to ``paragraph``.

    A ``w:hyperlink`` element holding one ``w:r`` run is built by hand and
    attached inside a freshly added run of ``paragraph``.

    Args:
        paragraph: The paragraph that receives the link (presumably a
            python-docx ``Paragraph`` -- it must expose ``part`` and
            ``add_run``).
        text: The visible link text.
        url: The link target, registered as an external relationship.

    Returns:
        The ``w:hyperlink`` element that was created.
    """
    make_element = docx.oxml.shared.OxmlElement

    # Register the target with the paragraph's part; the returned
    # relationship id is what the w:hyperlink element refers to.
    relationship_id = paragraph.part.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    link_element = make_element('w:hyperlink')
    link_element.set(docx.oxml.shared.qn('r:id'), relationship_id)

    # Inner run: an empty w:rPr first, then the visible text.
    text_run = make_element('w:r')
    text_run.append(make_element('w:rPr'))
    text_run.text = text
    link_element.append(text_run)

    # Host run that carries the hyperlink element inside the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link_element)

    # Workaround for documents without a hyperlink style: colour and
    # underline the host run directly (the link will not turn purple after
    # being visited). Drop this when the template has a hyperlink style.
    host_font = host_run.font
    host_font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_font.underline = True

    return link_element
def extract_surround_words(text, keyword, n):
    """Save every passage surrounding ``keyword`` to a docx and return them.

    ``text`` is split into runs of word characters. For each word equal to
    ``keyword`` (exact, case-sensitive comparison) a passage of up to ``n``
    words on either side is written as its own paragraph in a new document,
    followed by a hyperlink whose label is the passage's starting word
    position. The document is saved once all matches have been processed.

    Relies on names defined outside this function: ``Document``,
    ``add_bookmark``, ``add_hyperlink``, ``path``, ``file``, ``output_path``
    and ``out_file_doc``.

    Args:
        text: The input text to search.
        keyword: The word to look for.
        n: Number of context words to keep on each side of the keyword;
            negative values are treated as 0.

    Returns:
        All passages concatenated into one string, every word preceded by a
        single space; the empty string when the keyword never occurs.
    """
    n = max(n, 0)
    words = re.findall(r'\w+', text)

    document = Document()
    # Report the number of words (not characters) and the keyword that was
    # actually passed in.
    document.add_heading(
        "The keyword searched is: " + keyword
        + ", WORD COUNT: " + str(len(words)) + "\n",
        0,
    )

    passage_texts = []
    for index, word in enumerate(words):
        if word != keyword:
            continue

        # Clamp the window start: a negative slice start would count from
        # the end of the list and drop the left-hand context for matches
        # within the first n words.
        start = max(index - n, 0)
        passage = words[start:index + n + 1]

        passage_pos = "\nWORD COUNT POSITION: " + str(start) + "\n"
        # add_bookmark is defined elsewhere; its effect is not visible here,
        # so the call is kept exactly as the original made it.
        add_bookmark(index, passage_pos)
        print(passage_pos)

        # One paragraph per match, holding only this match's passage.
        passage_text = ' ' + ' '.join(passage)
        parag = document.add_paragraph(passage_text)
        add_hyperlink(parag, passage_pos, os.path.join(path, file))
        passage_texts.append(passage_text)

    document.save(os.path.join(output_path, out_file_doc))
    return ''.join(passage_texts)