Create a link to a specific word count position such as bookmark in docx

Question

How this project works:

Searches external docx / OCR data for a keyword
Builds a context of 100 words surrounding the keyword
Builds a docx to store the passage with a hyperlink posted under each completed search

What is missing: a way to link each passage back to its location in the external source document in Word, so that you can simply follow a hyperlink to it. The problem is that the OCR docx files being read have no headings to which a bookmark for a run could be attached, and I could not create them for long OCR output, so going into the docx files one by one and reading text that is at times gibberish is not manageable.

So Word needs to be able to store the link in the new document, where the passage is printed. The hyperlink code below works, but I need something more than what I have here to find the passage's location in its source document, unless MS Word does not support something as specific as finding the indexed word position of a passage? Can I build a macro and call it from Python to create a link that jumps to that position using the word index?

Hyperlinking/bookmark code post ref:

def add_hyperlink(paragraph, text, url):
    """Append an external hyperlink displaying *text* to *paragraph*.

    A hyperlink relationship to *url* is registered on the paragraph's part,
    a ``w:hyperlink`` element holding one text run is built, and that element
    is appended inside a run newly added to *paragraph*.

    Returns the ``w:hyperlink`` element that was created.
    """
    # Registering the target in document.xml.rels yields the relationship id
    rel_id = paragraph.part.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )
    make_element = docx.oxml.shared.OxmlElement

    # <w:hyperlink r:id="..."> pointing at the relationship above
    link = make_element('w:hyperlink')
    link.set(docx.oxml.shared.qn('r:id'), rel_id)

    # <w:r> with an empty <w:rPr> child, carrying the visible link text
    text_run = make_element('w:r')
    text_run.append(make_element('w:rPr'))
    text_run.text = text
    link.append(text_run)

    # Host the hyperlink element inside a new run of the paragraph
    host_run = paragraph.add_run()
    host_run._r.append(link)

    # Stand-in for a missing hyperlink style (the link will not turn purple
    # once visited); drop these two lines when the template already defines
    # the hyperlink style
    host_run.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_run.font.underline = True

    return link

def extract_surround_words(text, keyword, n):
    """Collect the passages around *keyword* into a docx report.

    text : input text
    keyword : the search keyword we are looking for
    n : number of words kept on each side of the keyword

    ``text`` is split into ``\\w+`` tokens. For every token equal to
    ``keyword`` (exact, case-sensitive comparison) a passage of up to ``n``
    words before and ``n`` words after the match is added to a new document as
    its own paragraph, followed by a hyperlink whose text is the word index at
    which the passage starts. The document is saved once, after the scan, and
    only when at least one passage was found.

    Returns the passages joined by a blank line ("" when nothing matched).

    NOTE: relies on names defined outside this function: ``Document``,
    ``add_bookmark``, ``add_hyperlink``, ``path``, ``file``, ``output_path``
    and ``out_file_doc``.
    """
    # extracting all the words from text
    words = re.findall(r'\w+', text)

    document = Document()
    # Report the number of words, not len(text), which counts characters
    document.add_heading(
        "The keyword searched is: " + keyword
        + ", WORD COUNT: " + str(len(words)) + "\n",
        0,
    )

    passages = []
    for index, word in enumerate(words):
        if word != keyword:
            continue

        # Clamp at 0: a negative slice start would count from the end of the
        # list and yield an empty/wrong left context near the start of text
        start = max(0, index - n)
        passage_text = ' '.join(words[start:index + n + 1])
        passage_pos = "\nWORD COUNT POSITION: " + str(start) + "\n"

        # NOTE(review): add_bookmark is not defined in this snippet; the call
        # is kept as in the original, its return value was never used
        add_bookmark(index, passage_pos)

        print(passage_pos)

        # One paragraph per match, holding only that match's passage
        parag = document.add_paragraph(passage_text)
        add_hyperlink(parag, passage_pos, os.path.join(path, file))
        passages.append(passage_text)

    # Save once instead of rewriting the file after every match
    if passages:
        document.save(os.path.join(output_path, out_file_doc))

    return "\n\n".join(passages)

Create a link to a specific word count position such as bookmark in docx

0 Answers0