A .docx file is a zip archive, so you can first extract its main XML part (word/document.xml) and then use a regex to capture the superscript and subscript values.
Here's an example:
import re
import zipfile
try:
from xml.etree.cElementTree import XML
except ImportError:
from xml.etree.ElementTree import XML
def get_docx_xml(path):
    """Return the raw contents of ``word/document.xml`` from a docx file.

    A .docx file is a zip archive; the main document body is stored in the
    ``word/document.xml`` member. The member is returned exactly as stored
    (``bytes`` on Python 3), without any decoding or parsing.

    Args:
        path: Path of the .docx file to read.

    Returns:
        The undecoded content of ``word/document.xml``.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        zipfile.BadZipFile: If *path* is not a valid zip archive.
    """
    # The context manager closes the archive even when read() raises.
    with zipfile.ZipFile(path) as document:
        return document.read('word/document.xml')
def _vert_align_pattern(kind):
    """Compile the regex matching runs whose ``w:vertAlign`` value is *kind*."""
    # Group 1: text of the vertically aligned run.
    # Group 2: leading whitespace plus first word of the next
    #          ``xml:space="preserve"`` text element.
    # ``[^\s<]`` keeps both groups inside their <w:t> element; a plain ``\S``
    # could run past ``</w:t>`` into the following markup.
    return re.compile(
        r'<w:vertAlign w:val="' + kind + r'"/><w:lang w:val="[\S\s]*?"/></w:rPr>'
        r'<w:t>([^\s<]+)</w:t></w:r>'
        r'[\s\S]*?<w:t xml:space="preserve">(\s*[^\s<]*)[\s\S]*?</w:t></w:r>'
    )


_SUPERSCRIPT_RE = _vert_align_pattern("superscript")
_SUBSCRIPT_RE = _vert_align_pattern("subscript")


def get_superscript_subscript(xml_content):
    """Find superscript and subscript runs in WordprocessingML markup.

    Only runs whose ``<w:vertAlign>`` element is immediately followed by a
    ``<w:lang>`` element are recognised, and a later text element carrying
    ``xml:space="preserve"`` must exist for a match to be produced.

    Args:
        xml_content: The document XML, as ``str`` or as ``bytes`` (the form
            returned by ``zipfile.ZipFile.read``).

    Returns:
        A dict with the keys ``"superscript"`` and ``"subscript"``. Each value
        is a list of ``(text, following_word)`` tuples, where ``text`` is the
        superscript/subscript text and ``following_word`` is the first word
        (with its leading whitespace) of the next preserved-space text.
    """
    if isinstance(xml_content, bytes):
        # A str pattern cannot search bytes on Python 3.
        # NOTE(review): assumes the part is UTF-8 encoded -- confirm for
        # documents produced by unusual tools.
        xml_content = xml_content.decode("utf-8")
    return {
        "superscript": _SUPERSCRIPT_RE.findall(xml_content),
        "subscript": _SUBSCRIPT_RE.findall(xml_content),
    }
if __name__ == '__main__':
    import sys

    # The .docx file to inspect is given as the single command-line argument.
    if len(sys.argv) != 2:
        sys.exit("usage: python %s <docx_file_path>" % sys.argv[0])
    xml_content = get_docx_xml(sys.argv[1])
    superscripts_subscripts = get_superscript_subscript(xml_content)
    # Show the result so the script produces visible output.
    print(superscripts_subscripts)
The output is a dictionary whose values are lists of tuples.
In each tuple, the first item is the superscript/subscript text and the second item is the first word that follows it:
{'subscript': [('28', ')'), ('28', ' score'), ('28', ' were'), ('28', ' sum'), ('28', ' and'), ('28', ' score'), ('28', ')')],
'superscript': [('28', ')'), ('28', ' score'), ('28', ' were'), ('28', ' sum'), ('28', ' and'), ('28', ' score'), ('28', ')')]}