I have this XML file :
<?xml version="1.0" encoding="UTF-8" standalone="true"?>
<component>
<Custom/>
<ID>1</ID>
<LongDescription>
<html><html> <head> <style type="text/css"> <!-- .style9 { color: #ffff33; background-color: #ff00ff } .style8 { color: #990099; background-color: #66ffcc } .style7 { color: #0066cc; background-color: #ccffcc } .style6 { color: #009900; background-color: #ffffcc } .style11 { color: #000066; background-color: #ccffcc } .style5 { color: #cc0033; background-color: #99ff99 } .style10 { color: #99ff99; background-color: #00cccc } .style4 { color: #cc0033; background-color: #ccffff } .style3 { color: #0000dd; background-color: teal } .style2 { color: #0000cc; background-color: aqua } .style1 { color: blue; background-color: silver } .style0 { color: #000099; background-color: #ffffcc } --> </style> </head> <body> </body> </html> </html>
</LongDescription>
<Name>ip_bridge</Name>
</component>
I am reading this file with the xml.etree.ElementTree library, as follows:
def getTokens(xml_string_file):
    """Parse an XML document and flatten it into a list of string tokens.

    Each direct child of the root contributes an opening-tag token, its
    content, and a closing-tag token. The whole list is wrapped in the
    literal tokens '<component>' and '</component>', whatever the root
    element is actually called.

    Args:
        xml_string_file: Anything ``ET.parse`` accepts as a source (a file
            name or a file object containing XML).

    Returns:
        A list of strings. A childless element contributes its text (or ''
        when it has none) as one token; an element with children contributes
        the tokens produced by ``extractTags`` for it.
    """
    root = ET.parse(xml_string_file).getroot()
    tokens = ['<component>']
    for child in root:
        tokens.append(f'<{child.tag}>')
        if len(child):
            # Recurse only when there really are child elements. Testing the
            # text for a newline instead loses multi-line text of a leaf and
            # loses the children of compactly written XML.
            # Text mixed in before or after child elements is not emitted.
            tokens.extend(extractTags(root=child))
        else:
            # Leaf element: its text, whether single- or multi-line, is one
            # token; a missing text becomes the empty string.
            tokens.append(child.text or '')
        tokens.append(f'</{child.tag}>')
    tokens.append('</component>')
    return tokens
The helper function extractTags is defined as follows:
def extractTags(root):
    """Flatten the descendants of *root* into a list of string tokens.

    Every child element contributes an opening-tag token, its content, and a
    closing-tag token, in document order. *root* itself is not emitted.

    Args:
        root: An element whose children are iterated; each child must expose
            ``tag`` and ``text`` and be iterable over its own children.

    Returns:
        A list of strings. A childless element contributes its text (or ''
        when it has none) as one token; an element with children contributes
        the tokens of those children, recursively.
    """
    tokens = []
    for child in root:
        tokens.append(f'<{child.tag}>')
        if len(child):
            # Recurse only when there really are child elements. Testing the
            # text for a newline instead loses multi-line text of a leaf and
            # loses the children of compactly written XML.
            # Text mixed in before or after child elements is not emitted.
            tokens.extend(extractTags(root=child))
        else:
            # Leaf element: its text, whether single- or multi-line, is one
            # token; a missing text becomes the empty string.
            tokens.append(child.text or '')
        tokens.append(f'</{child.tag}>')
    return tokens
As a result, I get the following token list: ['<component>', '<custom>', '', '</custom>', '<ID>', '1', '</ID>', '<LongDescription>', '<html>', '</html>', '</LongDescription>', '<Name>', 'ip_bridge', '</Name>', '</component>']
I also want to extract everything between the html tags as a single token (one text string).