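Here is a hackish way to do this, using SAX. It keeps the character data found inside your `<text>` elements: nested tags such as `<h1>` or `<b>` are dropped, but the text they contain is kept. It gets more complicated if you also need to keep the tags and attributes that appear inside those `<text>` elements, though.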
from xml.sax import handler, make_parser
class CustomContentHandler(handler.ContentHandler):
    """SAX content handler that prints the character data of ``<text>`` elements.

    Markup nested inside a ``<text>`` element is discarded, but the character
    data of those nested elements is kept, so ``<text>a<b>b</b></text>``
    yields ``ab``.

    Attributes:
        inside_text_tag: True while the parser is between a ``<text>`` start
            tag and its matching end tag.
        text_content: Chunks of character data collected for the ``<text>``
            element currently being read.
        text: Joined content of the most recently closed ``<text>`` element
            (empty string until one has been closed).
    """

    def __init__(self):
        # Explicit base-class call rather than super() so the handler also
        # runs on Python 2.
        handler.ContentHandler.__init__(self)
        self.inside_text_tag = False
        self.text_content = []
        self.text = ''

    def startElement(self, name, attrs):
        """Begin collecting character data when a ``<text>`` element opens."""
        if name == 'text':
            self.inside_text_tag = True
            # Start from an empty buffer; otherwise a second <text> element
            # would be printed with the first element's text prepended.
            self.text_content = []

    def endElement(self, name):
        """Join and print the collected text when a ``<text>`` element closes."""
        if name == 'text':
            self.inside_text_tag = False
            self.text = ''.join(self.text_content)
            # Parenthesised form is valid on both Python 2 and Python 3.
            print(self.text)

    def characters(self, content):
        """Store a chunk of character data if it lies inside ``<text>``.

        The parser may deliver one run of text in several calls, so chunks
        are accumulated and only joined in ``endElement``.
        """
        if self.inside_text_tag:
            self.text_content.append(content)
def parse_file(filename):
    """Parse the XML file at *filename* with a ``CustomContentHandler``.

    The handler prints the character data of every ``<text>`` element it
    encounters.

    Args:
        filename: Path of the XML file to read.

    Raises:
        IOError / OSError: If the file cannot be opened.
        xml.sax.SAXParseException: If the document is not well-formed.
    """
    parser = make_parser()
    parser.setContentHandler(CustomContentHandler())
    # Binary mode lets the XML parser detect the document's own encoding
    # instead of decoding with the locale default; the with-block closes the
    # file even when parsing raises.
    with open(filename, "rb") as f:
        parser.parse(f)
if __name__ == "__main__":
    # Script entry point: run the parser against the bundled sample document.
    parse_file("sample.xml")
Used against the following sample.xml file:
<tag1>
<tag2>
<title>XML</title>
<text>
Text001
<h1>Header</h1>
Text002
<b>Text003</b>
</text>
</tag2>
</tag1>
would yield
Text001
Header
Text002
Text003