Iterparse big XML, with low memory footprint, and get all, even nested, Sequence Elements

Question

I have written a small python script to parse XML data based on Liza Daly's blog in Python. However, my code does not parse all the nodes. So for example when a person has had multiple addresses then it takes only the first available address. The XML tree would look like this:

- lgs
    - entities
        - entity
            - id
            - name
            - addressess
                - address
                    - address1
                - address
                    - address1
        - entity
            - id
              (...)

and this would be the python script:

import os
import time
from datetime import datetime
import lxml.etree as ET
import pandas as pd

xml_file = '.\\FILE.XML'

file_name, file_extension = os.path.splitext(os.path.basename(xml_file))

def fast_iter(context, *args, **kwargs):
# iterate through the XML file and create a dictionary 
    xml_dict = {  "Id":[]
    , "name":[]
    , "address":[]
    , "type":[] }

    def try_to_append(xml_column, node):
        # find and append XML nodes to the empty dictionary
        try:
            xml_dict[xml_column].append(elem.find(node).text)
        except:
            xml_dict[xml_column].append('')

    for event, elem in context:
        try_to_append('Id', 'id')
        try_to_append('name', 'name')
        try_to_append('address', 'addresses/address/address1'
        elem.clear()

        for ancestor in elem.xpath("ancestor-or-self::*"):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        del context
        return xml_dict

context = ET.iterparse(xml_file, tag='entity')
xml_dict = fast_iter(context)
df = pd.DataFrame(xml_dict)

So what I get is:

ID | name     | address        | Type
1  | John Doe | Pythonstreet 4 | A

But John Doe has 3 addresses on his name where he has lived. So I would expect:

ID | name     | address        | Type
1  | John Doe | Pythonstreet 4 | A
1  | John Doe | SQL street 33  | A
1  | John Doe | C++ street 99  | A

And as you can see, because the only extra node is address, the rest does not change and only the address is added extra. SQL Server would do this automatically when you use OPENROWSET() with BULK. However, this file is greater than 3GB so that wouldn't work.

UPDATE: XML Example (fake addresses)

<?xml version='1.0' encoding='UTF-8'?>
<ffl>
<version>12345</version>
<entities>

    <entity id="1124353" version="12345">
        <name>DAVID, Beckham</name>
        <footId>1021</footId>
        <footCode>FIF</footCode>
        <createdDate>09/02/1991</createdDate>

        <source>FIF</source>
        <OriginalSource>FIFA</OriginalSource>
        <pobs>
            <pob>Leytonstone, London, United Kingdom</pob>
        </pobs>
        <dobs>
            <dob>May 02, 1975</dob>
        </dobs>
        <titles>
            <title>Football player</title>
        </titles>
        <addresses>
            <address>
                <address></address>
                <city>London</city>
                <country>UK</country>
                <countryName>UNITED KINGDOM</countryName>
                <postalCode>SE10 0JF</postalCode>
            </address>
            <address>
                <address1>35-37 Parkgate Road</address1>
                <city>London</city>
                <country>UK</country>
                <countryName>UNITED KINGDOM</countryName>
                <postalCode>SW11 4NP</postalCode>
            </address>
        </addresses>
    </entity>

    <entity id="1184359" version="12345">
        <name>CRISTIANO, Ronaldo</name>
        <footId>1022</footId>
        <footCode>FIF</footCode>
        <createdDate>20/03/2003</createdDate>
        <source>FIF</source>
        <OriginalSource>FIFA</OriginalSource>
        <pobs>
            <pob>Funchal, Madeira, Portugal</pob>
        </pobs>
        <dobs>
            <dob>February 05, 1985</dob>
        </dobs>
        <titles>
            <title>Football player</title>
        </titles>
        <addresses>
            <address>
                <address>Avenida da Boavista 1837</address>
                <city>Porto</city>
                <country>PT</country>
                <countryName>PORTUGAL</countryName>
                <postalCode>4100-133</postalCode>
            </address>
            <address>
                <address1>Extramuros 74</address1>
                <city>Madrid</city>
                <country>ES</country>
                <countryName>Spain</countryName>
                <postalCode>28400</postalCode>
            </address>
            <address>
                <address1>Viale Certosa 29</address1>
                <city>Turin</city>
                <country>IT</country>
                <countryName>Italy</countryName>
                <postalCode>10093</postalCode>
            </address>
        </addresses>
    </entity>

    <entity id="1984359" version="12345">
        <name>LIONEL, Messi</name>
        <footId>1023</footId>
        <footCode>FIF</footCode>
        <createdDate>09/02/2008</createdDate>
        <source>FIF</source>
        <OriginalSource>FIFA</OriginalSource>
        <pobs>
            <pob>Rosario, Argentina</pob>
        </pobs>
        <dobs>
            <dob>June 24, 1987</dob>
        </dobs>
        <titles>
            <title>Football player</title>
        </titles>
        <addresses>
            <address>
                <address>Almeyra 2588</address>
                <city>San Martin</city>
                <state>Buenos Aires</state>
                <country>AR</country>
                <countryName>ARGENTINA</countryName>
                <postalCode>N/A</postalCode>
            </address>
            <address>
                <address1>Comandante Izarduy 67</address1>
                <city>Barcelona</city>
                <country>ES</country>
                <countryName>SPAIN</countryName>
                <postalCode>08940</postalCode>
            </address>
            <address>
                <address1>Humahuaca 4425</address1>
                <city>Buenos Aires</city>
                <country>AR</country>
                <countryName>ARGENTINA</countryName>
                <postalCode>N/A</postalCode>
            </address>
        </addresses>
    </entity>

</entities>

</ffl>

Instead of mixin `iterparse` with `.find(...` use `iterparse` like described here [element-iterparse.htm](http://effbot.org/zone/element-iterparse.htm). — stovfl, Dec 14 '18 at 10:20
I looked at it, but what would I need to change in my code then so that it finds all addresses? — mrPy, Dec 14 '18 at 10:50
*"what would I need to change"*: **Your whole approach**. You have to decide **1.** or **2.**: **1.** Get XML elements while parsing, then try the pattern descibed in [Incremental Parsing](http://effbot.org/zone/element-iterparse.htm#incremental-parsing). **2.** Search XML elements in the parsed XML Tree, then use `xpath/.findall/.find` etc. — stovfl, Dec 14 '18 at 11:00
Would be helpful if you could write some code, cause I can't figure it out.. — mrPy, Dec 17 '18 at 09:12
@stofvl, I added some XML data. Is this what you wanted? And is it possible that the entity ID is also parsed, as it proves more difficult, while you're at it.. Thank you for all the help so far! — mrPy, Dec 19 '18 at 14:03
*"entity ID"*: OK done. The XML data look a way more complicated. Did you realy want to **flatten** the records like in your *"John Doe"* exampel, means **X** `
` result in x **duplicated** records? As your target is a SQL DB why not using a relation using `entity id` as `PRIMARY KEY`? — stovfl, Dec 19 '18 at 15:49
Well it would be better for my purposes to have duplicated records based on a different address. But if you mean that you can put the duplicated records in a different table and I will be able to LEFT JOIN them, then that's okay too! — mrPy, Dec 21 '18 at 09:42

stovfl · Accepted Answer · 2018-12-21T14:00:18.307

Comment: As it now only outputs results

Outputing results are only for demonstration, tracing and debuging.
To write a record and addresses into a SQL database, for example using sqlite3, do:

c.execute("INSERT INTO entity(id, name) VALUES(:id, :name)", record)
addresses = []
for addr in record['addresses']:
    addr[1].update({'id': record['id']})
    addresses.append(addr[1])
c.executemany("INSERT INTO adresses(id, address, city) VALUES(:id, :address, :city)", addresses)

To flatten for pandas
Preconditon outside the loop: df = pd.DataFrame()

from copy import copy

addresses = copy(record['addresses'])
del record['addresses']

df_records = []
for addr in addresses:
    record.update(addr[1])
    df_records.append(record)

df = df.append(df_records, ignore_index=True)

Question: Use etree.iterparse to include all nodes in XML file

The following class Entity do:

Parse the XML File using lxml.etree.iterparse.
There is no File size limit, as the <entity>...</entity> Element Tree are deleted after processing.
Builds from every <entity>...</entity> Tree a dict {tag, value, ...}.
Using of generator objects to yield the dict.
Sequence Elements, e.g. <addresses>/<address> are List of Tuple [(address, {tag, text})....

ToDo:

To flatten into many Records, loop record['addresses']

To equal different tag names: address and address1

To flatten, Sequence tags, e.g. <titels>, <probs> and <dobs>

from lxml import etree

class Entity:
    def __init__(self, fh):
        """
        Initialize 'iterparse' to only generate 'end' events on tag '<entity>'

        :param fh: File Handle from the XML File to parse
        """
        self.context = etree.iterparse(fh, events=("end",), tag=['entity'])

    def _parse(self):
        """
        Parse the XML File for all '<entity>...</entity>' Elements
        Clear/Delete the Element Tree after processing

        :return: Yield the current '<entity>...</entity>' Element Tree
        """
        for event, elem in self.context:
            yield elem

            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]

    def sequence(self, elements):
        """
        Expand a Sequence Element, e.g. <titels> to a Tuple ('titel', text).
        If found a nested Sequence Element, e.g. <address>,
          to a Tuple ('address', {tag, text})

        :param elements: The Sequence Element
        :return: List of Tuple [(tag1, value), (tag2, value), ... ,(tagn, value))
        """
        _elements = []
        for elem in elements:
            if len(elem):
                _elements.append((elem.tag, dict(self.sequence(elem))))
            else:
                _elements.append((elem.tag, elem.text))

        return _elements

    def __iter__(self):
        """
        Iterate all '<entity>...</entity>' Element Trees yielded from self._parse()

        :return: Dict var 'entity' {tag1, value, tag2, value, ... ,tagn, value}}
        """
        for xml_entity in self._parse():
            entity = {'id': xml_entity.attrib['id']}

            for elem in xml_entity:
                # if elem is Sequence
                if len(elem):
                    # Append tuple(tag, value)
                    entity[elem.tag] = self.sequence(elem)
                else:
                    entity[elem.tag] = elem.text

            yield entity

if __name__ == "__main__":
    with open('.\\FILE.XML', 'rb') as in_xml_
        for record in Entity(in_xml):
            print("record:{}".format(record))

            for key, value in record.items():
                if isinstance(value, (list)):
                    #print_list(key, value)
                    print("{}:{}".format(key, value))
                else:
                    print("{}:{}".format(key, value))

Output: Shows only the first Record and only 4 fields.
Note: There is a pitfall with unique tag names: address and address1

record:{'id': '1124353', 'titles': {'title': 'Foot... (omitted for brevity)
id:1124353
name:DAVID, Beckham
titles:[('title', 'Football player')]
addresses:
    address:{'city': 'London', 'address': None, 'post... (omitted for brevity)
    address:{'city': 'London', 'address1': '35-37 Par... (omitted for brevity)

Tested with Python: 3.5 - lxml.etree: 3.7.1

First of all thank you stovfl! Secondly, how does this return a dataframe that can be used in further analysis? As it now only outputs results from the iteration right? — mrPy, Dec 21 '18 at 11:36
Thanks a lot. I think this is going to help a lot of other people too! — mrPy, Dec 21 '18 at 14:13

Iterparse big XML, with low memory footprint, and get all, even nested, Sequence Elements

UPDATE: XML Example (fake addresses)

1 Answers1

Linked