My files are normally over 8 GB, so I used Liza Daly's fast_iter approach. Even so, processing hadn't finished after almost two days, and cProfile shows that xpath accounts for nearly half of the runtime. How can I speed it up?
Here's an example of my pdml files:
<pdml>
<packet>
<proto name="geninfo">
<field name="timestamp" value="1"/>
</proto>
<proto name="frame">
<field name="b1"/>
</proto>
</packet>
</pdml>
Here's my solution:
from lxml import etree
# Stream the file one <packet> element at a time instead of parsing the whole
# document; huge_tree lifts libxml2's default depth/size safety limits, which
# multi-GB inputs can exceed.
context = etree.iterparse(infilename,tag='packet', huge_tree=True)
# NOTE(review): as shown, fast_iter (and process_packet) are defined *below*
# this call — running the file top to bottom would raise NameError. Presumably
# the real file defines them first; confirm the ordering.
my_dict = fast_iter(context, process_packet)
def fast_iter(context, func, *args, **kwargs):
    """Consume an iterparse context, grouping ``func``'s results by key.

    For each ``<packet>`` element that has both a ``geninfo`` and a
    ``frame`` ``<proto>`` child, call ``func(elem, *args, **kwargs)``.
    ``func`` must return an indexable pair; item ``[0]`` is used as the
    grouping key and item ``[1]`` is appended to that key's list.

    Returns:
        dict mapping key -> list of values, one entry per kept packet.

    Memory: every element is cleared and already-processed earlier
    siblings are deleted as we go, so the in-memory tree stays flat on
    multi-GB inputs.

    Performance: the original ran two ``xpath()`` queries per packet,
    which dominated the cProfile output. A single Python pass over the
    direct children is all the geninfo/frame filter needs.
    """
    my_dict = {}
    for _event, elem in context:
        # One pass over the direct children replaces both xpath() lookups.
        proto_names = {child.get('name') for child in elem
                       if child.tag == 'proto'}
        if 'geninfo' in proto_names and 'frame' in proto_names:
            item = func(elem, *args, **kwargs)
            # dict.has_key() was removed in Python 3; setdefault handles
            # both the new-key and existing-key cases in one call.
            my_dict.setdefault(item[0], []).append(item[1])
        _reclaim(elem)
    del context
    return my_dict


def _reclaim(elem):
    # Free this element's subtree and any fully-processed earlier siblings
    # so the tree never grows past the current packet (Liza Daly's idiom).
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
def process_packet(elem):
    # Build the per-packet record handed back to fast_iter; item [0] is the
    # grouping key, item [1] the value (body is elided with `...` here).
    temp_list = None
    # NOTE(review): `unicode` is Python 2 only — use str() on Python 3.
    # NOTE(review): compiling a fresh XPath per packet is the profiled hot
    # spot. elem.find("proto[@name='geninfo']/field[@name='timestamp']")
    # (ElementPath) or a plain loop over children is typically much cheaper
    # per call — measure. Also, [0] raises IndexError if the timestamp field
    # is missing; fast_iter only guarantees the geninfo proto exists.
    timestamp = unicode(elem.xpath("proto[@name='geninfo']/field[@name='timestamp']/@value")[0])
    ...
    temp_list = [timestamp, ...]
    return temp_list