I'm trying to load the multistream Wikipedia dump into a PostgreSQL database. This is my attempt at reading the smaller chunks in parallel. Here's the script:
#!/usr/bin/python3
import xml.sax
from bz2 import BZ2File
import mwparserfromhell
import psycopg2
import pathos
import os
import dill
class XmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def get_pages(self):
        return self._pages

    def get_page_count(self):
        return len(self._pages)

    def get_values(self):
        return self._values

    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text', 'infobox'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            self.process_article()

    def process_article(self):
        wikicode = mwparserfromhell.parse(self._values['text'])
        infobox_array = wikicode.filter_templates(matches="infobox .*")
        infobox = str(infobox_array[0]) if len(infobox_array) > 0 else ""
        self._pages.append((self._values['title'], self._values['text'], infobox))


def load_xml(filename):
    wiki_handler = XmlHandler()
    wiki_parser = xml.sax.make_parser()
    wiki_parser.setContentHandler(wiki_handler)
    file = os.path.join("chunks", filename)
    print("I'm a worker process")
    cursor = conn.cursor()
    with BZ2File(file, 'r') as f:
        for line in f:
            wiki_parser.feed(line)
    pages = wiki_handler.get_pages()
    for page in pages:
        cursor.execute("INSERT INTO pages (title, text, infobox) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", page)
    cursor.close()
    print("all done")


if __name__ == "__main__":
    conn = psycopg2.connect(dbname="wikipedia",
                            user="postgres",
                            password="postgres",
                            host="localhost",
                            port=5432)
    file_list = [f for f in os.listdir("chunks") if os.path.isfile(os.path.join("chunks", f))]
    pool = pathos.multiprocessing.ProcessingPool(processes=pathos.multiprocessing.cpu_count())
    pool.map(load_xml, file_list)
And the traceback:
Traceback (most recent call last):
  File "./loader_parallel.py", line 114, in <module>
    pool.map(load_xml, file_list)
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 657, in get
    raise self._value
multiprocess.pool.MaybeEncodingError: Error sending result:
'<multiprocess.pool.ExceptionWithTraceback object at 0x7f87ac0f4470>'.
Reason: 'TypeError("can't pickle pyexpat.xmlparser objects")'
Why can't the pyexpat.xmlparser object be pickled, and how can I fix it? I tried testing for picklability by running dill.copy(XmlHandler()), and that copied without error.
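For reference, here's roughly what that test looks like as a standalone snippet (XmlHandler is the class from the script above; the feed("<page>") call and the try/except around the parser are just my guess at how to poke at the SAX parser directly, not something from the original run):

import dill
import xml.sax

# the handler on its own copies without complaint
dill.copy(XmlHandler())

# my guess at reproducing the error on the parser itself:
wiki_parser = xml.sax.make_parser()
wiki_parser.feed("<page>")   # feeding data creates the underlying pyexpat.xmlparser
try:
    dill.copy(wiki_parser)
    print("parser copied fine")
except TypeError as e:
    print(e)                 # expecting something like: can't pickle pyexpat.xmlparser objects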
I installed pathos via pip3 and am running Python 3.7 on Debian 10. I'm pretty new to this, so any help is appreciated. Thanks in advance!