
I'm trying to read the multistream Wikipedia dump into a PostgreSQL database. This is my attempt at loading the smaller chunks in parallel. Here's the script:

#!/usr/bin/python3
import xml.sax
from bz2 import BZ2File
import mwparserfromhell
import psycopg2
import pathos
import os
import dill


class XmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def get_pages(self):
        return self._pages

    def get_page_count(self):
        return len(self._pages)

    def get_values(self):
        return self._values

    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text', 'infobox'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self.process_article()

    def process_article(self):
        wikicode = mwparserfromhell.parse(self._values['text'])
        infobox_array = wikicode.filter_templates(matches="infobox .*")
        infobox = str(infobox_array[0]) if len(infobox_array) > 0 else ""
        self._pages.append((self._values['title'], self._values['text'], infobox))


def load_xml(filename):
    wiki_handler = XmlHandler()
    wiki_parser = xml.sax.make_parser()
    wiki_parser.setContentHandler(wiki_handler)

    file = os.path.join("chunks", filename)
    print("I'm a worker process")
    cursor = conn.cursor()

    with BZ2File(file, 'r') as f:
        for line in f:
            wiki_parser.feed(line)

        pages = wiki_handler.get_pages()

    for page in pages:
        cursor.execute("INSERT INTO pages (title, text, infobox) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", page)

    cursor.close()
    print("all done")


if __name__ == "__main__":
    conn = psycopg2.connect(dbname="wikipedia",
                            user="postgres",
                            password="postgres",
                            host="localhost",
                            port=5432)

    file_list = [f for f in os.listdir("chunks") if os.path.isfile(os.path.join("chunks", f))]
    pool = pathos.multiprocessing.ProcessingPool(processes=pathos.multiprocessing.cpu_count())
    pool.map(load_xml, file_list)

And the traceback:

Traceback (most recent call last):
  File "./loader_parallel.py", line 114, in <module>
    pool.map(load_xml, file_list)
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 657, in get
    raise self._value
multiprocess.pool.MaybeEncodingError: Error sending result: 
'<multiprocess.pool.ExceptionWithTraceback object at 0x7f87ac0f4470>'. 
Reason: 'TypeError("can't pickle pyexpat.xmlparser objects")'

Why can't the pyexpat.xmlparser object be pickled, and how can I fix it? I tried testing whether the handler itself was the problem by running dill.copy(XmlHandler()), and that completed without error.
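Here is roughly what that check looks like; the second half is only my guess at reproducing the failure on the parser object itself, not something from the actual run:

import dill
import xml.sax

handler = XmlHandler()
dill.copy(handler)                  # a fresh handler pickles fine, as described above

parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.feed("<page>")               # feeding creates the underlying pyexpat.xmlparser
dill.copy(parser)                   # presumably raises the same
                                    # TypeError: can't pickle pyexpat.xmlparser objects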

I installed pathos via pip3 and I'm running Python 3.7 on Debian 10. I'm pretty new to this, so any help is appreciated. Thanks in advance!

local_oaf