I am very new to using the Hadoop platform and defining MapReduce functions, and I am having a difficult time trying to understand why this mapper is not working in my MapReduce script. I am trying to parse a collection of pages written as a string in a .txt file where each "line" represents <page>...</page>
. What is incorrect about this script? Thank you for the help!
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.compat import jobconf_from_env
import lxml
import mwparserfromhell
import heapq
import re
class MRParser(MRJob):
def mapper(self, _, line):
bigString = ''.join(re.findall(r'(<text xml:space="preserve">.*</text>)',line))
root = etree.fromstring(bigString.decode('utf-8'))
if not(bigString == ''):
bigString = etree.tostring(root,method='text', encoding = "UTF-8")
wikicode = mwparserfromhell.parse(bigString)
bigString = wikicode.strip_code()
yield None, bigString
def steps(self):
return [
MRStep(mapper=self.mapper)
]