I'm writing a code that reads two input files and calculates some statistics like average rating by country. I'm using mrjob library, because the idea is that I'm able to run this on hadoop. Below are samples from those input files.
Users file:
"1";"nyc, new york, usa";NULL
"2";"stockton, california, usa";"18"
"3";"moscow, yukon territory, russia";NULL
"4";"porto, v.n.gaia, portugal";"17"
"5";"farnborough, hants, united kingdom";NULL
Ratings file:
"276729";"0521795028";"6"
"276733";"2080674722";"0"
"276736";"3257224281";"8"
"276737";"0600570967";"6"
"276744";"038550120X";"7"
The first column in the both files is user ID, so I need to use the user IDs to combine the data somehow.
Below is the code that I have written
#!/usr/bin/python3
from mrjob.job import MRJob
from mrjob.protocol import TextValueProtocol
import os
class BookRatings(MRJob):
INPUT_PROTOCOL = TextValueProtocol
OUTPUT_PROTOCOL = TextValueProtocol
# This mapper function yields key value pairs
# user_id, country or user_id, rating
def mapper(self, _, line):
if os.getenv('mapreduce_map_input_file') == "file://Users.csv":
# I'm using try because we might find some lines that are
# badly formed, which causes IndexError
try:
# If we are reading the User file, we separate user_id and country
csv_separation = line.split(";")
user_id = csv_separation[0].strip('"')
country = csv_separation[1].split(",")[2].strip().replace('"', '')
yield (user_id, country)
except IndexError:
pass
else:
try:
csv_separation = line.split(";")
user_id = csv_separation[0].strip('"')
rating = csv_separation[2].strip('"')
yield (user_id, rating)
except IndexError:
pass
def combiner(self, user_id, value):
# I could only come up with this method to build a dictionary
# which I could use in reducer to calculate statistics
user_country_mappings = {}
user_rating_mappings = {}
for v in value:
# So i'm checking if value is a number, because I want to create
# two dictionaries, one in which there's a country under user_id key
# and another in which there's a list of ratings under user_id key
if v.isdigit():
try:
user_rating_mappings[user_id.strip()].append(v)
except KeyError:
user_rating_mappings[user_id.strip()] = [v]
else:
user_country_mappings[user_id.strip()] = v
# last thing in combiner I want to have only one dictionary, which has
# country names as keys, and each key's value is a list which contains
# all ratings given by a user from that country
ratings_by_country = {}
# So, I loop over dictionary that contains user_id, ratings value pairs
for k,v in user_rating_mappings.items():
# I want every individual value from rating list
for rating in v:
# I add each rating to my new dictionary, and I get the country name
# from user_country_mappings dictionary, using the same user_id as key
try:
ratings_by_country[user_country_mappings[k]].append(v)
except KeyError:
ratings_by_country[user_country_mappings[k]] = [v]
# Then finally I yield this ratings_by_country dictionary
yield (None, ratings_by_country)
# Reducer should yield the final result
def reducer(self, _, mappings):
# Now at this point though I'm just trying to yield
# every key value, just to test that everything has worked so far
for value_map in mappings:
for k,v in value_map.items():
yield(None, k)
if __name__ == "__main__":
BookRatings.run()
So, the code gives the following error.
KeyError Traceback (most recent call last)
~/ratings.py in combiner(self, user_id, value)
62 try:
---> 63 ratings_by_country[user_country_mappings[k]].append(v)
64 except KeyError:
KeyError: '10'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~/ratings.py in <module>
143
144 if __name__ == "__main__":
--> 145 BookRatings.run()
/usr/local/lib/python3.6/dist-packages/mrjob/job.py in run(cls)
444 # load options from the command line
445 mr_job = cls(args=_READ_ARGS_FROM_SYS_ARGV)
--> 446 mr_job.execute()
447
448 def execute(self):
/usr/local/lib/python3.6/dist-packages/mrjob/job.py in execute(self)
471
472 else:
--> 473 super(MRJob, self).execute()
474
475 def make_runner(self):
/usr/local/lib/python3.6/dist-packages/mrjob/launch.py in execute(self)
200 def execute(self):
201 # Launcher only runs jobs, doesn't do any Hadoop Streaming stuff
--> 202 self.run_job()
203
204 def make_runner(self):
/usr/local/lib/python3.6/dist-packages/mrjob/launch.py in run_job(self)
245 with self.make_runner() as runner:
246 try:
--> 247 runner.run()
248 except StepFailedException as e:
249 # no need for a runner stacktrace if step failed; runners will
/usr/local/lib/python3.6/dist-packages/mrjob/runner.py in run(self)
515 self._add_input_files_for_upload()
516 self._create_input_manifest_if_needed()
--> 517 self._run()
518 self._ran_job = True
519
/usr/local/lib/python3.6/dist-packages/mrjob/sim.py in _run(self)
158 self._counters.append({})
159
--> 160 self._run_step(step, step_num)
161
162 def _run_step(self, step, step_num):
/usr/local/lib/python3.6/dist-packages/mrjob/sim.py in _run_step(self, step, step_num)
167 self._run_step_on_spark(step, step_num)
168 else:
--> 169 self._run_streaming_step(step, step_num)
170
171 def _run_streaming_step(self, step, step_num):
/usr/local/lib/python3.6/dist-packages/mrjob/sim.py in _run_streaming_step(self, step, step_num)
178 self._input_paths_for_step(step_num), step_num)
179
--> 180 self._run_mappers_and_combiners(step_num, map_splits)
181
182 if 'reducer' in step:
/usr/local/lib/python3.6/dist-packages/mrjob/sim.py in _run_mappers_and_combiners(self, step_num, map_splits)
219 self._run_mapper_and_combiner_func(
220 step_num, task_num, map_split)
--> 221 for task_num, map_split in enumerate(map_splits)
222 )
223 finally:
/usr/local/lib/python3.6/dist-packages/mrjob/sim.py in _run_multiple(self, funcs, num_processes)
127 """
128 for func in funcs:
--> 129 func()
130
131 def _log_cause_of_error(self, ex):
/usr/local/lib/python3.6/dist-packages/mrjob/sim.py in _run_mapper_and_combiner(run_mapper, sort_input, run_combiner, mapper_input_path, mapper_output_path, combiner_input_path)
727 if run_combiner:
728 sort_input([mapper_output_path], combiner_input_path)
--> 729 run_combiner()
730
731
/usr/local/lib/python3.6/dist-packages/mrjob/sim.py in _run_task(invoke_task, task_type, step_num, task_num, input_path, output_path, stderr_path, wd, env)
744
745 invoke_task(
--> 746 stdin, stdout, stderr, wd, env)
747
748
/usr/local/lib/python3.6/dist-packages/mrjob/inline.py in invoke_task(stdin, stdout, stderr, wd, env)
130
131 task = self._mrjob_cls(args)
--> 132 task.execute()
133 except:
134 # so users can figure out where the exception came from;
/usr/local/lib/python3.6/dist-packages/mrjob/job.py in execute(self)
462
463 elif self.options.run_combiner:
--> 464 self.run_combiner(self.options.step_num)
465
466 elif self.options.run_reducer:
/usr/local/lib/python3.6/dist-packages/mrjob/job.py in run_combiner(self, step_num)
555 read_lines, write_line = self._wrap_protocols(step_num, 'combiner')
556
--> 557 for k, v in self.combine_pairs(read_lines(), step_num=step_num):
558 write_line(k, v)
559
/usr/local/lib/python3.6/dist-packages/mrjob/job.py in combine_pairs(self, pairs, step_num)
625 .. versionadded:: 0.6.7
626 """
--> 627 for k, v in self._combine_or_reduce_pairs(pairs, 'combiner', step_num):
628 yield k, v
629
/usr/local/lib/python3.6/dist-packages/mrjob/job.py in _combine_or_reduce_pairs(self, pairs, mrc, step_num)
664 for key, pairs_for_key in itertools.groupby(pairs, lambda k_v: k_v[0]):
665 values = (value for _, value in pairs_for_key)
--> 666 for k, v in task(key, values) or ():
667 yield k, v
668
~/ratings.py in combiner(self, user_id, value)
63 ratings_by_country[user_country_mappings[k]].append(v)
64 except KeyError:
---> 65 ratings_by_country[user_country_mappings[k]] = [v]
66
67
KeyError: '10'
So, I studied the given error and figured out what triggers the exception. I tried what happens if I put that problem part in another try except block, and after that the code finished, but the dictionary was empty.
Then I did check what user_country_mappings and user_rating_mappings dictionaries looked like, and they looked exactly what I expected them to be. They both had user_ids as keys, and one dictionary had country name as a value and the other had a list of ratings. At this point I verified that they have common keys, for example that '10', and they both did have it. But somehow this combiner function raises KeyError, claiming that user_rating_mappings key is not in user_country_mappings. From the very beginning, I think I'm handling the values identically, so I can't see why KeyError is raised. I tried similar structure in Python interpreter on command line, and everything worked just fine.
At this point I'm clueless why the code behaves like it does. Any help is appreciated, thanks!