This python3 program attempts to produce a frequency list of words from a text file using map/reduce. I would like to know how to order the word counts, represented as 'count' in the second reducer's yield statement so that the largest count values appear last. Currently, the tail of the results look like this:
"0002" "wouldn"
"0002" "wrap"
"0002" "x"
"0002" "xxx"
"0002" "young"
"0002" "zone"
For context, I pass any word text file into the python3 program like this:
python MapReduceWordFreqCounter.py book.txt
Here is the code for MapReduceWordFreqCounter.py
:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
# ignore whitespace characters
WORD_REGEXP = re.compile(r"[\w']+")
class MapReduceWordFreqCounter(MRJob):
def steps(self):
return [
MRStep(mapper=self.mapper_get_words,
reducer=self.reducer_count_words),
MRStep(mapper=self.mapper_make_counts_key,
reducer = self.reducer_output_words)
]
def mapper_get_words(self, _, line):
words = WORD_REGEXP.findall(line)
for word in words:
yield word.lower(), 1
def reducer_count_words(self, word, values):
yield word, sum(values)
def mapper_make_counts_key(self, word, count):
yield str(count).rjust(4,'0'), word
def reducer_output_words(self, count, words):
for word in words:
yield count, word
if __name__ == '__main__':
MapReduceWordFreqCounter.run()