I am new to MapReduce and I have a very simple question. I solved the WordCount problem, and now I want to change it to find the top N records in a text. Although I can sort all the words in the text, I cannot take the last N values. First, I read the text and send each word to the reducer with a count of 1; the reducer then computes the number of occurrences of each distinct word. Then I tried to sort these words by their number of occurrences, but I cannot find the top N records.
import heapq

from mrjob.job import MRJob
from mrjob.step import MRStep
from stemming.porter2 import stem
class MRWordCount(MRJob):
    """Two-step job that reports the ``TOP_N`` most frequent word stems.

    Step 1 counts how often each stemmed, lower-cased word occurs.
    Step 2 gathers every (count, word) pair under one shared key and keeps
    only the ``TOP_N`` pairs with the highest counts.
    """

    # Number of most frequent words to emit; override in a subclass or edit
    # here to change N.
    TOP_N = 10

    def steps(self):
        """Return the two MapReduce steps: word counting, then top-N selection."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(mapper=self.secondmapper,
                   reducer=self.secondreducer),
        ]

    def mapper(self, _, lines):
        """Yield ``(stem, 1)`` for every whitespace-separated word in the line."""
        for word in lines.strip().split():
            yield stem(word.lower()), 1

    def reducer(self, key, values):
        """Yield ``(word, total)`` where total is the sum of the word's counts."""
        yield key, sum(values)

    def secondmapper(self, key, value):
        """Re-key each ``(word, count)`` as ``(None, (count, word))``.

        Using the single key ``None`` routes every pair to the same reducer,
        which is what allows a global top-N to be computed. The count is kept
        as an integer: the earlier zero-padded 4-digit string key ordered
        counts of 10000 or more incorrectly.
        """
        yield None, (int(value), key)

    def secondreducer(self, key, values):
        """Yield ``(word, count)`` for the ``TOP_N`` highest counts, largest first.

        ``values`` is an iterable of (count, word) pairs; they are compared by
        count first and by word second, so ties are broken by word in
        descending order. ``heapq.nlargest`` consumes the stream while
        retaining only ``TOP_N`` items, so the full word list is never held
        in memory.
        """
        for count, word in heapq.nlargest(self.TOP_N, values):
            yield word, count
# Launch the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    MRWordCount.run()