0

I am new to MapReduce and I have a very simple question. I solved the WordCount problem, and now I want to change it into a "top N records" problem on a text. Although I can sort all the words in the text, I cannot take the last N values. First, I read the text and send each word to the reducer with a count of 1; the reducer then computes the number of occurrences of each distinct word. Then I tried to sort these words by their number of occurrences, but I cannot extract the top N records.

from mrjob.job import MRJob
from mrjob.step import MRStep
from stemming.porter2 import stem
class MRWordCount(MRJob):
    """Two-step mrjob job: count stemmed words, then re-key the counts.

    Step 1 produces ``(word, occurrences)`` pairs. Step 2 re-keys every pair
    by a zero-padded occurrence count and emits ``(word, padded_count)``.
    Every word is emitted; nothing here truncates the output to a top N.
    """

    def steps(self):
        """Return the two MapReduce steps in the order they are run."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(mapper=self.secondmapper,
                   reducer=self.secondreducer),
        ]

    def mapper(self, _, lines):
        """Yield ``(stemmed lowercase word, 1)`` for each whitespace-separated token.

        The input key is ignored; ``lines`` is the text of one input record.
        """
        words = lines.strip().split()
        for w in words:
            yield stem(w.lower()), 1

    def reducer(self, key, values):
        """Yield ``(word, total)`` where ``total`` is the sum of the word's counts."""
        yield key, sum(values)

    def secondmapper(self, key, value):
        """Swap the pair so the count becomes the key: ``(padded_count, word)``.

        The count is rendered as a zero-padded 4-digit string, so comparing
        the keys as strings agrees with comparing them as numbers only while
        counts stay below 10000.
        """
        yield '%04d' % int(value), key

    def secondreducer(self, key, values):
        """Yield ``(word, padded_count)`` for every word sharing this count."""
        for v in values:
            yield v, key

# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    MRWordCount.run()
John Vandenberg
  • 474
  • 6
  • 16
ugur
  • 400
  • 6
  • 20

1 Answer

0

I solved the problem using the following code:

from mrjob.job import MRJob
from mrjob.step import MRStep
from stemming.porter2 import stem


class MRWordCount(MRJob):
    """Two-step mrjob job that emits the ``TOP_N`` most frequent stemmed words.

    Step 1 counts the occurrences of every stemmed, lowercased word and sends
    all ``(padded_count, word)`` pairs to the single key ``None``. Step 2
    receives all of those pairs in one reducer call and keeps only the
    ``TOP_N`` pairs with the highest counts.
    """

    # How many of the most frequent words the final step emits.
    TOP_N = 5

    def steps(self):
        """Return the two MapReduce steps in the order they are run."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(reducer=self.secondreducer),
        ]

    def mapper(self, _, lines):
        """Yield ``(stemmed lowercase word, 1)`` for each whitespace-separated token.

        The input key is ignored; ``lines`` is the text of one input record.
        """
        for w in lines.strip().split():
            # The ``unicode`` builtin exists only on Python 2. Decoding only
            # when the token is a byte string gives the same result there
            # (undecodable bytes are dropped) and avoids a NameError on
            # Python 3, where a text token is passed through unchanged.
            if isinstance(w, bytes):
                w = w.decode("utf-8", errors="ignore")
            yield stem(w.lower()), 1

    def reducer(self, key, values):
        """Yield ``(None, (padded_count, word))`` so all words share one key.

        The count is rendered as a zero-padded string of at least 4 digits.
        """
        yield None, ('%04d' % int(sum(values)), key)

    def secondreducer(self, key, values):
        """Yield the ``TOP_N`` most frequent ``(padded_count, word)`` pairs.

        Pairs are emitted in ascending order of count, with ties broken by
        word. When fewer than ``TOP_N`` pairs exist, all of them are emitted
        exactly once.
        """
        # Function-scope import keeps this block self-contained.
        import heapq

        # Rank by the numeric count, not the padded string, so counts of
        # 10000 or more still order correctly. nlargest does not depend on
        # the order in which the values arrive and never indexes past the
        # start of the data the way range(count - 5, count) could.
        top = heapq.nlargest(
            self.TOP_N,
            values,
            key=lambda pair: (int(pair[0]), pair[1]),
        )
        # nlargest returns largest-first; reverse to emit in ascending order.
        for count, word in reversed(top):
            yield count, word


# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    MRWordCount.run()
ugur
  • 400
  • 6
  • 20