0

How could you optimise this MapReduce job (mrjob)?

I am using this script now; any idea how to optimise it? I am using a lookahead to search for ur=www.domain.de and then mapping and counting the r2 occurrences.

from mrjob.job import MRJob
from mrjob.step import MRStep

import re

# Pattern used by the mapper to pull `r2` values out of a query-string-like line.
#   (?=...)  lookahead: a `ur=www.domain.com` parameter (preceded by start of
#            string or `&`, followed by `&` or end of string) must occur at or
#            after the current position;
#   then a lazy scan to an `r2=` parameter (preceded by start of string or `&`),
#   capturing its value up to the next `&` as group 1.
# Compiled once at import time so it is not rebuilt per input line.
LOOK_AHEAD = re.compile(r"(?=.*?(?:^|&)ur=www\.domain\.com(?:&|$)).*?(?:^|&)r2=([^&]+)")

class MRReferralAnalysis(MRJob):
    """Count occurrences of each ``r2`` value captured by ``LOOK_AHEAD``.

    Single-step job: the mapper emits ``(r2_value, 1)`` per regex match, the
    combiner pre-aggregates those counts, and the reducer emits
    ``(total, r2_value)``.
    """

    def mapper(self, _, line):
        """Yield ``(r2_value, 1)`` for every ``LOOK_AHEAD`` match in *line*.

        The input key is ignored.
        """
        # Deliberately no print() here: a task's stdout is the stream that
        # carries emitted records under Hadoop Streaming, so debug output on
        # stdout would be interleaved with job output (and costs I/O per
        # record). Use stderr or a counter if diagnostics are needed.
        for group in LOOK_AHEAD.findall(line):
            yield (group, 1)

    def combiner(self, itemOfInterest, counts):
        """Pre-sum the counts for one ``r2`` value before the shuffle.

        Must keep the mapper's ``(key, count)`` shape so the reducer can
        consume either raw mapper output or combined output.
        """
        yield (itemOfInterest, sum(counts))

    def reducer(self, itemOfInterest, counts):
        """Yield ``(total_count, r2_value)`` for one ``r2`` value."""
        yield (sum(counts), itemOfInterest)

    def steps(self):
        """Return the single map/combine/reduce step of this job."""
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer=self.reducer)
        ]

# Script entry point: only start the job when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    MRReferralAnalysis.run()
Stephan Kristyn
  • 15,015
  • 14
  • 88
  • 147

0 Answers0