How could you optimise this MapReduce job (mrjob)?
I am using this script now; any idea how to optimise it? I am using a lookahead to search for ur=www.domain.de and then mapping and counting the r2 occurrences.
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
# Pattern applied to each input line by the mapper.
#
# * The leading lookahead only lets a match start where, somewhere ahead on the
#   line, there is a `ur=www.domain.com` parameter that is preceded by the
#   start of the line or `&` and followed by `&` or the end of the line.
# * The remainder lazily skips to the first `r2=` parameter (again preceded by
#   the start of the line or `&`) and captures its value up to the next `&`.
#
# There is exactly one capture group, so `findall` returns a list of the
# captured `r2` values as plain strings.
#
# NOTE(review): `findall` resumes scanning after each match and the lookahead
# is re-checked from that new position, so a further `r2` value on the same
# line is only reported if the `ur=` parameter still lies ahead of it --
# confirm this is the intended behaviour.
# NOTE(review): on lines without the `ur=` parameter the lookahead is retried
# from every start position, which looks quadratic in the line length --
# profile before relying on it for very long lines.
LOOK_AHEAD = re.compile(r"(?=.*?(?:^|&)ur=www\.domain\.com(?:&|$)).*?(?:^|&)r2=([^&]+)")
class MRReferralAnalysis(MRJob):
    """Count how often each ``r2`` value captured by ``LOOK_AHEAD`` occurs.

    The job is a single map/combine/reduce step: the mapper emits
    ``(r2_value, 1)`` for every capture on a line, the combiner pre-sums those
    pairs, and the reducer emits ``(total, r2_value)``.
    """

    def mapper(self, _, line):
        """Yield ``(r2_value, 1)`` for every value ``LOOK_AHEAD`` captures.

        Args:
            _: Input key, ignored.
            line: One line of input text.

        Nothing is printed here: when the job runs as a Hadoop Streaming task
        the process's stdout carries the emitted records, so a debugging
        ``print`` would mix stray lines into that stream and add per-record
        I/O on top.
        """
        for r2_value in LOOK_AHEAD.findall(line):
            yield r2_value, 1

    def combiner(self, itemOfInterest, counts):
        """Pre-sum the counts for one key before they are shuffled.

        Emits ``(key, partial_sum)`` in the same orientation as the mapper, so
        the reducer can add partial sums exactly as it adds the mapper's
        ones. Summation is associative, so the final totals are unchanged
        however often (or whether) the combiner runs.
        """
        yield itemOfInterest, sum(counts)

    def reducer(self, itemOfInterest, counts):
        """Yield ``(total, itemOfInterest)`` for one key.

        Args:
            itemOfInterest: The ``r2`` value being counted.
            counts: Iterable of (partial) counts for that value.
        """
        yield sum(counts), itemOfInterest

    def steps(self):
        """Return the job's single map/combine/reduce step."""
        return [
            MRStep(
                mapper=self.mapper,
                combiner=self.combiner,
                reducer=self.reducer,
            )
        ]
# Script entry point: start the job via the class's run() only when this file
# is executed directly, not when it is imported as a module.
if __name__ == '__main__':
    MRReferralAnalysis.run()