1

I am new to Hadoop. I have this working piece of MapReduce code which finds the "parts of a country with the most 'Data Engineer' jobs for each year" (for example, if the data, in the format (Year,Region,Count(Jobs)), is "2016,'XYZ',35", "2016,'ABC',25" and "2015,'sdf',14", the answer would be "2016,'XYZ',35" and "2015,'sdf',14"), but I am unable to understand the following part of the reducer:

    if (Top5DataEngineer.size() > 1)
            Top5DataEngineer.remove(Top5DataEngineer.firstKey());
    }//Ignore this bracket for the time being.

    /**
     * Writes every value still held in {@code Top5DataEngineer}, walking the
     * map from its highest key to its lowest, with a {@link NullWritable}
     * output key.
     */
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        final NullWritable noKey = NullWritable.get();
        for (Text line : Top5DataEngineer.descendingMap().values()) {
            context.write(noKey, line);
        }
    }

This is the full code:-

    import java.io.IOException;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.Partitioner;
    import java.util.TreeMap;
    import org.apache.hadoop.mapreduce.Reducer;

    public class Q_002a {
     public static class Q_002a_Mapper extends
        Mapper<LongWritable, Text, Text, LongWritable> {
    LongWritable one = new LongWritable(1);

    public void map(LongWritable key, Text values, Context context)
            throws IOException, InterruptedException {
        try {
            if (key.get() > 0)

            {

                String[] token = values.toString().split("\t");
                if (token[4].equals("DATA ENGINEER")) {
                    Text answer = new Text(token[8] + "\t" + token[7]);
                    context.write(answer, one);
                }
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            System.out.println(e.getMessage());
        } catch (ArithmeticException e1) {
            System.out.println(e1.getMessage());

        }

    }

}

/**
 * Routes each map output key to a partition chosen by the year stored after
 * the tab in the key: "2011" through "2016" map to partitions 0 through 5 and
 * every other value maps to partition 6.
 *
 * <p>{@code numReduceTasks} is not consulted, so the job must be configured
 * with at least 7 reduce tasks for the catch-all partition 6 to be valid.
 */
public static class Q_002a_Partitioner extends Partitioner<Text, LongWritable> {
    /** Years with a dedicated partition; the array index is the partition number. */
    private static final String[] KNOWN_YEARS = {
        "2011", "2012", "2013", "2014", "2015", "2016"
    };

    @Override
    public int getPartition(Text key, LongWritable value, int numReduceTasks) {
        String year = key.toString().split("\t")[1];
        for (int slot = 0; slot < KNOWN_YEARS.length; slot++) {
            if (year.equals(KNOWN_YEARS[slot])) {
                return slot;
            }
        }
        // Any year without a dedicated slot goes to the bucket after the last one.
        return KNOWN_YEARS.length;
    }
}

public static class Q_002a_Reducer extends
        Reducer<Text, LongWritable, NullWritable, Text> {
    private TreeMap<LongWritable, Text> Top5DataEngineer = new TreeMap<LongWritable, Text>();
    long sum = 0;

    public void reduce(Text key, Iterable<LongWritable> values,
            Context context) throws IOException, InterruptedException {
        sum = 0;
        for (LongWritable val : values) {
            sum += val.get();
        }
        Top5DataEngineer.put(new LongWritable(sum), new Text(key + ","
                + sum));
        if (Top5DataEngineer.size() > 1)
            Top5DataEngineer.remove(Top5DataEngineer.firstKey());
    }

    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        for (Text t : Top5DataEngineer.descendingMap().values())
            context.write(NullWritable.get(), t);
    }
}

/**
 * Configures and runs the job, then exits with status 0 on success and 1 on
 * failure.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 */
public static void main(String args[]) throws IOException,
        InterruptedException, ClassNotFoundException {
    if (args.length < 2) {
        System.err.println("Usage: Q_002a <input path> <output path>");
        System.exit(2);
    }

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Top  5 Data Engineer in a worksite");

    job.setJarByClass(Q_002a.class);
    job.setMapperClass(Q_002a_Mapper.class);
    job.setPartitionerClass(Q_002a_Partitioner.class);
    job.setReducerClass(Q_002a_Reducer.class);

    // Q_002a_Partitioner returns indices 0-5 for 2011-2016 and 6 for any
    // other year, so 7 reduce tasks are needed; with only 6, a row from
    // another year fails the job with an illegal partition index.
    job.setNumReduceTasks(7);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

This is the output I am getting:-

enter image description here

EDIT:- I tried running the code inside the cleanup() method in the reduce() method, but it was not working as expected. It only ran fine when it was in the cleanup() method. Any help regarding this would be appreciated.

Anand Raina
  • 17
  • 1
  • 8

1 Answer

4

The cleanup() method is called when the processing stage is completed, and it is called only once.

In your example, the reduce() method is "searching" for the biggest sum of Data Engineer jobs by city within a 'years' partition. The Top5DataEngineer TreeMap stores its keys in sorted (ascending) order, and on each reduce() call it simply deletes the first (smallest) key if the map holds more than one key. In other words, once every key's Iterable<LongWritable> values have been processed, you are left with the city with the biggest number of jobs in each 'years' partition.

When the reduce phase is finished, the cleanup() method simply writes the result of the processed partition (the single, biggest key-value pair left in the Top5DataEngineer map). The cleanup() method is called once for every 'years' partition.

Hope it will help you.

S. O.
  • 179
  • 5