I would like to merge (concatenate) strings using Hadoop MapReduce: the mapper groups the words by key, and the reducer concatenates the values that share a common key.
Below is my code for the MapReduce job.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class mr2 {

    // Mapper class
    public static class TokenizerMapper extends Mapper<Text, Text, Text, Text> {

        private Text word = new Text();         // key
        private Text value_of_key = new Text(); // value

        public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String IndexAndCategory = "";
            String value_of_the_key = "";
            StringTokenizer itr = new StringTokenizer(line);
            // key creation
            IndexAndCategory += itr.nextToken() + " ";
            IndexAndCategory += itr.nextToken() + " ";
            // value creation
            value_of_the_key += itr.nextToken() + ":";
            value_of_the_key += itr.nextToken() + " ";
            // key and value
            word.set(IndexAndCategory);
            value_of_key.set(value_of_the_key);
            // write key-value pair
            context.write(word, value_of_key);
        }
    }

    // Reducer class
    public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

        private Text values_of_key = new Text();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String values_ = "";
            for (Text val : values) {
                values_ += val.toString();
            }
            values_of_key.set(values_);
            context.write(key, values_of_key);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "mr2");
        job.setJarByClass(mr2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The input to the mapper is in the following format:
1 A this 2
1 A the 1
3 B is 1
The mapper processes this into the format below and passes it to the reducer:
1 A this:2
1 A the:1
3 B is:1
The reducer then combines that input into the format below:
1 A this:2 the:1
3 B is:1
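The intended transformation in the map step can be sketched in plain Java, independent of Hadoop. This is only a minimal sketch of the tokenizing logic described above (the class and method names are illustrative; the actual mapper also appends trailing separators):

```java
import java.util.StringTokenizer;

public class MapLogicSketch {

    // Mimics the map() body: the first two tokens form the key,
    // the last two tokens form the value, joined by ":".
    static String[] toKeyValue(String line) {
        StringTokenizer itr = new StringTokenizer(line);
        String indexAndCategory = itr.nextToken() + " " + itr.nextToken(); // e.g. "1 A"
        String valueOfKey = itr.nextToken() + ":" + itr.nextToken();       // e.g. "this:2"
        return new String[] { indexAndCategory, valueOfKey };
    }

    public static void main(String[] args) {
        String[] input = { "1 A this 2", "1 A the 1", "3 B is 1" };
        for (String line : input) {
            String[] kv = toKeyValue(line);
            System.out.println(kv[0] + "\t" + kv[1]);
        }
    }
}
```

The MapReduce shuffle then groups the emitted pairs by key, so both "this:2" and "the:1" arrive at the reducer under the key "1 A".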
I used the WordCount example as a basic template and modified it to process Text (String) values, but when I execute the code above I get the following error.
Error: java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.io.Text
at mr2$TokenizerMapper.map(mr2.java:17)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:784)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1614)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)
The framework appears to be supplying LongWritable keys where my mapper expects Text. Any help solving this issue is appreciated.