0

I am converting a text file to a sequence file using MapReduce, and then back to text. In the output I get a number at the start of each line. How can I remove these numbers, or stop them from appearing in my output?

e.g. Text :

d001    Marketing

d002    Finance

d003    Human Resources

Converted sequence file :

0   d001    Marketing

15  d002    Finance\n

28  d003    Human Resources

Converted text from sequence file

0   d001    Marketing

15  d002    Finance

28  d003    Human Resources

I want the leading values 0, 15, and 28 to be removed.

I am using the following code:

public class FormatConverterTextToSequenceDriver extends Configured implements Tool {

  /**
   * Sets up and runs a job with zero reduce tasks that passes the input through
   * {@code FormatConverterMapper} and writes the result with
   * {@code SequenceFileOutputFormat}. No input format is set explicitly.
   *
   * @param args exactly two entries: the input directory and the output directory
   * @return -1 when the argument count is wrong, 0 when the job completes
   *         successfully, 1 otherwise
   * @throws Exception if configuring or running the job fails
   */
  @Override
  public int run(String[] args) throws Exception {

    final boolean argCountOk = (args.length == 2);
    if (!argCountOk) {
      System.out.printf("Two parameters are required for FormatConverterTextToSequenceDriver-<input dir> <output dir>\n");
      return -1;
    }

    Job conversionJob = new Job(getConf());
    conversionJob.setJobName("Create Sequence File, from text file");
    conversionJob.setJarByClass(FormatConverterTextToSequenceDriver.class);

    // Zero reduce tasks: the mapper is the only processing stage.
    conversionJob.setNumReduceTasks(0);
    conversionJob.setMapperClass(FormatConverterMapper.class);
    conversionJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    final Path inputDir = new Path(args[0]);
    final Path outputDir = new Path(args[1]);
    FileInputFormat.setInputPaths(conversionJob, inputDir);
    FileOutputFormat.setOutputPath(conversionJob, outputDir);

    // Translate the job outcome into an exit status.
    if (conversionJob.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }
 -----------------------------------------------------------------
public class FormatConverterSequenceToTextDriver extends Configured implements Tool {

  /**
   * Sets up and runs a job with zero reduce tasks that reads the input with
   * {@code SequenceFileInputFormat} and passes it through
   * {@code FormatConverterMapper}. No output format is set explicitly.
   *
   * @param args exactly two entries: the input directory and the output directory
   * @return -1 when the argument count is wrong, 0 when the job completes
   *         successfully, 1 otherwise
   * @throws Exception if configuring or running the job fails
   */
  @Override
  public int run(String[] args) throws Exception {

    final boolean argCountOk = (args.length == 2);
    if (!argCountOk) {
      System.out
          .printf("Two parameters need to be supplied - <input dir> and <output dir>\n");
      return -1;
    }

    Job conversionJob = new Job(getConf());
    conversionJob.setJobName("Convert Sequence File and Output as Text");
    conversionJob.setJarByClass(FormatConverterSequenceToTextDriver.class);

    // Zero reduce tasks: the mapper is the only processing stage.
    conversionJob.setNumReduceTasks(0);
    conversionJob.setMapperClass(FormatConverterMapper.class);
    conversionJob.setInputFormatClass(SequenceFileInputFormat.class);

    final Path inputDir = new Path(args[0]);
    final Path outputDir = new Path(args[1]);
    FileInputFormat.setInputPaths(conversionJob, inputDir);
    FileOutputFormat.setOutputPath(conversionJob, outputDir);

    // Translate the job outcome into an exit status.
    if (conversionJob.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }
 -----------------------------------------------------------------
/**
 * Pass-through mapper: every input record is forwarded to the output
 * unchanged, key included.
 */
public class FormatConverterMapper
    extends Mapper<LongWritable, Text, LongWritable, Text> {

  /**
   * Writes the incoming key/value pair to the context exactly as received.
   *
   * @param key     the input record's key, re-emitted as the output key
   * @param value   the input record's value, re-emitted as the output value
   * @param context the task context that receives the output pair
   * @throws IOException          if writing the pair fails
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    context.write(key, value);
  }
}

Any help is appreciated.

2 Answers

0

When you convert back from the sequence file to text, you don't want to emit the long key (the line's byte offset) that was written alongside each value. So just adjust your map method to write only the value:

 /**
  * Emits only the record's value and drops the incoming key: the value is
  * written in the output-key position and {@code null} is passed as the
  * output value.
  *
  * NOTE(review): for this call to type-check, the enclosing Mapper's output
  * key type presumably has to be {@code Text} rather than
  * {@code LongWritable} - confirm against the class declaration.
  */
 @Override
 public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    context.write(value, null);
  }

And the output should just be the value itself.

Thomas Jungblut
  • 20,854
  • 6
  • 68
  • 91
0

Here is working code for reference.

It consists of 4 classes:

SequenceFileGenDriver - Driver code that creates sequence file from text file, takes in 2 arguments, the source text file path and the destination sequence file path

SequenceFileGenMapper - Mapper that converts Text file to Sequence File.

TextFileGenDriver - Driver code that converts sequence file to text file, takes in 2 arguments, input sequence file path and output text file path

TextFileGenMapper - Mapper that converts sequence file to text file

    /**
     * Driver for a job with zero reduce tasks that reads text input and writes
     * a sequence file whose records are {@code Text} keys paired with
     * {@code NullWritable} values.
     */
    public class SequenceFileGenDriver {

            /**
             * Configures and runs the text-to-sequence-file job, then exits the
             * JVM with a status reflecting the outcome.
             *
             * @param args exactly two entries: the source text file path and the
             *             destination sequence file path
             * @throws Exception if configuring or running the job fails
             */
            public static void main(String[] args) throws Exception {
                    // Fail with a usage message instead of an
                    // ArrayIndexOutOfBoundsException when arguments are missing.
                    if (args.length != 2) {
                            System.err.println("Usage: SequenceFileGenDriver <input text path> <output sequence file path>");
                            System.exit(-1);
                    }

                    Configuration conf = new Configuration();
                    Job job = new Job(conf);
                    job.setJarByClass(SequenceFileGenDriver.class);
                    job.setMapperClass(SequenceFileGenMapper.class);
                    job.setNumReduceTasks(0);
                    job.setInputFormatClass(TextInputFormat.class);
                    job.setOutputFormatClass(SequenceFileOutputFormat.class);
                    job.setOutputKeyClass(Text.class);
                    job.setOutputValueClass(NullWritable.class);
                    TextInputFormat.addInputPath(job, new Path(args[0]));
                    SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

                    // Propagate job failure to the caller through the exit status
                    // rather than always exiting with 0.
                    boolean success = job.waitForCompletion(true);
                    System.exit(success ? 0 : 1);
            }

    }

    public class SequenceFileGenMapper extends
                    Mapper<LongWritable, Text, Text, NullWritable> {
            private final static NullWritable nullWritable = NullWritable.get();
            public void map(LongWritable key, Text value, Context context)
                            throws IOException, InterruptedException {
                            context.write(value, nullWritable);
            }
    }


    /**
     * Driver for a job with zero reduce tasks that reads a sequence file and
     * writes it back out as text, with {@code Text} keys and
     * {@code NullWritable} values as the declared output types.
     */
    public class TextFileGenDriver {

            /**
             * Configures and runs the sequence-file-to-text job, then exits the
             * JVM with a status reflecting the outcome.
             *
             * @param args exactly two entries: the input sequence file path and
             *             the output text file path
             * @throws Exception if configuring or running the job fails
             */
            public static void main(String[] args) throws Exception {
                    // Fail with a usage message instead of an
                    // ArrayIndexOutOfBoundsException when arguments are missing.
                    if (args.length != 2) {
                            System.err.println("Usage: TextFileGenDriver <input sequence file path> <output text path>");
                            System.exit(-1);
                    }

                    Configuration conf = new Configuration();
                    Job job = new Job(conf);
                    job.setJarByClass(TextFileGenDriver.class);
                    job.setMapperClass(TextFileGenMapper.class);
                    job.setInputFormatClass(SequenceFileInputFormat.class);
                    job.setOutputFormatClass(TextOutputFormat.class);
                    job.setOutputKeyClass(Text.class);
                    job.setOutputValueClass(NullWritable.class);
                    job.setNumReduceTasks(0);
                    SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
                    TextOutputFormat.setOutputPath(job, new Path(args[1]));

                    // Propagate job failure to the caller through the exit status
                    // rather than always exiting with 0.
                    boolean success = job.waitForCompletion(true);
                    System.exit(success ? 0 : 1);
            }

    }

public class TextFileGenMapper extends
             Mapper<Text, NullWritable, Text, NullWritable> {
        private final static NullWritable nullWritable = NullWritable.get();
            public void map(Text key, NullWritable value, Context context)
                           throws IOException, InterruptedException {
                       context.write(key, nullWritable);
        }
}
Arun A K
  • 2,205
  • 2
  • 27
  • 45