0

I am sending all the files in a directory to the distributed cache. What I have done so far is:

    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
     import org.apache.hadoop.conf.Configured;
     import org.apache.hadoop.filecache.DistributedCache;
     import org.apache.hadoop.fs.Path;
     import org.apache.hadoop.io.Text;
     import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;

 public class dc_driver extends Configured implements Tool {

     /**
      * Configures and submits a map-only job that attaches dc.zip as a
      * distributed-cache archive, symlinked as "dczip" in each task's
      * working directory.
      *
      * @param args args[0] = HDFS input path, args[1] = HDFS output path
      * @return 0 on success, 1 on job failure, 2 on bad usage
      * @throws Exception if job submission or URI construction fails
      */
     @SuppressWarnings("deprecation")
     @Override
     public int run(String[] args) throws Exception {
         if (args.length < 2) {
             System.err.println("Usage: dc_driver <input path> <output path>");
             return 2;
         }

         // BUG FIX: use the Configuration injected by ToolRunner instead of
         // creating a fresh one. With "new Configuration()" every generic
         // option passed on the command line (-D, -files, -archives, ...)
         // was silently ignored — that defeats the purpose of implementing Tool.
         Configuration conf = getConf();

         DistributedCache.createSymlink(conf);
         // The "#dczip" fragment makes the unpacked archive reachable as
         // ./dczip in every task's working directory.
         URI archiveUri = new URI("/user/cloudera/dc_archive_input/dc.zip#dczip");
         DistributedCache.addCacheArchive(archiveUri, conf);

         Job job = new Job(conf);
         job.setJobName(this.getClass().getName());
         job.setJarByClass(dc_driver.class);
         job.setMapOutputKeyClass(Text.class);
         job.setMapOutputValueClass(Text.class);
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(Text.class);

         job.setMapperClass(dc_mapper.class);
         job.setInputFormatClass(TextInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         job.setNumReduceTasks(0); // map-only job: mapper output is final output

         FileInputFormat.addInputPath(job, new Path(args[0]));
         FileOutputFormat.setOutputPath(job, new Path(args[1]));

         return job.waitForCompletion(true) ? 0 : 1;
     }

     public static void main(String[] args) throws Exception {
         int res = ToolRunner.run(new Configuration(), new dc_driver(), args);
         System.exit(res);
     }
 }




 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Mapper.Context;
 import org.apache.hadoop.util.StringUtils;


 public class dc_mapper extends Mapper<LongWritable, Text, Text,Text> {

 String s1;

 public void setup(Context context) throws IOException,InterruptedException {

 Configuration conf = context.getConfiguration();

 URL resource = conf.getResource("dczip");

 s1 = resource.toString();
  }

 public void map(LongWritable key, Text value,Context context)
 throws IOException, InterruptedException {

 String line = value.toString();

 context.write(new Text(s1), new Text(line));
 }


 }

I got the output as:

file:/mapred/local/taskTracker/cloudera/jobcache/job_201402240544_0011/attempt_201402240544_0011_m_000000_0/work/dczip 10,pooja,bnglr file:/mapred/local/taskTracker/cloudera/jobcache/job_201402240544_0011/attempt_201402240544_0011_m_000000_0/work/dczip 40,rahul,hyb

How do I read the contents of the files inside dc.zip?

Pooja3101
  • 701
  • 3
  • 8
  • 13
  • There is an example of usage here : http://hadoop.apache.org/docs/stable/api/org/apache/hadoop/filecache/DistributedCache.html – Venkat Feb 24 '14 at 22:38
  • ok. I am able to read the files within folders, but when I load the data into a string list, I get output with some special characters — like when you try to display an xls file in Unix. – Pooja3101 Feb 25 '14 at 18:40
  • Did you specify it as : DistributedCache.addCacheArchive()? Looks like you used : DistributedCache.addFileToClassPath(). Verify. – Venkat Feb 25 '14 at 19:05
  • I found now that my file is not getting unarchived locally. What to do? Please help – Pooja3101 Feb 25 '14 at 20:24
  • Archive is corrupt. Try zipping the file again. – Venkat Feb 25 '14 at 20:54
  • tried number of times.. – Pooja3101 Feb 25 '14 at 21:51

0 Answers0