My MapReduce job has to read records from HBase and write them into zip files. Our client has specifically asked that the reducer output files be .zip files only.
For this I have written a ZipFileOutputFormat wrapper to compress the records and write them into zip files.
We also can't buffer all the lines in memory and then iterate over them, because some files contain 19 GB of records and that would throw a java.lang.OutOfMemoryError.
Everything seems to work, but there is one problem: a separate entry is getting created inside the .zip file for each key. Inside my output file I can see many files, one per row key, and I don't know how to combine them into a single file inside the zip (see the sketch after my implementation below).
Here is my implementation of ZipFileOutputFormat.java:
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ZipFileOutputFormat<K, V> extends FileOutputFormat<K, V> {
    public static class ZipRecordWriter<K, V> extends RecordWriter<K, V> {

        private final ZipOutputStream zipOut;

        public ZipRecordWriter(FSDataOutputStream fileOut) {
            zipOut = new ZipOutputStream(fileOut);
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            zipOut.closeEntry();
            zipOut.finish();
            // close() flushes the underlying stream; flushing after close() would throw.
            zipOut.close();
        }

        @Override
        public void write(K key, V value) throws IOException {
            String fname;
            if (key instanceof BytesWritable) {
                BytesWritable bk = (BytesWritable) key;
                fname = new String(bk.getBytes(), 0, bk.getLength(), StandardCharsets.UTF_8);
            } else {
                fname = key.toString();
            }
            // putNextEntry() closes any open entry, so every key becomes
            // its own entry (file) inside the zip archive.
            zipOut.putNextEntry(new ZipEntry(fname));
            if (value instanceof BytesWritable) {
                BytesWritable bv = (BytesWritable) value;
                zipOut.write(bv.getBytes(), 0, bv.getLength());
            } else {
                zipOut.write(value.toString().getBytes(StandardCharsets.UTF_8));
            }
        }
    }
    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        // One zip file per task attempt, e.g. part-r-00000.zip
        Path file = getDefaultWorkFile(job, ".zip");
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file);
        return new ZipRecordWriter<K, V>(fileOut);
    }
}
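What I think I need instead is something like the following untested sketch: a writer that could replace ZipRecordWriter in the same file, opens a single entry once per reducer, and streams every record into it, so nothing is ever buffered in memory. The entry name records.txt is just a placeholder I made up:

    public static class SingleEntryZipRecordWriter<K, V> extends RecordWriter<K, V> {

        private final ZipOutputStream zipOut;

        public SingleEntryZipRecordWriter(FSDataOutputStream fileOut) throws IOException {
            zipOut = new ZipOutputStream(fileOut);
            // Open the single entry up front; every write() appends to it,
            // so records are streamed straight out and never held in memory.
            zipOut.putNextEntry(new ZipEntry("records.txt"));
        }

        @Override
        public void write(K key, V value) throws IOException {
            if (value instanceof BytesWritable) {
                BytesWritable bv = (BytesWritable) value;
                zipOut.write(bv.getBytes(), 0, bv.getLength());
            } else {
                zipOut.write(value.toString().getBytes(StandardCharsets.UTF_8));
            }
            zipOut.write('\n'); // one record per line inside the single entry
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            zipOut.closeEntry();
            zipOut.finish();
            zipOut.close();
        }
    }

For completeness, this is how the format is wired into the driver (the job name and output path here are made up):

    Job job = Job.getInstance(conf, "hbase-to-zip");
    job.setOutputFormatClass(ZipFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/zip-output"));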