I am storing a small amount of data (a few MB) in the distributed cache and using it to perform an anti join against two big files. With only a few lines of data in the cache, the functionality works fine, but when the cache holds more data in production the job does not produce the expected result, and it does not throw any error either. Only a fraction of the records (around 20%) are joined; the rest are simply ignored. Is there an upper limit on the number of records that can be stored in the distributed cache? Why does it work for some of the records and ignore the rest? Any suggestion would be extremely helpful. Below is my code:
public class MyMapper extends Mapper<LongWritable, Text, Text, TextPair> {
Text albumKey = new Text();
Text photoKey = new Text();
private HashSet<String> photoDeleted = new HashSet<String>();
private HashSet<String> albDeleted = new HashSet<String>();
Text interKey = new Text();
private TextPair interValue = new TextPair();
private static final Logger LOGGER = Logger.getLogger(SharedStreamsSlMapper.class);
protected void setup(Context context) throws IOException, InterruptedException {
int count=0;
Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
System.out.println(cacheFiles.length);
LOGGER.info(cacheFiles+"****");
try {
if (cacheFiles != null && cacheFiles.length > 0) {
for (Path path : cacheFiles) {
String line;
String[] tokens;
BufferedReader joinReader = new BufferedReader(new FileReader(path.toString()));
System.out.println(path.toString());
// BufferedReader joinReader = new BufferedReader(new FileReader("/Users/Kunal_Basak/Desktop/ss_test/dsitCache/part-m-00000"));
try {
while ((line = joinReader.readLine()) != null) {
count++;
tokens = line.split(SSConstants.TAB, 2);
if(tokens.length<2){
System.out.println("WL");
continue;
}
if (tokens[0].equals("P")) {
photoDeleted.add(tokens[1]);
}
else if (tokens[0].equals("A")) {
albDeleted.add(tokens[1]);
}
}
}
finally {
joinReader.close();
}
}
}
}
catch (IOException e) {
System.out.println("Exception reading DistributedCache: " + e);
}
System.out.println(count);
System.out.println("albdeleted *****"+albDeleted.size());
System.out.println("photo deleted *****"+photoDeleted.size());
LOGGER.info("albdeleted *****"+albDeleted.size());
LOGGER.info("albdeleted *****"+albDeleted.size());
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
try{
//my mapper code
}
}
}