I am trying to write a custom bulk upload script using the new MongoDB bulk APIs. I am using an UnorderedBulkOp, which is very fast at first, but after execute has been called several times it begins to hang. I've added log lines, and it seems that things really start to blow up around the 10th call. If I stop the upload and restart it (there's code in place to check for dupes), the first several calls to execute are performant again, so the slowdown doesn't seem to depend on how much data is already in my collection. What is going on? I thought about pushing all of the operations onto the bulk op and calling execute only once, but another answer here suggested calling execute on the bulk op incrementally.
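For reference, the single-execute variant I had in mind would look roughly like this (just a sketch, not what I'm running; it skips the existing-document check and reuses the same stream, formatData, and variables from my real code shown further down):

var bulk = collection.initializeUnorderedBulkOp();
var current = '';
var dataread = fs.createReadStream(filepath, {encoding: 'utf8'});

dataread.on('data', function(data){
    var split = (current + data).split('\n');
    current = split.pop();
    for(var i = 0; i < split.length; i++){
        // queue every row; no execute calls inside the data handler
        bulk.insert(formatData(split[i].split(',')));
    }
});

dataread.on('end', function(){
    // one execute for the entire file
    bulk.execute({w: 1}, function(err, result){
        if(err) return console.error(err);
        console.log('inserted', result.nInserted);
    });
});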
Stripped down a bit, my actual code is doing this:
this.db.collection(collection_name, function(err, collection){
    var bulk = collection.initializeUnorderedBulkOp();
    var operations = 0;
    var dataread = fs.createReadStream(filepath, {encoding: 'utf8'});
    var current = '';

    // load and split data from the CSV
    dataread.on('data', function(data){
        dataread.pause();
        var chunk = current + data;
        var split = chunk.split('\n');
        current = split.pop();

        // collect the custom ids in this chunk
        var ids = [];
        for(var i = 0, len = split.length; i < len; i++){
            var lineData = split[i].split(',');
            ids.push(parseInt(lineData[0], 10));
        }

        // find which docs already exist and need to be updated
        collection.find({customid: {$in: ids}}).toArray(function(err, docs){
            var docmap = {};
            for(var i = 0, len = docs.length; i < len; i++){
                docmap[docs[i].customid] = docs[i];
            }

            for(var isplit = 0; isplit < split.length; isplit++){
                var lineData = split[isplit].split(',');
                var customid = parseInt(lineData[0], 10);
                // check for insert or update
                if(docmap[customid]){
                    // existing doc: queue an update
                    bulk.find({_id: docmap[customid]._id}).update({$push: {history: 1}});
                } else {
                    // new doc: queue an insert
                    bulk.insert(formatData(lineData));
                }
                operations++;
            }

            // flush the bulk op every ~10000 queued operations
            if(operations > 10000){
                bulk.execute({w: 1}, function(err, result){
                    operations = 0;
                    dataread.resume();
                });
            } else {
                dataread.resume();
            }
        });
    });
});
Originally I was doing this with individual calls to collection.save, but my dataset is currently on the order of 2 million data points, and I am looking to optimize since I will be running this upload once a week.
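For comparison, the original per-document version was essentially this (a simplified sketch, using the same CSV splitting and pause/resume as above):

dataread.on('data', function(data){
    dataread.pause();
    var split = (current + data).split('\n');
    current = split.pop();
    var pending = split.length;
    if(pending === 0) return dataread.resume();
    for(var i = 0; i < split.length; i++){
        // one write and one callback per row, instead of one bulk execute per batch
        collection.save(formatData(split[i].split(',')), {w: 1}, function(err){
            if(--pending === 0) dataread.resume();
        });
    }
});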