I'm trying to remove duplicate documents in MongoDB in a large collection according to the approach described here:
db.events.aggregate([
{ "$group": {
"_id": { "firstId": "$firstId", "secondId": "$secondId" },
"dups": { "$push": "$_id" },
"count": { "$sum": 1 }
}},
{ "$match": { "count": { "$gt": 1 } }}
], {allowDiskUse:true, cursor:{ batchSize:100 } }).forEach(function(doc) {
doc.dups.shift();
db.events.remove({ "_id": {"$in": doc.dups }});
});
I.e. I want to remove events that has the same "firstId
- secondId
" combination. However after a while MongoDB responds with this error:
2016-11-30T14:13:57.403+0000 E QUERY [thread1] Error: getMore command failed: {
"ok" : 0,
"errmsg" : "BSONObj size: 17582686 (0x10C4A5E) is invalid. Size must be between 0 and 16793600(16MB)",
"code" : 10334
}
Is there anyway to get around this? I'm using MongoDB 3.2.6.