Personally, I would take advantage of the fact that ObjectId values themselves are roughly "monotonic", or "ever increasing in value" (they lead with a creation timestamp), which means that the "youngest" or "most recent" would come at the end of a naturally sorted list.
So rather than force the aggregation pipeline to do the sorting, the most logical and efficient thing to do is simply sort the list of unique _id
values returned per document as you process each response.
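To see why the natural order works, a quick shell check is enough; an ObjectId leads with a 4-byte creation timestamp, so comparing the values as strings follows insertion order. The two ObjectId values below are made up purely for illustration:

var a = ObjectId("55a3f79112fa911c2d307685"),
    b = ObjectId("55a3f79912fa911c2d307686");

a.getTimestamp();        // ISODate showing when `a` was generated
String(a) < String(b);   // true: the older value sorts first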
So basically, working from the listing that you must have found:
Remove Duplicates from MongoDB
And that is actually my answer ( and you're the second person to reference it this week, yet with no votes received for being useful! Hmm! ), where it's just a simple .sort()
applied within the cursor iteration for the returned array:
Using the _id Value
var bulk = db.History.initializeOrderedBulkOp(),
    count = 0;

// Group on the field(s) that make a document "unique". Only
// "sessionId" is used here for example purposes.
db.History.aggregate([
    { "$group": {
        "_id": "$sessionId",
        "ids": { "$push": "$_id" },   // $push is fine here; _id values are already unique
        "count": { "$sum": 1 }
    }},
    { "$match": { "count": { "$gt": 1 } } }
],{ "allowDiskUse": true }).forEach(function(doc) {
    doc.ids.sort().reverse();   // <-- this is the only real change
    doc.ids.shift();            // pull off the first element, now the youngest, so it is kept
    bulk.find({ "_id": { "$in": doc.ids } }).remove(); // queue removal of everything left in the list
    count++;

    // Execute once per 1000 statements and re-init
    if ( count % 1000 == 0 ) {
        bulk.execute();
        bulk = db.History.initializeOrderedBulkOp();
    }
});

if ( count % 1000 != 0 )
    bulk.execute();
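If it helps to visualize, here is what that sort().reverse() and shift() sequence does to one group's "ids" array. The ObjectId values are invented for illustration; the default string sort works because the hex timestamp leads each value:

var ids = [
    ObjectId("55a3f79112fa911c2d307685"),   // oldest
    ObjectId("55a3f79912fa911c2d307686"),
    ObjectId("55a3f7a112fa911c2d307687")    // youngest
];

ids.sort().reverse();   // youngest is now at index 0
ids.shift();            // pulls the youngest out, so it survives
// ids now lists only the two older documents, which go to .remove()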
Using a specific field
If you "really" are set on adding another date value on which to determine which is youngest then just add to the array in $push
first, then apply the client side sort function. Again just a really simple change:
var bulk = db.History.initializeOrderedBulkOp(),
    count = 0;

// Group on the field(s) that make a document "unique". Only
// "sessionId" is used here for example purposes.
db.History.aggregate([
    { "$group": {
        "_id": "$sessionId",
        "ids": { "$push": {
            "_id": "$_id",
            "created": "$timeCreated"
        }},
        "count": { "$sum": 1 }
    }},
    { "$match": { "count": { "$gt": 1 } } }
],{ "allowDiskUse": true }).forEach(function(doc) {
    doc.ids = doc.ids.sort(function(a,b) {          // sort by date, newest first
        return b.created.valueOf() - a.created.valueOf();
    }).map(function(el) { return el._id });         // then keep just the _id
    doc.ids.shift();   // pull off the first element, now the youngest, so it is kept
    bulk.find({ "_id": { "$in": doc.ids } }).remove(); // queue removal of everything left in the list
    count++;

    // Execute once per 1000 statements and re-init
    if ( count % 1000 == 0 ) {
        bulk.execute();
        bulk = db.History.initializeOrderedBulkOp();
    }
});

if ( count % 1000 != 0 )
    bulk.execute();
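And the same idea for the comparator version, on a hypothetical "ids" array, just to show that the newest entry ends up first and is therefore the one shift() preserves:

var ids = [
    { "_id": 1, "created": ISODate("2016-01-01T00:00:00Z") },
    { "_id": 2, "created": ISODate("2016-03-01T00:00:00Z") },   // youngest
    { "_id": 3, "created": ISODate("2016-02-01T00:00:00Z") }
];

ids = ids.sort(function(a,b) {
    return b.created.valueOf() - a.created.valueOf();   // newest first
}).map(function(el) { return el._id });
// ids is now [ 2, 3, 1 ], and shift() takes 2 off the deletion list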
So it's a really simple change, with no "real" alteration to the original process used to identify the duplicates and then remove all but one of them.
The best approach here is always to let the server do the job of finding the duplicates, and then, client side as you iterate the cursor, work out from the returned array which document is going to be kept and which ones you are going to remove.