Purely from an aggregation framework point of view there are a few approaches to this.
You can either just apply $setUnion in modern releases:
// De-duplicate "foo_list" by taking the set union of the array with itself.
// Only "foo_list" is projected here, so other fields are not included in the
// output. NOTE(review): $setUnion treats its inputs as sets, so the order of
// the resulting elements should not be relied upon - confirm against the
// MongoDB documentation if order matters.
var dedupeStage = {
    "$project": {
        "foo_list": { "$setUnion": [ "$foo_list", "$foo_list" ] }
    }
};

db.collection.aggregate([ dedupeStage ])
Or, more traditionally, with $unwind and $addToSet:
// Stage 1: emit one document per element of "foo_list".
var unwindStage = { "$unwind": "$foo_list" };

// Stage 2: rebuild the array per original document; $addToSet collects each
// distinct element only once, which drops the duplicates.
var regroupStage = {
    "$group": {
        "_id": "$_id",
        "foo_list": { "$addToSet": "$foo_list" }
    }
};

db.collection.aggregate([ unwindStage, regroupStage ])
Or if you were just interested in the duplicates only then by general grouping:
// Stage 1: emit one document per element of "foo_list".
var unwindStage = { "$unwind": "$foo_list" };

// Stage 2: count how many times each element occurs within each document.
var countOccurrences = {
    "$group": {
        "_id": {
            "_id": "$_id",
            "foo_list": "$foo_list"
        },
        "count": { "$sum": 1 }
    }
};

// Stage 3: discard elements that occur exactly once, leaving only duplicates.
var onlyRepeated = { "$match": { "count": { "$ne": 1 } } };

// Stage 4: gather the duplicated elements back under their parent document.
var collectPerDocument = {
    "$group": {
        "_id": "$_id._id",
        "foo_list": { "$push": "$_id.foo_list" }
    }
};

db.collection.aggregate([
    unwindStage,
    countOccurrences,
    onlyRepeated,
    collectPerDocument
])
The last form could be useful to you if you actually want to "remove" the duplicates from your data with another update statement as it identifies the elements which are duplicates.
So in that last form the returned result from your sample data identifies the duplicate:
{
"_id" : ObjectId("53f5f7314ffa9b02cf01c076"),
"foo_list" : [
{
"id" : "98aa4987-d812-4aba-ac20-92d1079f87b2",
"name" : "Foo 1",
"slug" : "foo-1"
}
]
}
One result is returned for each document in your collection that contains duplicate entries in the array, and each result lists which entries are duplicated. This is the information you need in order to update: you loop over the results and use each one to build the update statements that remove the duplicates.
This is actually done with a pair of update operations ($unset followed by $pull) rather than a single one, as a simple $pull operation would remove "both" items, which is not what you want:
// Remove duplicated "foo_list" elements in place, keeping one copy of each.
//
// Step 1: per document, find every array element that occurs more than once
// and record how many times it occurs. The count is carried through to the
// final $group so that elements repeated three or more times can be fully
// de-duplicated, not just reduced by one.
var cursor = db.collection.aggregate([
    { "$unwind": "$foo_list" },
    { "$group": {
        "_id": {
            "_id": "$_id",
            "foo_list": "$foo_list"
        },
        "count": { "$sum": 1 }
    }},
    { "$match": { "count": { "$ne": 1 } } },
    { "$group": {
        "_id": "$_id._id",
        "foo_list": {
            "$push": { "item": "$_id.foo_list", "count": "$count" }
        }
    }}
]);

// Ordered bulk operations are required: each $unset must be applied before
// the next one is matched, and before the $pull that follows them.
var batch = db.collection.initializeOrderedBulkOp();
var count = 0;   // number of documents queued so far

cursor.forEach(function(doc) {
    doc.foo_list.forEach(function(dup) {
        // Blank out all but one occurrence. Each positional $unset sets the
        // "first" still-matching element to null, so repeating it
        // (count - 1) times leaves exactly one copy intact.
        for (var extra = 1; extra < dup.count; extra++) {
            batch.find({
                "_id": doc._id,
                "foo_list": { "$elemMatch": dup.item }
            }).updateOne({
                "$unset": { "foo_list.$": "" }
            });
        }
    });

    // A single $pull removes every null placeholder left in this document.
    batch.find({ "_id": doc._id }).updateOne({
        "$pull": { "foo_list": null }
    });

    count++;
    // Flush periodically so the queued operations do not grow unbounded.
    if ( count % 500 === 0 ) {
        batch.execute();
        batch = db.collection.initializeOrderedBulkOp();
    }
});

// Flush whatever is left over; skipped when the last flush already emptied
// the batch, since executing an empty bulk operation is an error.
if ( count % 500 !== 0 ) {
    batch.execute();
}
That's the modern MongoDB 2.6 and above way to do it, with a cursor result from aggregation and Bulk operations for updates. But the principles remain the same:
Identify the duplicates in documents
Loop the results to issue the updates to the affected documents
Use $unset with the positional $ operator to set the "first" matched array element to null
Use $pull to remove the null entry from the array
So after processing the above operations your sample now looks like this:
{
"_id" : ObjectId("53f5f7314ffa9b02cf01c076"),
"foo_list" : [
{
"id" : "98aa4987-d812-4aba-ac20-92d1079f87b2",
"name" : "Foo 1",
"slug" : "foo-1"
},
{
"id" : "157569ec-abab-4bfb-b732-55e9c8f4a57d",
"name" : "Foo 3",
"slug" : "foo-3"
}
]
}
The duplicate is removed, with the "duplicated" item still intact. That is the process for identifying and removing duplicate data from your collection.