Consider using the aggregation framework to identify the duplicate documents: get a list of the duplicate actor _ids alongside the arrays of movie ids, then issue remove and update commands with those id arrays as the query.
For testing purposes, suppose you have the following minimal data in your collections:
db.movies.insert([
{
"_id" : ObjectId("5543e79e42063d2be5d2ea84"),
"id" : 1,
"genres" : []
},
{
"_id" : ObjectId("5543e79e42063d2be5d2ea85"),
"id" : 2,
"genres" : []
},
{
"_id" : ObjectId("5543e79e42063d2be5d2ea86"),
"id" : 3,
"genres" : []
}
]);
db.actors.insert([
{ id: 1, known_for: [ObjectId("5543e79e42063d2be5d2ea84")] },
{ id: 1, known_for: [ObjectId("5543e79e42063d2be5d2ea84")] },
{ id: 2, known_for: [ObjectId("5543e79e42063d2be5d2ea84"), ObjectId("5543e79e42063d2be5d2ea85")] },
{ id: 3, known_for: [ObjectId("5543e79e42063d2be5d2ea85"), ObjectId("5543e79e42063d2be5d2ea86")] }
]);
Now for the magical part. The aggregation pipeline groups the actors documents by id, calculates the grouped count, and creates two array fields that hold the duplicate actor _ids and the movie ObjectIds. The pipeline then outputs the results to a dupes collection that will be used later on to remove the duplicates:
db.actors.aggregate([
{
"$group": {
"_id": "$id",
"duplicates": { "$addToSet": "$_id" },
"movies": { "$addToSet": "$known_for"},
"count": { "$sum": 1 }
}
},
{
"$match": {
"count": { "$gt": 1 }
}
},
{
"$out": "dupes"
}
])
Querying the dupes collection will give the result:
/* 1 */
{
"_id" : 1.0000000000000000,
"duplicates" : [
ObjectId("5543fc8e42063d2be5d2eaa2"),
ObjectId("5543fc8e42063d2be5d2eaa1")
],
"movies" : [
[
ObjectId("5543e79e42063d2be5d2ea84")
]
],
"count" : 2
}
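As a side note, if you are on MongoDB 3.4 or newer, that array of arrays could be flattened inside the pipeline itself by adding an $addFields/$reduce stage before $out. This is just a sketch of that variant; the cleanup script below assumes the original, unflattened shape, so the nested loop there becomes unnecessary if you go this route:
db.actors.aggregate([
    {
        "$group": {
            "_id": "$id",
            "duplicates": { "$addToSet": "$_id" },
            "movies": { "$addToSet": "$known_for" },
            "count": { "$sum": 1 }
        }
    },
    { "$match": { "count": { "$gt": 1 } } },
    {
        // Fold the array of arrays into a single flat array of movie ObjectIds
        "$addFields": {
            "movies": {
                "$reduce": {
                    "input": "$movies",
                    "initialValue": [],
                    "in": { "$concatArrays": [ "$$value", "$$this" ] }
                }
            }
        }
    },
    { "$out": "dupes" }
])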
Now for the fun part. Use the dupes collection to remove the duplicates from the actors collection. As you will have noticed in the dupes collection, the movies field is an array of arrays (each element is a duplicate actor's known_for array), so you will need to flatten it and then use the flattened array to remove those movies and pull the now-orphaned movie references from the remaining actors documents:
db.dupes.find({}).forEach( function (doc) {
    var movie_dupes = [];
    // Remove the duplicate actor documents
    db.actors.remove({ "_id": { "$in": doc.duplicates } });
    // Flatten the array of arrays into a single list of movie ObjectIds
    doc.movies.forEach( function (arr) {
        arr.forEach(function (id) {
            movie_dupes.push(id);
        });
    });
    // Remove the movies that were referenced by the duplicates
    db.movies.remove({ "_id": { "$in": movie_dupes } });
    // Pull the now-orphaned movie references from the remaining actors
    db.actors.update(
        { "known_for": { "$in": movie_dupes } },
        { "$pull": { "known_for": { "$in": movie_dupes } } },
        { "multi": true }
    );
});
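remove(), update() and { "multi": true } still work in the legacy mongo shell, but if you are running MongoDB 3.2 or later (or mongosh) you could express the same cleanup with the newer CRUD helpers; a rough equivalent sketch:
db.dupes.find({}).forEach(function (doc) {
    // Same flattening as the nested forEach above, done with reduce/concat
    var movie_dupes = doc.movies.reduce(function (acc, arr) {
        return acc.concat(arr);
    }, []);
    db.actors.deleteMany({ "_id": { "$in": doc.duplicates } });
    db.movies.deleteMany({ "_id": { "$in": movie_dupes } });
    db.actors.updateMany(
        { "known_for": { "$in": movie_dupes } },
        { "$pull": { "known_for": { "$in": movie_dupes } } }
    );
});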
Logs to console:
Removed 2 record(s) in 38ms
Removed 1 record(s) in 2ms
Updated 1 existing record(s) in 1ms
Now to verify whether our duplicates have been obliterated:
db.actors.find()
/* 1 */
{
"_id" : ObjectId("5543fc8e42063d2be5d2eaa3"),
"id" : 2,
"known_for" : [
ObjectId("5543e79e42063d2be5d2ea85")
]
}
/* 2 */
{
"_id" : ObjectId("5543fc8e42063d2be5d2eaa4"),
"id" : 3,
"known_for" : [
ObjectId("5543e79e42063d2be5d2ea85"),
ObjectId("5543e79e42063d2be5d2ea86")
]
}
Actor with id 1 (which was a duplicate) was indeed removed.
db.movies.find()
/* 1 */
{
"_id" : ObjectId("5543e79e42063d2be5d2ea85"),
"id" : 2,
"genres" : []
}
/* 2 */
{
"_id" : ObjectId("5543e79e42063d2be5d2ea86"),
"id" : 3,
"genres" : []
}
Movie with ObjectId("5543e79e42063d2be5d2ea84") was removed.