
My collection has records like:

{ "_id":"1", "field1":"foo","field2":"xyz", "field3":"something" ...}
{ "_id":"2", "field1":"bar","field2":"xyz", "field3":"something" ...}
{ "_id":"3", "field1":"foo","field2":"abc", "field3":"something" ...}
{ "_id":"4", "field1":"bar","field2":"lmn", "field3":"something" ...}

Before inserting a new record, I need to check if a record with the same field1 and field2 values already exists, and discard the request if it does. I can manage this when inserting one record at a time. How do I handle it when doing a bulk insert (i.e. when inserting an array of documents)?

I have an array of [field1, field2] combinations which I need to look up. Example:

queryArray=[ { "field1":"foo","field2":"xyz"},
             { "field1":"bar","field2":"lmn"} ]

Expected result:

result=[  { "_id":"1", "field1":"foo","field2":"xyz", "field3":"something" ...},
          { "_id":"4", "field1":"bar","field2":"lmn", "field3":"something" ...}]
  • one way to do it is to create a unique index on { field1: 1, field2: 1 }, which ensures that field1 and field2 are unique together while inserting the docs – Atish May 24 '16 at 11:39

2 Answers


Create a unique compound index on both fields:

db.collection.createIndex( { "field1": 1, "field2": 1 }, { "unique": true } )

Use the insertMany() method to do the bulk insert, but set the ordered option to false so that all write operations are attempted even when some fail. Ordered operations stop after an error, while unordered operations continue to process any remaining write operations in the queue:

var queryArray = [ 
    { "field1": "foo", "field2": "xyz" },
    { "field1": "bar", "field2": "lmn" }
];
try {
    db.collection.insertMany(queryArray, { "ordered": false });
} catch (e) {
    print(e);
}

This will output a document like the following:

{
    "acknowledged" : true,
    "insertedIds" : [ 
        ObjectId("57443e6fa58e5654f3a6c5ae"), 
        ObjectId("57443e6fa58e5654f3a6c5af")
    ]
}

The resulting document shows the field acknowledged as true if the operation ran with write concern (or false if write concern was disabled), and an array of _id values for each successfully inserted document.

Because the documents in queryArray did not include _id, mongod creates and adds the _id field to each document and assigns it a unique ObjectId value. And since you enforced uniqueness on the two fields field1 and field2, the above shows the attempted writes; because the operation was unordered, it continued to process any remaining write operations.
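
For instance, in the case where all the inserts succeed, you can capture the returned document and work with the generated ids (a small sketch in the mongo shell):

var res = db.collection.insertMany(queryArray, { "ordered": false });
res.insertedIds.forEach(function (id) {
    print(id); // the ObjectId assigned to each successfully inserted document
});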


Suppose you had removed the ordered option (by default it's set to true); you would then get the following output from the operation:

var queryArray = [ 
    { "field1": "foo", "field2": "xyz" },
    { "field1": "bar", "field2": "lmn" }
];
try {
    db.collection.insertMany(queryArray);
} catch (e) {
    print(e);
}

Console Output:

{
    "name" : "BulkWriteError",
    "message" : "write error at item 0 in bulk operation",
    "ok" : undefined,
    "nInserted" : 0,
    "nUpserted" : 0,
    "nMatched" : 0,
    "nModified" : 0,
    "nRemoved" : 0,
    "getUpsertedIds" : function () {
      return bulkResult.upserted;
    },
    "getUpsertedIdAt" : function (index) {
      return bulkResult.upserted[index];
    },
    "getRawResponse" : function () {
      return bulkResult;
    },
    "hasWriteErrors" : function () {
      return bulkResult.writeErrors.length > 0;
    },
    "getWriteErrorCount" : function () {
      return bulkResult.writeErrors.length;
    },
    "getWriteErrorAt" : function (index) {
      if(index < bulkResult.writeErrors.length) {
        return bulkResult.writeErrors[index];
      }
      return null;
    },
    "getWriteErrors" : function () {
      return bulkResult.writeErrors;
    },
    "hasWriteConcernError" : function () {
      return bulkResult.writeConcernErrors.length > 0;
    },
    "getWriteConcernError" : function () {
      if(bulkResult.writeConcernErrors.length == 0) {
        return null;
      } else if(bulkResult.writeConcernErrors.length == 1) {
        // Return the error
        return bulkResult.writeConcernErrors[0];
      } else {

        // Combine the errors
        var errmsg = "";
        for(var i = 0; i < bulkResult.writeConcernErrors.length; i++) {
          var err = bulkResult.writeConcernErrors[i];
          errmsg = errmsg + err.errmsg;
          // TODO: Something better
          if (i != bulkResult.writeConcernErrors.length - 1) {
            errmsg = errmsg + " and ";
          }
        }

        return new WriteConcernError({ errmsg : errmsg, code : WRITE_CONCERN_FAILED });
      }
    },
    "tojson" : function (indent, nolint) {
      return tojson(bulkResult, indent, nolint);
    },
    "toString" : function () {
      return "BulkWriteError(" + this.tojson() + ")";
    },
    "shellPrint" : function () {
      return this.toString();
    },
    "hasErrors" : function () {
      return this.hasWriteErrors() || this.hasWriteConcernError();
    },
    "toSingleResult" : function () {
      if(singleBatchType == null) throw Error(
          "Cannot output single WriteResult from multiple batch result");
      return new WriteResult(bulkResult, singleBatchType, writeConcern);
    },
    "stack" : "BulkWriteError({\n\t\"writeErrors\" : [\n\t\t{\n\t\t\t\"index\" : 0,\n\t\t\t\"code\" : 11000,\n\t\t\t\"errmsg\" : \"E11000 duplicate key error index: test.collection.$field1_1_field2_1 dup key: { : \\\"foo\\\", : \\\"xyz\\\" }\",\n\t\t\t\"op\" : {\n\t\t\t\t\"_id\" : ObjectId(\"574441aea58e5654f3a6c5b6\"),\n\t\t\t\t\"field1\" : \"foo\",\n\t\t\t\t\"field2\" : \"xyz\"\n\t\t\t}\n\t\t}\n\t],\n\t\"writeConcernErrors\" : [ ],\n\t\"nInserted\" : 0,\n\t\"nUpserted\" : 0,\n\t\"nMatched\" : 0,\n\t\"nModified\" : 0,\n\t\"nRemoved\" : 0,\n\t\"upserted\" : [ ]\n})\nBulkWriteError@src/mongo/shell/bulk_api.js:372:44\nBulkWriteResult/this.toError@src/mongo/shell/bulk_api.js:335:16\nBulk/this.execute@src/mongo/shell/bulk_api.js:1162:1\nDBCollection.prototype.insertMany@src/mongo/shell/crud_api.js:279:5\n@(shell):1:7\n",
    "toResult" : function () {
      return new BulkWriteResult(bulkResult, singleBatchType, writeConcern);
    }
}

With emphasis on the returned write error:

"E11000 duplicate key error index: test.collection.$field1_1_field2_1 dup key: { : \\\"foo\\\", : \\\"xyz\\\" }\"

Apart from the insertMany() method, you could also try the Bulk() API methods, in particular initializeUnorderedBulkOp(), to do an unordered bulk insert after creating the unique compound index.

Consider the following example for the above case:

db.collection('collectionName', function(err, collection) {
    var bulk = collection.initializeUnorderedBulkOp();
    var counter = 0;

    queryArray.forEach(function (doc) {
        bulk.insert(doc);
        counter++;

        // Execute in batches of 1000, then re-initialise immediately so
        // subsequent inserts are queued on a fresh bulk object
        if (counter % 1000 === 0) {
            bulk.execute(function(err, result) {
                // you could do something with results, check for duplicate errors
            });
            bulk = collection.initializeUnorderedBulkOp(); // re-initialise
        }
    });

    // Clean up remaining operations in the queue
    if (counter % 1000 !== 0) {
        bulk.execute(function(err, result) {
            // you could do something with results, check for duplicate errors
            console.log(result);
        });
    }
});
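
For the "check for duplicate errors" part, the Node.js driver's bulk result mirrors the shell helpers shown in the error dump above. One plausible shape for that callback (a sketch; the exact object on which write errors surface varies by driver version, so both err and result are guarded):

bulk.execute(function(err, result) {
    // duplicate-key failures may appear on the error, the result, or both,
    // depending on the driver version
    var res = result || (err && err.result);
    if (res && res.hasWriteErrors && res.hasWriteErrors()) {
        res.getWriteErrors().forEach(function (we) {
            if (we.code === 11000) { // the [field1, field2] pair already existed
                console.log('duplicate skipped:', we.errmsg);
            }
        });
    }
});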
  • Thanks for the detailed explanation. This solution works perfectly fine; I feel this is a very neat and clean approach. I'm using mongoskin for my db interactions, and it looks like they don't support the insertMany feature. Unfortunately, I need to figure out some other way around this now. – Kavya Mugali May 25 '16 at 09:11
  • @KavyaMugali I've added a workaround that uses the `Bulk()` API, which mongoskin supports; you just need to call it using the underlying collection object from mongoskin – chridam May 25 '16 at 10:04

Going through https://docs.mongodb.com/manual/reference/method/Bulk.find.update/#Bulk.find.update, it looks like we can use the bulk operation methods. Note, however, that you still have to call bulk.find().upsert().updateOne() for each doc in your query array, because you want to find and update the corresponding document from your queryArray.

Still, using bulk might be better because you can prepare the whole operation and then execute it in one go, instead of running each query one by one.

e.g. using MongoDB shell

> //verifying db is empty
> db.items.find()
>
> //creating my bulk update function
> var myUpdate = function (rec) { this.find(rec).upsert().updateOne( {$set: rec} ); return null; }
>
> //initializing docs array with 4 documents
> var docs =
... [
...     { "_id":"1", "field1":"foo","field2":"xyz", "field3":"something31"},
...     { "_id":"2", "field1":"bar","field2":"xyz", "field3":"something32"},
...     { "_id":"3", "field1":"foo","field2":"abc", "field3":"something33"},
...     { "_id":"4", "field1":"bar","field2":"lmn", "field3":"something34", "field4": "something44" }
... ]
>
> //initializing the bulk operation object
> var bulk = db.items.initializeUnorderedBulkOp();
>
> //checking current state of bulk object
> bulk
{ "nInsertOps" : 0, "nUpdateOps" : 0, "nRemoveOps" : 0, "nBatches" : 0 }
>
> //mapping each doc in the docs array to my update function
> docs.map(myUpdate, bulk);
[ null, null, null, null ]
>
> //checking current state of bulk object
> bulk
{ "nInsertOps" : 0, "nUpdateOps" : 4, "nRemoveOps" : 0, "nBatches" : 1 }
>
>
> //now actually bulk updating the db
> bulk.execute();
BulkWriteResult({
        "writeErrors" : [ ],
        "writeConcernErrors" : [ ],
        "nInserted" : 0,
        "nUpserted" : 4,
        "nMatched" : 0,
        "nModified" : 0,
        "nRemoved" : 0,
        "upserted" : [
                {
                        "index" : 0,
                        "_id" : "1"
                },
                {
                        "index" : 1,
                        "_id" : "2"
                },
                {
                        "index" : 2,
                        "_id" : "3"
                },
                {
                        "index" : 3,
                        "_id" : "4"
                }
        ]
})
>
>
> //checking for newly inserted docs
> db.items.find();
{ "_id" : "1", "field1" : "foo", "field2" : "xyz", "field3" : "something31" }
{ "_id" : "2", "field1" : "bar", "field2" : "xyz", "field3" : "something32" }
{ "_id" : "3", "field1" : "foo", "field2" : "abc", "field3" : "something33" }
{ "_id" : "4", "field1" : "bar", "field2" : "lmn", "field3" : "something34", "field4" : "something44" }
>
>
> //now preparing to upsert new docs (might be existing docs, per your example)
> var newDocs =
... [
...     { "field1":"foo","field2":"xyz"},
...     { "field1":"bar","field2":"lmn"}
... ]
>
>
> //initializing the bulk object
> var bulk = db.items.initializeUnorderedBulkOp();
>
> //mapping the myUpdate function to each new document in the newDocs array
> newDocs.map(myUpdate, bulk);
[ null, null ]
>
>
> //now actually bulk updating the db
> bulk.execute();
BulkWriteResult({
        "writeErrors" : [ ],
        "writeConcernErrors" : [ ],
        "nInserted" : 0,
        "nUpserted" : 0,
        "nMatched" : 2,
        "nModified" : 0,
        "nRemoved" : 0,
        "upserted" : [ ]
})
>
> //notice how the nMatched = 2, and nModified = 0
>
> //verifying that nothing changed in the db
> db.items.find( {$or: newDocs} )
{ "_id" : "1", "field1" : "foo", "field2" : "xyz", "field3" : "something31" }
{ "_id" : "4", "field1" : "bar", "field2" : "lmn", "field3" : "something34", "field4" : "something44" }
>
>
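
The same pattern also carries over to the Bulk() API code from the first answer if you are not working in the shell: inside the loop there, you would queue an upsert instead of an insert, exactly as the myUpdate function above does. A sketch:

queryArray.forEach(function (doc) {
    bulk.find(doc).upsert().updateOne({ "$set": doc });
});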