3

I use the following code to insert 1,000,000 documents into MongoDB in a loop, but I found that the Node process takes up a lot of memory and my client dies.

// NOTE(review): this loop is the source of the memory blow-up. The for loop
// runs to completion synchronously, queuing all 1,000,000 insert requests
// (plus their callback closures) before the event loop can process a single
// response — so every pending request is buffered in memory at once.
db.collection("batch_insert", function (err, collection) {
    if (!err) {
        var count = 0;
        for (var i = 0; i < 1000000; i++) {
            // insert() is asynchronous: it only enqueues the request here;
            // the callback cannot fire until the loop has finished.
            collection.insert({hello:'world', ok:'OKOKOK'}, {safe:true, serializeFunctions:false}, function (err, result) {
                count++;
                // Close the connection once every insert has been acknowledged.
                if (1000000 == count) {
                    db.close();
                }
            });
        }
    } else {
        console.log(err);
    }
});
Steve
  • 281
  • 1
  • 3
  • 14

2 Answers

8

Your for loop blocks the event loop. It can't get to nextTick and handle query results until all of the queries have been sent to MongoDB. You need to use an asynchronous approach to batch-insert the data. Something like this:

var mongo = require('mongodb');

/**
 * Buffers documents and flushes them to a MongoDB collection in batches,
 * with a cap on the number of concurrent insert calls in flight.
 * @param {Object} collection - MongoDB collection exposing insert(docs, opts, cb).
 */
var Inserter = function (collection) {
    this.collection = collection;
    this.data = [];             // pending documents waiting to be flushed
    this.maxThreads = 6;        // max concurrent insert calls in flight
    this.currentThreads = 0;    // insert calls currently awaiting their callback
    this.batchSize = 5000;      // documents per insert call
    this.queue = 0;             // flushes deferred because maxThreads was reached
    this.inserted = 0;          // documents confirmed sent so far
    this.startTime = Date.now();
};

// Buffer one document for a later batched insert.
Inserter.prototype.add = function (data) {
    this.data.push(data);
};

// Flush up to one batch. Use force=true for the last (possibly short) batch.
Inserter.prototype.insert = function (force) {
    var that = this;
    if (this.data.length >= this.batchSize || force) {
        if (this.currentThreads >= this.maxThreads) {
            // Too many inserts already in flight; remember to flush again
            // when one of them completes.
            this.queue++;
            return;
        }
        // Capture the batch so we can count what was actually sent: the final
        // forced batch may be smaller than batchSize, so "+= batchSize" in the
        // callback would over-count.
        var batch = this.data.splice(0, this.batchSize);
        if (batch.length === 0) {
            return; // forced flush on an empty buffer — nothing to send
        }
        this.currentThreads++;
        console.log('Threads: ' + this.currentThreads);
        this.collection.insert(batch, {safe:true}, function (err) {
            if (err) {
                // Surface failures instead of silently dropping them.
                console.log(err);
            } else {
                that.inserted += batch.length;
            }
            var workTime = Math.round((Date.now() - that.startTime) / 1000);
            // Guard against division by zero during the first second.
            console.log('Speed: ' + (that.inserted / Math.max(workTime, 1)) + ' per sec');
            that.currentThreads--;
            if (that.queue > 0) {
                that.queue--;
                that.insert();
            }
        });
    }
};

// Connect to a local MongoDB and continuously feed the Inserter.
var db = new mongo.Db('test', new mongo.Server('localhost', 27017, {}), {native_parser:false});
db.open(function(err, db) {
    // Bail out on connection failure instead of calling methods on undefined.
    if (err) {
        console.log(err);
        return;
    }
    db.collection('test', function(err, collection) {
        if (err) {
            console.log(err);
            return;
        }
        var inserter = new Inserter(collection);
        // setInterval(..., 0) yields to the event loop between batches, so the
        // insert callbacks can run and memory stays bounded — unlike a plain
        // for loop that queues everything up front.
        // NOTE(review): this interval is never cleared, so it inserts forever;
        // clearInterval it once the desired document count is reached.
        setInterval(function() {
            for (var i = 0; i < 5000; i++) {
                inserter.add({test:'test'});
            }
            inserter.insert();
        }, 0);
    });
});
Vadim Baryshev
  • 25,689
  • 4
  • 56
  • 48
3

mongodb, just like any other database, takes some time to process requests. You're throwing a million requests at it, and since nothing in your code blocks, that means that at any time a whole bunch of them are going to be queued up somewhere (most likely in multiple places, with some of them inside the driver's code, others inside node's event loop). That takes more than a little bit of memory.

If the queuing didn't happen, you'd either block or drop some of the requests. There Ain't No Such Thing As A Free Lunch.

ebohlman
  • 14,795
  • 5
  • 33
  • 35
  • If I understand node correctly, there are no background threads, so *all* of them will be queued before the queue starts being processed (or at the very least before the first completion callback gets triggered). – Thilo Sep 07 '12 at 08:04
  • The mongo driver could be written in such a way that it does some async stuff internally when handling a request. – ebohlman Sep 07 '12 at 08:06
  • Okay. But the callbacks won't get executed (and dequeued) before the queuing loop is done, right? – Thilo Sep 07 '12 at 08:08