I wrote a script in Node that iterates over a large MongoDB collection, returning a certain number of documents at a time.
The collection has this simple format:
{
    name: 'One',
    data: '...'
},
{
    name: 'Two',
    data: '...'
},
...
I'm doing this job with the Q library, using a sequence of promises that run one after the other:
'use strict';

var Q = require('q');
var monk = require('monk');

var CHUNK_SIZE = 100;
var LIMIT = 1000;
var collection = monk('localhost/dictionary').get('entries');

var promiseFactory = function (j) {
    return function (result) {
        if (undefined !== result) { // result is undefined on the first iteration and after the last one
            if (result.length) {
                for (var k = 0, max = result.length; k < max; k++) {
                    console.log(result[k].name); // print name
                    // ... do something with the document here...
                }
            } else { // no more documents, end of the iteration
                return; // implicitly returns undefined
            }
        }
        // returns CHUNK_SIZE documents, starting from the j-th document
        return collection.find({}, { limit: CHUNK_SIZE, skip: j, sort: { name: 1 } });
    };
};

var funcs = [];
for (var i = CHUNK_SIZE; i <= LIMIT; i += CHUNK_SIZE) {
    funcs.push(promiseFactory(i));
}

var loop = Q.fcall(promiseFactory(0));
funcs.forEach(function (f) {
    loop = loop.then(f);
});
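For illustration, with CHUNK_SIZE = 100 and a LIMIT of 300 the loop and forEach above would build the equivalent of this explicit chain:

Q.fcall(promiseFactory(0))      // result is undefined, so this step just issues find() with skip 0
    .then(promiseFactory(100))  // processes documents 0-99, then fetches the next chunk
    .then(promiseFactory(200))  // processes documents 100-199, and so on
    .then(promiseFactory(300)); // the chunk fetched by this last step is never processed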
The script works well and does achieve what it was designed to do.
However, I would like to improve it:
- I'm hardcoding the number of documents in the collection (LIMIT). I would like to get rid of this variable and let the script detect when to stop.
- I have a feeling that this approach may not be the most memory-efficient one. In my code, funcs.forEach() chains many copies of the same function in one shot (LIMIT/CHUNK_SIZE copies, to be exact). Since I'm working on a very large collection, I was wondering if there's a way to chain a new function only if there are still documents left, while running through the collection, along the lines of the sketch after this list.
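For the second point, the shape I have in mind is roughly the recursive chain below: each step schedules the next one only after seeing a non-empty chunk, so it also stops by itself and no LIMIT is needed. This is an untested sketch (the helper name processFrom is mine); it assumes, as my working code above already does, that monk's find() resolves with an empty array once skip moves past the end of the collection, and it wraps monk's thenable in Q() to get a full Q promise:

var processFrom = function (j) {
    // fetch one chunk; Q() assimilates monk's thenable into a Q promise
    return Q(collection.find({}, { limit: CHUNK_SIZE, skip: j, sort: { name: 1 } }))
        .then(function (result) {
            if (!result.length) {
                return; // empty chunk: past the end of the collection, stop recursing
            }
            for (var k = 0, max = result.length; k < max; k++) {
                console.log(result[k].name); // print name
                // ... do something with the document here...
            }
            return processFrom(j + CHUNK_SIZE); // only now chain the next chunk
        });
};

processFrom(0).done(); // done() rethrows errors instead of swallowing them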