I am doing some scraping in Node.js: I use request to connect to the site, cheerio to access the data, and mongodb to store what I extract. I also use async.js to queue the work and avoid infinite recursion.
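For completeness, these are roughly the requires I use (the connection string below is just a placeholder, not my real one):

// Roughly how the modules used in the summarized code are loaded
var request = require('request');
var cheerio = require('cheerio');
var mongo_client = require('mongodb').MongoClient;
var async = require('async'); // kept on self as self.asyn in the code below
var connection_string = 'mongodb://localhost:27017/scraper'; // placeholder MongoDB url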
I have a memory problem: the process keeps taking memory and never frees it. I think the problem is MongoDB, because if I don't use MongoDB the memory stays stable.
This is my summarized code:
// Use scrape_urls to process the urls, with a concurrency of 3
var q = self.asyn.queue(scrape_urls, 3);

// Push a bunch of urls ...
for (var j = 0; j < self.urls_data.length; j++) {
    q.push(self.urls_data[j]);
}

q.drain = function () {
    console.log("END");
};
function scrape_urls(data_url, next_action) {
    request({
        method: 'GET',
        url: data_url.url
    }, function (err, response, body) {
        var $ = cheerio.load(body);
        var data = { /* ... scraped data ... */ };

        // Open a new MongoDB connection for every url and insert the scraped data
        mongo_client.connect(connection_string, function (err, db) {
            if (err) { return console.dir(err); }
            var collection = db.collection('foo');
            collection.insert(data);
            next_action();
        });
    });
}
As I said, if I avoid MongoDB and only connect to the urls with request, the memory does not grow endlessly, so I think connecting to MongoDB is the problem.
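For comparison, this is roughly what the MongoDB-free version of scrape_urls looks like (same queue setup as above, only the insert part removed); with this version the memory stays stable:

// Variant without MongoDB: memory usage stays stable with this version
function scrape_urls(data_url, next_action) {
    request({
        method: 'GET',
        url: data_url.url
    }, function (err, response, body) {
        var $ = cheerio.load(body);
        var data = { /* ... scraped data ... */ };
        next_action();
    });
}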
Any ideas?