I've run into a problem where I'm running out of memory because garbage collection doesn't free up resources efficiently enough to prevent many asynchronous functions from using it all up.
I am not keeping references to any objects made during the asynch functions so they should all be able to be collected once they go out of scope.
I'm recursing asynchronously through a directory tree making a plain text file to accompany every HTML file using cheerio.
I realize I can but some kind of delay or batching in to keep the script trying to convert huge numbers of files at once.
But how can I do so deterministically? Sometimes it works, sometimes I get an exception from cheerio but each time while working on a different file, which is why I think it's memory exhaustion and non deterministic.
I believe there's no way in JavaScript or NodeJS to exert control over garbage collection. What can be done?
My code:
var fs = require('fs')
, path = require('path')
, cheerio = require('cheerio')
;
var processFile = function(filePath) {
var ext = path.extname(filePath)
, res = false
;
if (ext === '.html' || ext === '.htm') {
res = true;
fs.readFile(filePath, 'utf8', function(err, data) {
var $;
try {
$ = cheerio.load(data);
$('script').remove();
// HTML to text
var pt = $('body').text().replace(/(\s|\r?\n)+/gm, function (ws) {
return /\r?\n.*\r?\n/.test(ws) ? '\n\n' : ' ';
}).trim();
var txtFilePath = filePath.substr(0, filePath.lastIndexOf('.')) + '.txt';
fs.writeFile(txtFilePath, pt, function(err) {
console.log('wrote "' + txtFilePath + '"');
});
} catch (exy) {
console.log('## BANG!', filePath);
throw exy;
}
});
}
return res;
}
var walkP = function(dir, done) {
fs.readdir(dir, function(err, list) {
if (err) return done(err);
var pending = list.length;
if (!pending) return done(null);
list.forEach(function(file) {
file = path.resolve(dir, file);
fs.stat(file, function(err, stat) {
if (stat && stat.isDirectory()) {
walkP(file, function(err, res) {
if (!--pending) done(null);
});
} else {
processFile(file);
if (!--pending) done(null);
}
});
});
});
};
walkP(process.cwd(), function(err, res) {
console.log('Walked ' + res);
});