I am performing readstream.pipe(transformstream).pipe(writeStream). The application reads XML from a file system using a read stream --> does some manipulation based on different tags using a transform stream --> writes it to another file system.
During the transform operation inside the transform stream, I am calling buffer.toString() for every chunk, as I need to manipulate the string before pushing it:
// Transform hook: decodes each incoming chunk to a JavaScript string so it can
// be edited as text before being passed downstream.
myTransformStream._transform = function(chunk, encoding, done) {
  var text = chunk.toString();
  // perform changes on text and push it to writestream
};
The XML files can be up to 3 MB in size, and I noticed that one got divided into ~44 chunks.
Based on the given problem I have questions concerning memory consumption both in v8 heap and system:
1) I understand that buffers are stored outside the V8 heap. When I do a chunk.toString() in my _transform function, does it create a JavaScript string object inside V8 memory? If yes, I am assuming it will be garbage collected after it loses all its references.
2) As the buffers are part of the system memory I believe they are not garbage collected, so when is that memory freed up ?
3) Is the application a good use case of transform stream as I am converting every chunk to string ?
EDITED: Maybe I am not explaining myself clearly. Anyway, I have been trying to find a way of removing the namespace from the XML tags before converting the XML to JSON. Here is the code I ended up with. It takes advantage of memoization in JavaScript. Please let me know if you find a better or more memory-efficient way. We are using the actionhero.js framework.
// Action definition object; its metadata and run handler are attached below
// and the object is exported at the end of the file.
var action = {};
// XML parser used by action.run to turn the processed XML into an object.
var xml2js = require('xml2js');
// File-system module used by action.run to open the XML file as a stream.
var fs = require('fs');
/////////////////////////////////////////////////////////////////////
// metadata
// Descriptive metadata for the action: its name, a human-readable
// description, the accepted inputs, the connection types it refuses, and a
// sample of the output shape.
Object.assign(action, {
  name: 'removeNamespace',
  description: 'I will parse the article related xml and yield the result',
  inputs: {
    required: [],
    optional: []
  },
  blockedConnectionTypes: [],
  outputExample: {
    articleHeading: 'heading',
    articleBody: 'body'
  }
});
/////////////////////////////////////////////////////////////////////
// functional
/**
 * Streams the XML file from disk, strips namespace prefixes from element
 * tags, parses the result with xml2js and reports it to the caller.
 *
 * On success the parsed object is stored on `connection.response.xml2js`.
 * On failure (unreadable file, stream error, XML parse error) the error is
 * stored on `connection.error`.
 * NOTE(review): assumes actionhero reports failures through
 * `connection.error` - confirm for the framework version in use.
 *
 * @param {Object} api - framework API object (unused here).
 * @param {Object} connection - connection whose `response` receives the result.
 * @param {Function} next - completion callback, invoked exactly once as
 *   `next(connection, true)`.
 */
action.run = function(api, connection, next) {
  var stream = require('stream');
  var StringDecoder = require('string_decoder').StringDecoder;

  // Several events (stream errors, finish, parser callback) can end the run;
  // the guard guarantees `next` is called only once.
  var responded = false;
  function respond(err, result) {
    if (responded) {
      return;
    }
    responded = true;
    if (err) {
      connection.error = err;
    } else {
      connection.response.xml2js = result;
    }
    next(connection, true);
  }

  // Collapses whitespace between adjacent tags and drops the namespace
  // prefix from opening and closing tags (<ns:tag> -> <tag>, </ns:tag> ->
  // </tag>). Prefixes may contain digits, '_', '.' and '-' (e.g. "ns2",
  // "soap-env").
  function stripNamespaces(markup) {
    return markup
      .replace(/>\s+</g, '><')
      .replace(/<(\/?)[A-Za-z_][\w.\-]*:/g, '<$1');
  }

  // The decoder keeps the bytes of a multi-byte UTF-8 character that is split
  // across two chunks until the rest arrives, so no character is corrupted.
  var decoder = new StringDecoder('utf8');
  // Text received but not yet emitted. It always starts at the last '>' seen
  // (or holds everything when no '>' has arrived yet).
  var pending = '';
  var removeNamespace = new stream.Transform();

  removeNamespace._transform = function(chunk, encoding, done) {
    pending += decoder.write(chunk);
    // Emit only the text before the last '>' and hold the rest back. Neither
    // pattern in stripNamespaces can straddle that cut: a "<prefix:" match
    // contains no '>', and a ">   <" match starts with one. The streamed
    // output is therefore the same as rewriting the whole document at once,
    // without trimming chunks (which would delete significant whitespace
    // inside text or between attributes).
    var cut = pending.lastIndexOf('>');
    if (cut <= 0) {
      // Nothing can be safely emitted yet.
      done();
      return;
    }
    var ready = pending.slice(0, cut);
    pending = pending.slice(cut);
    done(null, stripNamespaces(ready));
  };

  // Emits whatever is still held back once the source has ended.
  removeNamespace._flush = function(done) {
    pending += decoder.end();
    var rest = stripNamespaces(pending);
    pending = '';
    if (rest) {
      removeNamespace.push(rest);
    }
    done();
  };

  // In-memory sink: chunks are collected and joined once at the end instead
  // of re-concatenating the whole buffer on every write.
  var collected = [];
  var target = new stream.Writable();
  target._write = function(chunk, encoding, callback) {
    collected.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk, encoding));
    callback();
  };

  var source = fs.createReadStream('path/to/your/xmlfile');

  // pipe() does not forward errors, so every stage needs its own handler.
  source.on('error', respond);
  removeNamespace.on('error', respond);
  target.on('error', respond);

  target.on('finish', function() {
    var parser = new xml2js.Parser();
    parser.parseString(Buffer.concat(collected).toString(), respond);
  });

  source.pipe(removeNamespace).pipe(target);
};
/////////////////////////////////////////////////////////////////////
// exports
// Expose the action definition object as this module's `action` export.
exports.action = action;