I'm trying to read the first 1 million lines of the Pile corpus and write them to a new file with Node.js. The problem is that the output file does end up with one million lines, but its millionth line corresponds to roughly the 900,000th line of the original file.
This is what I used:
const fs = require("fs");

var buffer = '';
var rs1 = fs.createReadStream('00.jsonl', "ascii");
var ws = fs.createWriteStream('pile_1M.txt', { flags: 'w', encoding: 'ascii' });
var lines_num = 0;
var max_read = 1000000;

rs1.on('data', function (chunk) {
    rs1.pause();
    var lines = (buffer + chunk).split(/\r?\n/g);
    for (var i = 0; i < lines.length && lines_num < max_read; ++i) {
        lines_num++;
        ws.write(lines[i] + "\n");
    }
    if (lines_num >= max_read) {
        rs1.close();
        console.log("close");
    }
    rs1.resume();
});

rs1.on('close', function () {
    ws.close();
});
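As it turns out (see the edit below), the problem is what happens at chunk boundaries. A minimal sketch, with made-up chunk contents:

// A line that straddles two chunks gets split separately in each
// 'data' callback when the trailing fragment is not carried over.
var chunk1 = 'line one\nline tw';   // ends mid-line
var chunk2 = 'o\nline three\n';     // ends with a newline
console.log(chunk1.split(/\r?\n/g)); // [ 'line one', 'line tw' ]
console.log(chunk2.split(/\r?\n/g)); // [ 'o', 'line three', '' ]
// Without buffering, "line tw" and "o" are written as two separate
// lines, and the trailing '' becomes an empty line, so the output
// accumulates extra lines relative to the input.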
EDIT:
Thanks to @jfriend00, I came up with the following working solution:
const fs = require("fs");

var buffer = '';
var rs1 = fs.createReadStream('/media/user/hdd2/pile/00.jsonl', "utf-8");
var ws = fs.createWriteStream('pile_1M_2.txt', { flags: 'w', encoding: 'utf-8' });
var lines_num = 0;
var max_read = 1000000;
// Extract the value of the "text" field from a Pile JSONL line,
// e.g. {"text": "Hello world", "meta": {"pile_set_name": "Pile-CC"}}
// yields: Hello world
// If the meta marker is not found, the line is returned unchanged.
function json_line_to_text(line) {
    var index = line.indexOf(', "meta": {"');
    if (index == -1) {
        return line;
    }
    return line.substring(10, index - 1); // 10 skips the leading {"text": "
}
rs1.on('data', function (chunk) {
    rs1.pause();
    var lines = (buffer + chunk).split(/\r?\n/g);
    for (var i = 0; i < lines.length && lines_num < max_read; ++i) {
        if (i == lines.length - 1) {
            // The last element may be an incomplete line: keep it in the
            // buffer and prepend it to the next chunk instead of writing it.
            buffer = lines[i];
        } else {
            ws.write(json_line_to_text(lines[i]) + "\n");
            lines_num++;
        }
    }
    if (lines_num >= max_read) {
        rs1.close();
        console.log("close");
    }
    rs1.resume();
});

rs1.on('close', function () {
    ws.close();
});
I included the json_line_to_text function because I initially thought it was the true cause of the problem; it was not. The actual cause was that buffer was never updated with the trailing, possibly incomplete line of each chunk, so any line straddling a chunk boundary was written out as two separate lines and the line count drifted.
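For reference, Node's built-in readline module handles the chunk buffering by itself, so the same task can be written without a manual buffer. A minimal sketch using the same paths as above; it assumes every line is a well-formed JSON object with a "text" field (which is also what json_line_to_text relies on):

const fs = require("fs");
const readline = require("readline");

const rl = readline.createInterface({
    input: fs.createReadStream('/media/user/hdd2/pile/00.jsonl', "utf-8"),
    crlfDelay: Infinity, // treat \r\n as a single line break
});
const ws = fs.createWriteStream('pile_1M_2.txt', { flags: 'w', encoding: 'utf-8' });

let lines_num = 0;
const max_read = 1000000;

rl.on('line', function (line) {
    if (lines_num >= max_read) return; // rl.close() does not stop 'line' events immediately
    // Parsing the JSON is more robust than slicing the string by offsets.
    ws.write(JSON.parse(line).text + "\n");
    if (++lines_num >= max_read) {
        rl.close();
        console.log("close");
    }
});

rl.on('close', function () {
    ws.end();
});

For very large outputs, both versions should ideally also respect the return value of ws.write() for backpressure; neither snippet above does.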