I have a very big CSV file (370 GB) and enough RAM (64 GB), running on Windows 10.
I think the following is the best way to process the data on my system, but I'm not sure how to achieve it:
- I want to break it into 4 different CSV files (because I have a quad-core system).
- Then process each file on a different core (using cluster); there is a rough sketch of what I have in mind right after this list.
- After processing, the results should be combined into one file.
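This is roughly what I imagine the cluster part could look like (just a sketch, not tested; the file names chunk_0.csv … chunk_3.csv, out_0.csv … out_3.csv and combined.csv are placeholders, and the actual splitting of the big file is not shown here):

// Rough sketch only, NOT tested. Assumes the 370 GB file has already been
// split into chunk_0.csv ... chunk_3.csv; each worker writes out_<n>.csv and
// the master concatenates those into combined.csv when every worker is done.
var cluster = require('cluster'),
    fs = require('fs'),
    es = require('event-stream');

var NUM_WORKERS = 4; // one per core on my quad-core machine

if (cluster.isMaster) {
    var finished = 0;

    for (var i = 0; i < NUM_WORKERS; i++) {
        // hand each worker the index of the chunk it should process
        cluster.fork({ CHUNK_INDEX: i });
    }

    cluster.on('exit', function () {
        finished += 1;
        if (finished === NUM_WORKERS) {
            combineResults();
        }
    });

    // stream the per-worker outputs into one file, one after another
    function combineResults() {
        var combined = fs.createWriteStream('combined.csv');
        (function appendNext(j) {
            if (j === NUM_WORKERS) {
                combined.end();
                console.log('All workers done, result is in combined.csv');
                return;
            }
            var src = fs.createReadStream('out_' + j + '.csv');
            src.pipe(combined, { end: false });
            src.on('end', function () { appendNext(j + 1); });
        })(0);
    }
} else {
    var idx = process.env.CHUNK_INDEX;

    fs.createReadStream('chunk_' + idx + '.csv')      // placeholder chunk file
        .pipe(es.split())
        .pipe(es.mapSync(function (line) {
            // the per-line work (like in my current code below) would go here
            return line + '\n';
        }))
        .pipe(fs.createWriteStream('out_' + idx + '.csv'))
        .on('finish', function () {
            process.exit(0);
        });
}

The idea is that the master forks one worker per core, each worker streams only its own chunk, and once all workers have exited the master concatenates the per-worker outputs into a single file.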
Currently I'm using the following code to fetch and process the data:
var fs = require('fs'),
    util = require('util'),
    stream = require('stream'),
    es = require('event-stream'),
    path = require('path');

// output file the extracted column gets appended to
var dir = path.join(__dirname, './ttwe.csv');
var lineNr = 0;
// total number of lines, counted with: find /v /c "" AIR_Pre_Processed_Data_For_EDA_16th_June_2016.csv
var totalLines = 37931757;

var s = fs.createReadStream('AIR_Pre_Processed_Data_For_EDA_16th_June_2016.csv')
    .pipe(es.split())
    .pipe(es.mapSync(function (line) {
            s.pause();
            lineNr += 1;

            // second ^-separated field, with underscores replaced by spaces;
            // fall back to '' when the line has no second field
            let ttp = (line.split('^')[1] || '').replace(/_/g, ' ');
            if (ttp != 'NA' && ttp != 'undefined' && ttp != '') {
                fs.appendFile(dir, ttp + ',\n', function (err) {
                    if (err) console.log('Error while appending.', err);
                });
            }

            process.stdout.write('\u001B[2J\u001B[0;0f');
            console.log(lineNr, ' of ', totalLines, ' lines: ', parseInt((lineNr / totalLines) * 100), '%');
            s.resume();
        })
        .on('error', function (e) {
            console.log('Error while reading file.', e);
        })
        .on('end', function () {
            console.log('Read entire file.');
        })
    );