I have a very big CSV file (370 GB). I have enough RAM (64 GB) and I'm running Windows 10.

I think the following is the best way to process the data on my system, but I'm not sure how to achieve it:

  1. I want to break it into 4 different CSV files (because I have a quad-core system).
  2. Then process each file on a different core (using cluster; see the sketch after this list).
  3. After processing, the results should be combined into one.
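
Something like this is what I have in mind for step 2, using Node's built-in cluster module (an untested sketch; the part-N.csv names assume step 1 has already produced the four pieces):

var cluster = require('cluster');
var fs = require('fs');
var readline = require('readline');
var path = require('path');

var PARTS = 4; // one chunk per core

if (cluster.isMaster) {
    var finished = 0;
    for (var i = 0; i < PARTS; i++) {
        var worker = cluster.fork();
        // Tell each worker which chunk to handle.
        worker.send({ file: path.join(__dirname, 'part-' + i + '.csv') });
        worker.on('exit', function () {
            if (++finished === PARTS) {
                console.log('All workers done; merge the per-part outputs here (step 3).');
            }
        });
    }
} else {
    process.on('message', function (msg) {
        // Each worker streams its own chunk line by line.
        var rl = readline.createInterface({ input: fs.createReadStream(msg.file) });
        rl.on('line', function (line) {
            // per-line processing goes here
        });
        rl.on('close', function () { process.exit(0); });
    });
}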

Currently I'm using the following code to fetch and process the data:

var fs = require('fs'),
    es = require('event-stream'),
    path = require('path');

// Output file that the filtered column is appended to.
var outFile = path.join(__dirname, 'ttwe.csv');

var lineNr = 0;
// Total line count, from: find /v /c "" AIR_Pre_Processed_Data_For_EDA_16th_June_2016.csv
var totalLines = 37931757;

var s = fs.createReadStream('AIR_Pre_Processed_Data_For_EDA_16th_June_2016.csv')
    .pipe(es.split())
    .pipe(es.mapSync(function (line) {
        s.pause();

        lineNr += 1;
        // Second caret-delimited field, with underscores replaced by spaces.
        var ttp = (line.split('^')[1] || '').replace(/_/g, ' ');
        if (ttp !== 'NA' && ttp !== 'undefined' && ttp !== '') {
            fs.appendFile(outFile, ttp + ',\n', function (err) {
                if (err) console.error('Append failed:', err);
            });
        }
        process.stdout.write('\u001B[2J\u001B[0;0f');
        console.log(lineNr, 'of', totalLines, 'lines:', Math.floor((lineNr / totalLines) * 100), '%');

        s.resume();
    })
    .on('error', function (e) {
        console.log('Error while reading file.', e);
    })
    .on('end', function () {
        console.log('Read entire file.');
    })
);
  • Try https://www.npmjs.com/package/csvtojson#multi-cpu-core-support; it has multi-core support. – Satyam S May 23 '17 at 14:57

1 Answer


There's a package that splits this huge file into smaller ones: csv-split-stream.

You could define the maximum number of lines per chunk and then process the chunks separately (a sketch of that part follows below).

const csvSplitStream = require('csv-split-stream');
const fs = require('fs');

csvSplitStream.split(
  fs.createReadStream('input.csv'),
  {
    lineLimit: 10000
  },
  (index) => fs.createWriteStream(`output-${index}.csv`)
)
.then(csvSplitResponse => {
  console.log('csvSplitStream succeeded.', csvSplitResponse);
  // outputs: {
  //  "totalChunks": 350,
  //  "options": {
  //    "delimiter": "\n",
  //    "lineLimit": "10000"
  //  }
  // }
}).catch(csvSplitError => {
  console.log('csvSplitStream failed!', csvSplitError);
});

The example above is taken from the csv-split-stream documentation.
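
To cover the "process them separately" and "combine" steps, here is a minimal, untested sketch using child_process.fork. The names worker.js, result-N.csv and combined.csv are hypothetical; the worker script is assumed to read the chunk named in process.argv[2] and write its output to the file named in process.argv[3]:

const { fork } = require('child_process');
const fs = require('fs');

const chunks = ['output-0.csv', 'output-1.csv', 'output-2.csv', 'output-3.csv'];

// Fork one worker per chunk and wait for all of them to exit.
Promise.all(chunks.map((file, i) =>
  new Promise((resolve, reject) => {
    const child = fork('worker.js', [file, `result-${i}.csv`]);
    child.on('exit', code =>
      code === 0 ? resolve() : reject(new Error(`worker ${i} failed`)));
  })
)).then(() => {
  // Combine step: concatenate the per-chunk results into a single file.
  const out = fs.createWriteStream('combined.csv');
  (function append(i) {
    if (i === chunks.length) return out.end();
    const src = fs.createReadStream(`result-${i}.csv`);
    src.pipe(out, { end: false });
    src.on('end', () => append(i + 1));
  })(0);
}).catch(err => console.error(err));

Piping each result with { end: false } keeps the destination stream open until the last chunk has been appended.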
