
I'm getting a JavaScript heap out of memory error in my Node.js application. I'm trying to insert 408,000 records into MongoDB in one run. I have two loops: the outer loop runs 24 times and the inner loop (inside the first) runs 17,000 times. The data come from a NetCDF file: I parse the file, build the model objects, and insert them into MongoDB.

I've seen some posts on Stack Overflow about this problem, and they suggest increasing Node's memory with --max_old_space_size. But I don't know if that's the right approach. Maybe you have some suggestions to optimize my code?
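
For reference, the flag is passed when starting Node; the value is in megabytes, and server.js here is just a placeholder for the entry script:

node --max_old_space_size=4096 server.js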

Here are my loops:

for (var time_pos = 0; time_pos < 24; time_pos++) {

    // This array contains 17,000 values
    var dataSliced = file.root.variables['pm10_conc'].readSlice(
        time_pos, time_size,
        level_pos, level_size,
        lat_from, lat_size,
        lng_from, lng_size
    );

    // Loop: 0 to 17,000
    for (var i = 0; i < dataSliced.length; i++) {
        var pollution = new Pollution();

        pollution.latitude   = current_lat;
        pollution.longitude  = current_lng;
        pollution.country    = country_name;
        pollution.model      = model_name;
        pollution.data_type  = type_name;
        pollution.level      = 0;
        pollution.datetime   = date;
        pollution.pollutants.pm10.description = description;
        pollution.pollutants.pm10.units = units;
        pollution.pollutants.pm10.concentration = dataSliced[i];

        pollution.save(function(err){
            if (err) throw err;
            console.log("Data saved");
        })
    }
}

And here is my error:

<--- Last few GCs --->

   56782 ms: Mark-sweep 1366.6 (1436.9) -> 1366.6 (1436.9) MB, 1943.5 / 0.0 ms [allocation failure] [GC in old space requested].
   58617 ms: Mark-sweep 1366.6 (1436.9) -> 1366.6 (1436.9) MB, 1834.9 / 0.0 ms [allocation failure] [GC in old space requested].
   60731 ms: Mark-sweep 1366.6 (1436.9) -> 1368.6 (1417.9) MB, 2114.3 / 0.0 ms [last resort gc].
   62707 ms: Mark-sweep 1368.6 (1417.9) -> 1370.7 (1417.9) MB, 1975.8 / 0.0 ms [last resort gc].


<--- JS stacktrace --->

==== JS stack trace =========================================

Security context: 0x3a7c3fbcfb51 <JS Object>
    1: fnWrapper [/var/www/html/Project/node_modules/hooks-fixed/hooks.js:185] [pc=0x6ccee7825d4] (this=0x3a7c3fbe6119 <JS Global Object>)
    2: fn [/var/www/html/Project/node_modules/mongoose/lib/schema.js:~250] [pc=0x6ccee7d8ffe] (this=0xd29dd7fea11 <a model with map 0x994a88e5849>,next=0x1cbe49858589 <JS Function fnWrapper (SharedFunctionInfo 0x3d8ecc066811)>,done=0x1cbe498586...

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
 1: node::Abort() [node]
 2: 0x1098b2c [node]
 3: v8::Utils::ReportApiFailure(char const*, char const*) [node]
 4: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [node]
 5: v8::internal::Factory::NewTransitionArray(int) [node]
 6: v8::internal::TransitionArray::Insert(v8::internal::Handle<v8::internal::Map>, v8::internal::Handle<v8::internal::Name>, v8::internal::Handle<v8::internal::Map>, v8::internal::SimpleTransitionFlag) [node]
 7: v8::internal::Map::CopyReplaceDescriptors(v8::internal::Handle<v8::internal::Map>, v8::internal::Handle<v8::internal::DescriptorArray>, v8::internal::Handle<v8::internal::LayoutDescriptor>, v8::internal::TransitionFlag, v8::internal::MaybeHandle<v8::internal::Name>, char const*, v8::internal::SimpleTransitionFlag) [node]
 8: v8::internal::Map::CopyAddDescriptor(v8::internal::Handle<v8::internal::Map>, v8::internal::Descriptor*, v8::internal::TransitionFlag) [node]
 9: v8::internal::Map::CopyWithField(v8::internal::Handle<v8::internal::Map>, v8::internal::Handle<v8::internal::Name>, v8::internal::Handle<v8::internal::FieldType>, v8::internal::PropertyAttributes, v8::internal::Representation, v8::internal::TransitionFlag) [node]
10: v8::internal::Map::TransitionToDataProperty(v8::internal::Handle<v8::internal::Map>, v8::internal::Handle<v8::internal::Name>, v8::internal::Handle<v8::internal::Object>, v8::internal::PropertyAttributes, v8::internal::Object::StoreFromKeyed) [node]
11: v8::internal::LookupIterator::PrepareTransitionToDataProperty(v8::internal::Handle<v8::internal::JSObject>, v8::internal::Handle<v8::internal::Object>, v8::internal::PropertyAttributes, v8::internal::Object::StoreFromKeyed) [node]
12: v8::internal::StoreIC::LookupForWrite(v8::internal::LookupIterator*, v8::internal::Handle<v8::internal::Object>, v8::internal::Object::StoreFromKeyed) [node]
13: v8::internal::StoreIC::UpdateCaches(v8::internal::LookupIterator*, v8::internal::Handle<v8::internal::Object>, v8::internal::Object::StoreFromKeyed) [node]
14: v8::internal::StoreIC::Store(v8::internal::Handle<v8::internal::Object>, v8::internal::Handle<v8::internal::Name>, v8::internal::Handle<v8::internal::Object>, v8::internal::Object::StoreFromKeyed) [node]
15: v8::internal::Runtime_StoreIC_Miss(int, v8::internal::Object**, v8::internal::Isolate*) [node]
16: 0x6ccee4092a7
Aborted
[nodemon] app crashed - waiting for file changes before starting...

Do you know if there is a way to optimize my code, or is increasing Node's memory the best way?

EDIT

I have a working solution now. I first tried Mongoose's insertMany(), but I got the same fatal allocation error.

Then I removed the new Pollution() calls and pushed my data into a plain array instead. After that I'm using collection.insert and async.each like this:

var pollution = [];   

for (var time_pos = 0; time_pos < 24; time_pos++) {

    // This array contains 17 000 data
    var dataSliced = file.root.variables['pm10_conc'].readSlice(
        time_pos, time_size,
        level_pos, level_size,
        lat_from, lat_size,
        lng_from, lng_size
    );

    async.each(dataSliced, function (item, next){

        pollution.push({
            'longitude' :current_lng,
            'latitude'  :current_lat,
            'country'   :country_name,
            'model'     :model_name,
            'data_type' :type_name,
            'level'     :0,
            'datetime'  : date,
            'pollution': {
                'pm10': {
                    'description': description,
                    'units': units,
                    'concentration': item
                }
            }
        });

        next();
    });
}

Pollution.collection.insert(pollution, function(err, docs){
    if (err) throw err;

    console.log("Data saved");
});

If you have a better solution, feel free to post it as an answer.


1 Answer


I hope this helps you and others... :-)

I have been doing quite intensive research on what works best for importing data into MongoDB. I have used mongoimport as well as Mongoose with the insertMany method (which uses the native MongoDB driver). I have read that it is best to keep batch sizes at about 100 for the best performance. Here is my solution using insertMany. Using mongoimport is quite trivial (just one line on the command line), so I do not think it is necessary to post it here.

In my example, 602,198 records were first parsed into an array of objects and then successfully imported into MongoDB.

It takes some memory to import the parsed objects into MongoDB, so it is usually necessary to use the command below to allow Node to use more memory; you can read more here.

node --max_old_space_size=8000  partImportNew.js

To increase efficiency, I split the array of objects into batches and use Promise.all, which resolves when all of the promises in the iterable argument have resolved.
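
Stripped of the file handling, the pattern looks roughly like this (a minimal sketch assuming a Mongoose model Model and an already-parsed records array; not the exact implementation shown below):

const batchSize = 100;
const ops = [];
for (let i = 0; i < records.length; i += batchSize) {
  // insertMany() returns a promise; collect one promise per batch
  ops.push(Model.insertMany(records.slice(i, i + batchSize)));
}
// Promise.all resolves once every batch has been inserted
Promise.all(ops).then(() => console.log("All batches inserted"));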

If you have larger files and still run out of memory even after increasing the memory allowance for Node, you can split the files. Remove the headers beforehand and add them in the CSV parser options instead.

To split the files:

$ split -l numberoflines filename
ex. split -l 1000000 term2.csv

Let's say term2.csv has 5,000,001 lines and no headers. With the example above you will get 6 files: 5 files with one million lines each and one file with a single line.
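
As a quick check (assuming a Unix shell; by default split names its output files xaa, xab, and so on), you can verify the line counts of the resulting pieces with:

$ wc -l x*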

Have a look at how I solved it in the bulkImportToMongo function in the mongodb.js file.

console

➜  database git:(master) ✗ node --max_old_space_size=8000  partImport.js
Connected to db!
Time to parse file: : 5209.325ms
Disconnected from db!
Time to import parsed objects to db: : 153606.545ms
➜  database git:(master) ✗

parseCSV.js

const csv = require("fast-csv");

function promiseCSV(filePath, options) {
  return new Promise((resolve, reject) => {
    console.time("Time to parse file");
    var records = [];
    csv
      .fromPath(filePath, options)
      .on("data", record => {
        records.push(record);
      })
      // Reject the promise if the CSV stream errors, instead of hanging forever
      .on("error", reject)
      .on("end", () => {
        console.timeEnd("Time to parse file");
        resolve(records);
      });
  });
}

module.exports = promiseCSV;

mongodb.js

const mongoose = require("mongoose");
mongoose.Promise = global.Promise;

function connectToMongo(databaseName) {
  mongoose.connect(`mongodb://localhost:27017/${databaseName}`, {
    keepAlive: true,
    reconnectTries: Number.MAX_VALUE,
    useMongoClient: true
  });
  console.log("Connected to db!");
}

function disconnectFromMongo() {
  mongoose.disconnect();
  console.log("Disconnected from db!");
}

function bulkImportToMongo(arrayToImport, mongooseModel) {
  const Model = require(`../../../models/${mongooseModel}`);
  const batchSize = 100;
  const batchCount = Math.ceil(arrayToImport.length / batchSize);
  let ops = [];
  let counter = 0;
  // Slice the parsed array into batches of 100 and start one insertMany per batch
  for (let i = 0; i < batchCount; i++) {
    let batch = arrayToImport.slice(counter, counter + batchSize);
    counter += batchSize;
    ops.push(Model.insertMany(batch));
  }
  // Resolves when every batch has been inserted
  return Promise.all(ops);
}

module.exports.bulkImportToMongo = bulkImportToMongo;
module.exports.connectToMongo = connectToMongo;
module.exports.disconnectFromMongo = disconnectFromMongo;

partImport.js

const path = require("path");
const parseCSV = require("./helpers/parseCSV");
const {
  connectToMongo,
  disconnectFromMongo,
  bulkImportToMongo
} = require("./helpers/mongodb");

const filePath = path.join(__dirname, "../../data/parts.csv");
const options = {
  delimiter: ";",
  noheader: true,
  headers: [
    "facility",
    "partNumber",
    "partName",
    "partDescription",
    "netWeight",
    "customsTariff"
  ]
};

connectToMongo("autoMDM");
parseCSV(filePath, options)
  .then(records => {
    console.time("Time to import parsed objects to db");
    return bulkImportToMongo(records, "parts.js");
  })
  /*   .then(result =>
    console.log("Total batches inserted: ", result, result.length)
  ) */
  .then(() => {
    disconnectFromMongo();
    console.timeEnd("Time to import parsed objects to db");
  })
  .catch(error => console.log(error));