My task is to parse a large JSON file (about 1 million URLs) and download the images to a local server, but while downloading we are losing a large number of records (around 50k) that are not captured in the log files either.
We split the large JSON file into several smaller JSON files. We wrote a program in Node.js that reads the image URLs from the JSON and downloads the images to the local server, but in this process we lose a large amount of data that is neither downloaded nor written to the log files.
What modifications need to be made to this code?
My JSON
[{
    "url": "http://example1.com",
    "id": "jweewewii833388efje.jpg",
    "class": 2
},
{
    "url": "http://example2.com",
    "id": "987c2732561cb6bbc151cb6e184d019b.jpg",
    "class": 24
},
{
    "url": "http://example3.com",
    "id": "141cbe32ed1f0071431677a2c7754a70.jpg",
    "class": 22
},
{
    "url": "http://example4.com",
    "id": "291a09e237b43d9fa1bf64481a679eaf.png",
    "class": 1
}]
Node.js code
var fs = require('fs'),
    JSONStream = require('JSONStream'),
    es = require('event-stream');
const download = require('image-downloader');
var util = require('util');

var log_file = fs.createWriteStream('logfiles/log13.log', { flags: 'w' });

var getStream = function () {
    var jsonData = 'train.json',
        stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
        parser = JSONStream.parse('*'); // emit one event per element of the top-level array
    return stream.pipe(parser);
};

getStream()
    .pipe(es.mapSync(function (data) {
        const options = {
            url: data.url,
            dest: './uploads/' + data.id // save as ./uploads/<id>
        };
        download.image(options)
            .then(({ filename }) => {
                console.log('Saved to', filename);
            })
            .catch((err) => {
                // write the failed URL and id to the log file
                log_file.write(util.format('Error URL:: %s ID:: %s :: %s',
                    data.url, data.id, err) + '\n');
            });
    }));
I want to download the complete data set without losing anything: any URL that errors should be written to the log file. For example, if I process 10k records from the JSON file and around 8k images download successfully, the other ~2k should all end up in the log file.
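For reference, here is a minimal, untested sketch of the direction I am considering. My assumption is that es.mapSync returns before download.image settles, so an unbounded number of downloads run at once and some records are dropped before they are ever saved or logged. The sketch swaps in event-stream's asynchronous es.map, whose callback provides flow control so each record is fully handled (saved or logged) before the next one is read; the saved/failed counters are my own addition for checking that nothing is silently dropped.

var saved = 0, failed = 0; // counters for verifying that every record is accounted for

getStream()
    .pipe(es.map(function (data, callback) {
        const options = {
            url: data.url,
            dest: './uploads/' + data.id
        };
        download.image(options)
            .then(({ filename }) => {
                saved++;
                console.log('Saved to', filename);
                callback(null, data); // signal completion so the stream emits the next record
            })
            .catch((err) => {
                failed++;
                // the failure is logged before the stream moves on
                log_file.write(util.format('Error URL:: %s ID:: %s :: %s',
                    data.url, data.id, err) + '\n');
                callback(null, data); // swallow the error so processing continues
            });
    }))
    .on('end', function () {
        // saved + failed should equal the number of records in train.json
        console.log('Done. Saved:', saved, 'Failed:', failed);
    });

Would something along these lines account for all records, or is there a better pattern for this volume of downloads?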