0

I am writing a node script to bulk download files. In the example these are images from a file where each line has a filename and a URL. I would like this script to be scaleable up to millions of URLs to download.

Node JS streams seem to be a good way to do this as we can pipe the URL list in, http request the URL, and write the response to file.

This script is crashing my computer, and writing lost of blank jpg files. The pipe() methods do not seem to be handling the back pressure; it seems the script is requesting all the URLs in parallel immediately. How can I make this limit the number of simultaneous HTTP requests so that it can run consistently when scaling for downloading lots of URLs? Thanks.

'use strict';
var fs      = require('fs'),
    request = require('request'),
    through = require('through'),
    split   = require('split'),
    urlList = 'https://gist.githubusercontent.com/phelma/e1558aeb181c0cfe47b8/raw/cc5e667277308fda408f6af1404bc2d322b5186c/images.txt';
    // 10000 images

// Transform stream: turns each incoming row into an array of its
// tab-separated fields and queues that array for the next stage.
var splitByTab = through(function(row) {
    var line = row.toString();
    this.queue(line.split('\t'));
});

// Sink stream: for every [ filename , URL ] item it receives, issues an HTTP
// GET for the URL and pipes the response into out/<filename>.jpg.
// Note: a request is started as soon as its item arrives; nothing in this
// handler waits for earlier downloads to finish.
var downloadStream = through(function(item) {
    var url = item[1];

    // Items whose URL field is missing or empty are skipped.
    if (!url) {
        return;
    }

    console.log('Requesting ' + url);

    var download = request.get(url);
    download.on('error', function(err) {
        // Report the failure together with the URL that caused it.
        console.log('\nError: ' + err.message + '\n' + url);
    });

    var target = fs.createWriteStream(__dirname + '/out/' + item[0] + '.jpg');
    download.pipe(target);
});

// Wire the stages together: URL list -> rows -> [ filename , URL ] items -> downloads.
var listStream = request.get(urlList);    // Request the list of URLs
var rowStream = listStream.pipe(split()); // Split the file into rows
var itemStream = rowStream.pipe(splitByTab); // Split each row into an array of items
itemStream.pipe(downloadStream);          // Download each item
phelm
  • 71
  • 1
  • 4

1 Answers1

0

Flow control in through is done using pause/resume.

// requestLimit: maximum number of downloads allowed in flight at once.
// activeRequests: number of downloads currently in flight.
var requestLimit = 10, activeRequests = 0;

// Sink stream that downloads each [ filename , URL ] item to
// out/<filename>.jpg while keeping at most `requestLimit` downloads running.
// Flow control: the stream pauses itself once the limit is reached and
// resumes when a download completes or fails and frees its slot.
var downloadStream = through(function(item) {
    // item is array [ filename , URL ]
    if (!item[1]) {
        return;
    }

    // Inside the request/file event callbacks `this` is the emitter that
    // fired the event, not this through stream, so capture the stream here.
    var stream = this;
    var released = false;

    // Frees this download's slot exactly once, however the download ended
    // (finished, request error, or file error), and resumes the stream if
    // that brings the in-flight count back under the limit.
    function release() {
        if (released) {
            return;
        }
        released = true;
        if (--activeRequests < requestLimit) {
            stream.resume();
        }
    }

    // Claim a slot first, then pause as soon as the limit is reached so that
    // no further items are accepted until a slot is released.
    if (++activeRequests >= requestLimit) {
        stream.pause();
    }

    console.log('Requesting ' + item[1]);

    var file = fs.createWriteStream(__dirname + '/out/' + item[0] + '.jpg');
    file.on('error', function(err) {
        console.log('\nError: ' + err.message + '\n' + item[1]);
        release();
    });
    // Release the slot when the whole body has been written, not when the
    // response headers arrive; otherwise downloads still in progress would
    // not count against the limit.
    file.on('finish', release);

    request
        .get(item[1])
        .on('error', function(err) {
            console.log('\nError: ' + err.message + '\n' + item[1]);
            // pipe() does not end the destination when the source errors, so
            // close the file explicitly to avoid leaking the descriptor.
            file.end();
            release();
        })
        .pipe(file);
});
Ben Grimm
  • 4,316
  • 2
  • 15
  • 24