0

This is a web scraping code written in node js.
Will this code always keep 5 concurrent request when queue has enough urls?
Why the console shows otherwise?

var request = require("request");
var cheerio = require("cheerio");
var fs = require('fs');

var concurrent_requests = 0;
var queue = [];
var baseUrl = "https://angularjs.org/";

function makeApiCall(url){
    if(url) {
        queue.unshift(url);
    }
    if(concurrent_requests<5) {
        var nextUrl = queue.pop();
        if(nextUrl) {
            concurrent_requests++;
            request(nextUrl, function (error, response, body) {
                var invalidUrl;
                concurrent_requests--;
                if(body) {
                    var $ = cheerio.load(body);
                    var anchors = $("a");
                    var data = "";
                    for (var i = 0; i < anchors.length; i++) {
                        url = $(anchors[i]).attr("href");
                        if(!url || url === "#" || url === "javascript:void(0)"){
                            invalidUrl = true;
                        }
                        else{
                             invalidUrl = false;
                        }

                        if (!invalidUrl) {
                            makeApiCall(url);
                            data += url + ", " + nextUrl + "\n";
                        }
                    }
                    //console.log(data);
                    fs.appendFile('urls.csv',data, function (err) {
                        if (err) throw err;
                    });
                }
                else{
                    makeApiCall();
                }
            });
        }
    }
     console.log(concurrent_requests);

}


makeApiCall(baseUrl);
rishabh dev
  • 1,711
  • 1
  • 15
  • 26

2 Answers2

1

Becoz, you have condition that states not to request more than 5 with an if statement.

if(concurrent_requests<5) {

This solution is not scalable as will go over the stack after certain recursive calls.

Hope it helps.

Kannaiyan
  • 12,554
  • 3
  • 44
  • 83
1

You are using if condition to check if the count of concurrent requests are less then five or not. But remember it is if statement, not loop. That means it will be called only once.

You are making a recursive call to your function makeApiCall inside the callback of the request. The callback of the request only runs when the request is fulfilled.

With above two points in mind, in your if condition you check if concurrent_requests<5 then you call request method, and your program goes ideal. After sometime when the request id fulfilled, the callback of request runs, which after some logic calls the makeApiCall again. So in every call you are calling request only once and then wait for that to resolve and then only your program proceed for next request.

If you want concurrent request then use a loop like this

function makeApiCall(url){
    if(url) {
        queue.unshift(url);
    }
    // Use a loop here
    while(concurrent_requests<5) {
        var nextUrl = queue.pop();
        if(nextUrl) {
            concurrent_requests++;
            request(nextUrl, function (error, response, body) {
                var invalidUrl;
                concurrent_requests--;
                if(body) {
                        ...
                        if (!invalidUrl) {
                            makeApiCall(url);
                            data += url + ", " + nextUrl + "\n";
                        }
                    }
                    ...
                }
                else{
                    makeApiCall();
                }
            });
        }
        else{
           // Remember to break out of loop when queue is empty to avoid infinite loop.
           break;
        }
    }
     console.log(concurrent_requests);

}
Prakash Sharma
  • 15,542
  • 6
  • 30
  • 37