1

I am writing a node.js application which scrapes a web page to get a bunch of anchor tags (links) (close to 50-60).

Then it goes through each of those links, scrapes that page and within the page it looks for a matching keyword inside a div. If that keyword is found then it is pushes the link and page name in an array which then has to be displayed to the user.

    router.post('/', function (req, res) {
    var medicineObjArray = [];
    rp({
        uri: 'http://www.mymainurl.com/index',
        transform: function (body) {
            return cheerio.load(body);
        }
    }).then(function ($) {
        $('.remedy_list a').each((index, elem) => {
            var text = $(elem).text();
            var link = $(elem).attr('href');
            if (text != '' && link != undefined) {
                medicineObjArray.push({
                    pageName: text,
                    link: link
                });
            }
        });

        var promises = medicineObjArray.map(function(item, index){
            rp(
                {
                    uri : item.link,
                    method: 'GET',
                    transform: function(body){
                        return cheerio.load(body);
                    }
                }
            ).then(function($) {
                if ($('.content:contains("paralysis")').length > 0) {
                    var med = {
                        medicine: item.medicine,
                        link: item.link
                    };
                    return med;
                } else {
                    return;
                }
            }).catch(function(err){
                console.log('--------- ERROR getting the page data: ' + err);
            });
        });
    
        return Promise.all(promises).then((data) => {
          console.log('-- promised data: ' + data[0]); //is called even before the request is made to an individual link
          console.log('-- promised data: ' + data[1]);
          console.log('-- promised data: ' + data[2]);
        });

    }).catch(function (err) {
        console.log('--------- ERROR Cheerio chocked');
    });
});

However I get an array of null on my page. How do I wait till all the links are done processing and show the search result links only?

codeinprogress
  • 3,193
  • 7
  • 43
  • 69
  • No, it didn't work. In fact now its giving me ERROR getting the page dataRequestError: Error: connect ETIMEDOUT 84.254.113.210:80 – codeinprogress Aug 12 '18 at 12:16
  • Probably because 60 connections at the same time are to much for your bandwith ... – Jonas Wilms Aug 12 '18 at 12:17
  • Thats weird, it was working a few minutes ago. It didn't print the json array on page but at least it was making the call and getting me the correctly matched pages. – codeinprogress Aug 12 '18 at 12:21

0 Answers0