I am writing a node.js application which scrapes a web page to get a bunch of anchor tags (links) (close to 50-60).
Then it goes through each of those links, scrapes that page and within the page it looks for a matching keyword inside a div. If that keyword is found then it is pushes the link and page name in an array which then has to be displayed to the user.
router.post('/', function (req, res) {
var medicineObjArray = [];
rp({
uri: 'http://www.mymainurl.com/index',
transform: function (body) {
return cheerio.load(body);
}
}).then(function ($) {
$('.remedy_list a').each((index, elem) => {
var text = $(elem).text();
var link = $(elem).attr('href');
if (text != '' && link != undefined) {
medicineObjArray.push({
pageName: text,
link: link
});
}
});
var promises = medicineObjArray.map(function(item, index){
rp(
{
uri : item.link,
method: 'GET',
transform: function(body){
return cheerio.load(body);
}
}
).then(function($) {
if ($('.content:contains("paralysis")').length > 0) {
var med = {
medicine: item.medicine,
link: item.link
};
return med;
} else {
return;
}
}).catch(function(err){
console.log('--------- ERROR getting the page data: ' + err);
});
});
return Promise.all(promises).then((data) => {
console.log('-- promised data: ' + data[0]); //is called even before the request is made to an individual link
console.log('-- promised data: ' + data[1]);
console.log('-- promised data: ' + data[2]);
});
}).catch(function (err) {
console.log('--------- ERROR Cheerio chocked');
});
});
However I get an array of null on my page. How do I wait till all the links are done processing and show the search result links only?