
I need to extract the links from a URL in a loop; basically, I need to run the function again for each extracted link, but I don't know how to do this with Node.js.

var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

request(url, function (err, resp, body) {
    if (err) return console.error(err);
    var $ = cheerio.load(body);
    var links = $('a'); // all anchor elements on the page
    links.each(function (i, link) {
        console.log(url + $(link).attr('href'));
    });
});

My question is how to extract the links from this collection. The code above works correctly (it prints the links to the console), but I also need to scrape the pages behind those links.

The end result should be scraping the page behind each of those URLs.
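
For reference, a minimal sketch of the nested request this would need (the scrapeLink helper and the $('title') selector are illustrative assumptions, not part of the original code):

var request = require('request');
var cheerio = require('cheerio');

var url = 'http://mismarcadores.com/baloncesto';

request(url, function (err, resp, body) {
    if (err) return console.error(err);
    var $ = cheerio.load(body);
    $('a').each(function (i, link) {
        // same concatenation as in the question
        scrapeLink(url + $(link).attr('href'));
    });
});

// Hypothetical helper: request one of the extracted links and scrape that page
function scrapeLink(link) {
    request(link, function (err, resp, body) {
        if (err) return console.error(err);
        var $ = cheerio.load(body);
        console.log(link, '->', $('title').text()); // placeholder extraction
    });
}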

– kikes
  • You haven't asked a question, and you haven't said what's wrong with the code. Please see [How to Ask](https://stackoverflow.com/help/how-to-ask). – Patrick Roberts Feb 06 '19 at 23:05
  • There are a number of questions about scraping sites using these technologies on Stack Overflow; [Scraping links from website using Node.js, request, and cheerio?](https://stackoverflow.com/q/35304259/215552) for example. Perhaps you can expand on what you're not understanding? – Heretic Monkey Feb 06 '19 at 23:27

3 Answers

var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

request(url, function (err, resp, body) {
    if (err) return console.error(err);
    var $ = cheerio.load(body);
    var allLinks = [];
    var links = $('a');
    links.each(function (i, link) {
        var currentLink = url + $(link).attr('href');
        console.log(currentLink);
        allLinks.push(currentLink);
        // once the last link has been collected, hand the array off
        if (i == links.length - 1) {
            useLinks(allLinks);
        }
    });
});

function useLinks(allLinks) {
    console.log(allLinks);
}

If you're asking how to extract the URLs from the links cheerio returns, you're already doing it. If you'd like to use them elsewhere after the request has finished (e.g. for scraping again), store them in an array and call a function with that array once you've iterated through the last link.
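
For example, useLinks could issue a second request for each collected link (a sketch; the $('title') selector is just a placeholder for whatever you actually want to extract):

function useLinks(allLinks) {
    allLinks.forEach(function (link) {
        request(link, function (err, resp, body) {
            if (err) return console.error(err);
            var $ = cheerio.load(body);
            console.log(link, '->', $('title').text());
        });
    });
}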

– cb64

It should look something like this:

let links = $('a').get().map(a => $(a).attr('href'))
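
Note that some anchors have no href attribute at all, and most hrefs on that page are relative, so you may want to filter and resolve them before requesting (a sketch using the WHATWG URL class built into modern Node.js; the base URL comes from the question):

let absolute = links
    .filter(href => href) // drop anchors without an href attribute
    .map(href => new URL(href, 'http://mismarcadores.com/').href);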
– pguardiario

I'm sharing my solution. It is similar to the one in the question, but with a few changes: I don't extract all the links, only the ones matching the search term I pass in the URL.

var express = require('express');
var fs      = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var app = express();
var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

app.get('/webscrape', function (req, res) {
    request(url, function (err, resp, body) {
        var array2 = [];
        var array3 = [];
        var $ = cheerio.load(body);
        var links = $('a'); // all hyperlinks on the page
        links.each(function (i, link) {
            var href = $(link).attr('href');
            // keep only links that contain the search term
            if (href && href.includes('baloncesto')) {
                array2.push(href);
            }
        });
        var uniqueLinks = new Set(array2); // de-duplicate
        uniqueLinks.forEach(function (d) {
            array3.push(d);
        });
        fs.writeFile('raaga_output.json', JSON.stringify(array3, null, 4), function (err) {
            console.log('File successfully written! - Check your project directory for the raaga_output.json file');
        });
        res.send('File successfully written! - Check your project directory for the raaga_output.json file');
    });
});

app.listen(3000);
console.log('Web Scrape happens on port 3000');

exports = module.exports = app;

Anyone should be able to use this without any problems.
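
If the goal is still to scrape each saved link afterwards, a possible follow-up step could read the file this endpoint writes and request each entry (a sketch; it assumes the stored hrefs are site-relative paths, which is what the includes('baloncesto') filter above collects):

var fs = require('fs');
var request = require('request');

var saved = JSON.parse(fs.readFileSync('raaga_output.json', 'utf8'));
saved.forEach(function (path) {
    // assumption: each entry is a relative path such as /baloncesto/...
    request('http://mismarcadores.com' + path, function (err, resp, body) {
        if (err) return console.error(err);
        console.log('fetched', path, '-', body.length, 'bytes');
    });
});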

– kikes