0

So I tested my scraping on a static HTML file before adding it to my Node app.

The problem is that it's not returning all the rows.

On the site:

$('#sport tr').length
//Returns 13

In Cheerio:

 $('#sport tr').length
    //Returns 2

I'm stumped. Here is the code I'm using. I've included the URL, so you can visit the page yourself if you wish.

I suspect it has something to do with var $ = cheerio.load(html); however, I'm not experienced enough with Cheerio to say outright that that's the problem.

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

// GET /scrape
// Fetches the OLBG football page, reads the rows of table#sport into an array
// of { Event, TopSelection, BookieOdds, OLBGRating } objects, writes that array
// to output.json and sends it back as the response body.
app.get('/scrape', function(req, res){

    var url = 'http://www.olbg.com/football.php';

    request(url, function(error, response, html){
        // A failed fetch is reported to the client instead of being answered
        // with an empty (and misleadingly "successful") result.
        if (error) {
            console.error('Request to ' + url + ' failed:', error);
            res.status(502).send({ error: 'Failed to fetch ' + url });
            return;
        }

        var $ = cheerio.load(html);
        var json = [];

        // Diagnostic: number of rows the parser found inside table#sport.
        console.log($('#sport tr').length);

        $('#sport tr').each(function(i, tr) {
            var cells = $(tr).find('td');

            // Header rows contain only <th> cells, so there is nothing to read.
            // (A <tr> can never match 'th', so test for data cells instead.)
            if (cells.length === 0) {
                return;
            }

            var temp = {};
            temp["Event"] = cells.eq(0).text().trim();
            temp["TopSelection"] = cells.eq(1).text().trim();
            temp["BookieOdds"] = cells.eq(2).text().trim();
            // The rating is taken as the number of <img> elements in the fourth cell.
            temp["OLBGRating"] = cells.eq(3).find('img').length;

            // Keep the row only if it carries an event or a selection.
            if (temp["Event"] !== "" || temp["TopSelection"] !== ""){
                json.push(temp);
            }
        });

        // Persist the result with the built-in 'fs' library.
        // Parameter 1 : output.json - the name of the file to create
        // Parameter 2 : JSON.stringify(json) - the scraped rows serialised as JSON
        // Parameter 3 : callback reporting whether the write succeeded
        fs.writeFile('output.json', JSON.stringify(json), function(err){
            if (err) {
                console.error('Could not write output.json:', err);
                return;
            }
            console.log('File successfully written!');
        });

        // Send the scraped rows to the client; this does not wait for the file write.
        res.send(json);
    });
});
// Start the HTTP server on port 8081 and print a start-up message.
app.listen("8081");
console.log("Magic happens on port 8081");

// Publish the Express app as this module's export, and point the local
// `exports` alias at the same object.
module.exports = app;
exports = module.exports;
ZeroBased_IX
  • 2,667
  • 2
  • 25
  • 46

1 Answers1

1

You're not getting the expected result because the (table) HTML on that page is mangled. If you look at the second <td> in the second <tr> of the table#sport, you'll see an "extra" </td>. This causes the <td> that the table#sport is inside to close (and an implicit closing of table#sport) on some parsers because that is the closest open <td>. So that is why the parser reports only 2 <tr>s instead of 13. The other <tr>s you're expecting are now outside of table#sport.

Probably your best bet is to pass the html through an HTML tidying program/script (e.g. this one with the clean option enabled) first before passing it to cheerio. After that, your selector should return the elements you're probably expecting.

mscdex
  • 104,356
  • 15
  • 192
  • 153
  • Can you please explain in what way the HTML is mangled? I built something similar in .NET without issues using HtmlAgilityPack but wanted to work in Node. – ZeroBased_IX Sep 06 '15 at 16:34
  • Thanks! I used https://www.npmjs.com/package/htmltidy (which is a wrapper of the one you suggested) and it worked perfectly. – ZeroBased_IX Sep 06 '15 at 18:02