0

I'm using Cheerio's `each` function to parse some URLs and save all the data into MongoDB. My problem is that Cheerio's `each` function is synchronous, and I don't know when the parsing has ended so that I can start doing something else. So how can I make these functions asynchronous?

// Fetch the listing page and hand the link of every <article> to parse_url().
request(URL, function (error, response, html) {
  // Report failures instead of silently doing nothing.
  if (error || response.statusCode !== 200) {
    console.log(error || 'Unexpected status code: ' + response.statusCode);
    return;
  }

  var $ = cheerio.load(html);

  $('article').each(function (i, element) {
    // Resolve the link inside the current article. Looking it up once, outside
    // the loop, gave every iteration the same value, and `this` out there is
    // not an article element.
    var posturl = $('a', element).attr('href');

    // An article without a link has nothing to fetch; move on to the next one.
    if (!posturl) {
      return;
    }

    parse_url(posturl, i);
  });
});

Here is my `parse_url` function:

/**
 * Fetch one post page, scrape its fields with cheerio and insert them into
 * `collection`.
 *
 * Failures (request error, non-200 status, article without an id, insert
 * error) are logged with console.log; nothing is returned or thrown.
 *
 * @param {string} url - Address of the post page to fetch.
 * @param {number} i - Index of the post in the listing; currently unused.
 */
function parse_url(url, i) {
  request(url, function (error, response, html) {
    if (error || response.statusCode !== 200) {
      console.log(error || 'Unexpected status code ' + response.statusCode + ' for ' + url);
      return;
    }

    var $ = cheerio.load(html);
    var articleId = $('article').attr('id');

    // Without this guard a page lacking <article id="..."> would throw inside
    // the request callback when substring() is called on undefined.
    if (typeof articleId !== 'string') {
      console.log('No article id found at ' + url);
      return;
    }

    var json = {
      title: $('article h1').text(),
      // The first five characters of the id attribute are dropped.
      postid: articleId.substring(5),
      image: $('article img').attr('src'),
      // NOTE(review): the key is spelled "decription"; kept as-is because it is
      // the stored field name. Confirm before renaming.
      decription: $('article p strong').text()
    };

    collection.insert(json, function (err, result) {
      if (err) {
        console.log(err);
      }
    });
  });
}
Stranger B.
  • 9,004
  • 21
  • 71
  • 108

1 Answers1

2

Install the async-foreach package with `npm install async-foreach --save`. In your first request, change your `$('article').each` call to:

var forEach = require('async-foreach').forEach;

// Fetch the listing page, process its articles one after another and log a
// message once the last one has been handled.
request(URL, function (error, response, html) {
  // Report failures instead of silently doing nothing.
  if (error || response.statusCode !== 200) {
    console.log(error || 'Unexpected status code: ' + response.statusCode);
    return;
  }

  var $ = cheerio.load(html);

  forEach($('article').get(), function (article, index) {
    // parse_url expects a URL, not the DOM element, so resolve the link of the
    // current article first.
    var posturl = $('a', article).attr('href');

    // Returning without calling this.async() keeps this iteration synchronous,
    // so the loop simply continues with the next article.
    if (!posturl) {
      return;
    }

    // this.async() pauses the loop until the returned function is called.
    // parse_url must invoke its third argument when it has finished; if it
    // never does, the loop stays stuck on this item.
    var done = this.async();

    parse_url(posturl, index, function () {
      // Call done() without arguments: passing `false` would abort the loop.
      done();
    });
  }, function () {
    // Third argument of forEach: runs after the last iteration has completed.
    console.log('All articles have been processed');
  });
});

Now you still have to make your `parse_url` function asynchronous as well, because it is currently blocking. To do that in Node, you use `process.nextTick(fn)`, which is the equivalent of the browser's `setTimeout(fn, 0)` but much more efficient.

/**
 * Fetch one post page, scrape its fields with cheerio and insert them into
 * `collection`, then report the outcome through `callback`.
 *
 * @param {string} url - Address of the post page to fetch.
 * @param {number} i - Index of the post in the listing; currently unused.
 * @param {function} [callback] - Optional. Called as callback(err, result)
 *   once the document has been inserted, or as callback(err) when the
 *   request, the scrape or the insert failed. Callers that omit it get the
 *   previous fire-and-forget behaviour.
 */
function parse_url(url, i, callback) {
  var done = typeof callback === 'function' ? callback : function () {};

  // request() already delivers its result through a callback; nextTick only
  // defers the start of the request and is kept to preserve the call timing.
  process.nextTick(function () {
    request(url, function (error, response, html) {
      if (error) {
        console.log(error);
        done(error);
        return;
      }

      if (response.statusCode !== 200) {
        var statusError = new Error('Unexpected status code ' + response.statusCode + ' for ' + url);
        console.log(statusError);
        done(statusError);
        return;
      }

      var $ = cheerio.load(html);
      var articleId = $('article').attr('id');

      // Without this guard a page lacking <article id="..."> would throw
      // inside the request callback when substring() is called on undefined.
      if (typeof articleId !== 'string') {
        var idError = new Error('No article id found at ' + url);
        console.log(idError);
        done(idError);
        return;
      }

      var json = {
        title: $('article h1').text(),
        // The first five characters of the id attribute are dropped.
        postid: articleId.substring(5),
        image: $('article img').attr('src'),
        // NOTE(review): the key is spelled "decription"; kept as-is because it
        // is the stored field name. Confirm before renaming.
        decription: $('article p strong').text()
      };

      collection.insert(json, function (err, result) {
        if (err) {
          console.log(err);
        }
        // Signal completion in both cases so a waiting caller can continue.
        done(err, result);
      });
    });
  });
}

Hope this solves your problem!

Sven
  • 5,155
  • 29
  • 53
  • I want to show a console log message when everything is done. Where should I write that? – Stranger B. Oct 15 '15 at 13:01
  • the `forEach` function takes 3 arguments. 1st is the array to iterate over, 2nd is the function to execute on each item and the last one is the done function, put your console.log in there. – Sven Oct 15 '15 at 18:01
  • This is not working! When I use `this.async();`, the forEach function blocks on the first item, and when I delete it the console log does not appear at the right time. – Stranger B. Oct 15 '15 at 18:36