0

I'm pulling text from N URLs. First I collect the N URLs in linksOnPage, and then I run a doOnPage function to get the text from each URL. When I run the code, only 1 of the N URLs gets processed through the function. I assume it's because the processing function runs asynchronously. How do I stack these up in a queue and run them all, or what's a better way to do this?

Here's the main JS code:

// Scraping library and the local ./db module used for persistence.
var nodeio = require('node.io');
var db = require('./db');
// Declared here but not assigned anywhere in the visible code.
var linksOnPage, lyricsFromLink;

// Runs once at module load.
db.loadDB();

/**
 * Build a song record from the given fields and save it through db.Song.
 *
 * @param {*} artist - stored under the `artist` key.
 * @param {*} title - stored under the `title` key.
 * @param {*} lyrics - stored under the `lyrics` key.
 * @throws Rethrows, from inside the save callback, any error that save reports.
 */
var loadSong = function(artist, title, lyrics) {
    console.log("loadSong being called");

    // store the lyrics in a mongo table
    var songRecord = new db.Song({
        artist: artist,
        title: title,
        lyrics: lyrics
    });

    songRecord.save(function(saveError) {
        if (saveError) {
            throw saveError;
        }
        console.log("saved with no errors!");
    });
};
// generic utility for getting links on a page and running a function on each one
exports.linksOnPage = function(pageObj, linkSelector, doOnPage, contentSelector) {
    nodeio.scrape(function(){
        this.getHtml(pageObj.pageUrl, function(err, $) {
            var links = [];
            var i = 0;
            $(linkSelector).each(function(link) {
                var fullLink = pageObj.rootUrl + link.attribs.href
                links.push(fullLink);
                //run a function on each link
                console.log('getting lyrics for song: ', i);
                doOnPage(pageObj.artist, fullLink, contentSelector);
                i = i+1;
            });
            //this.emit(links);
        });
    });
}

// get the lyrics for a specific song 
exports.lyricsFromLink = function(artist, pageUrl, lyricsSelector) {
    nodeio.scrape(function(){
        this.getHtml(pageUrl, function(err, $) {
            var lyrics = "";
            console.log('before each statement');
            $(lyricsSelector).each(function(lyricParagraph) {
                lyrics = lyrics + " " + lyricParagraph.text;
            });
            console.log('after each statement');
            loadSong(artist, pageUrl, lyrics);
            this.emit(lyrics)
        });
    });
}
algorithmicCoder
  • 6,595
  • 20
  • 68
  • 117
  • Only one url gets processed in total or at max one url is processed at any time (all get done)? – user568109 Oct 17 '13 at 03:35
  • @algorithmicCoder Perhaps `doOnPage` simply throws an exception or the `linkSelector` is wrong? Even if `doOnPage` was doing asynchronous work there's no reason why the loop would stop. – plalx Oct 17 '13 at 03:43
  • Yeah, I realize that's a bit weird. Perhaps the this.emit() call is somehow interfering with the other calls? linkSelector is definitely working fine. – algorithmicCoder Oct 17 '13 at 03:46
  • Can you show your `doOnPage`. Does it process any particular url every time (first/last) of the url set. – user568109 Oct 17 '13 at 07:00
  • lyricsFromLink is what's passed in as doOnPage. I think the culprit is the this.emit() call, which is required for the process to terminate, so I don't think I can just comment it out. :( – algorithmicCoder Oct 17 '13 at 08:51

0 Answers0