1

I am working on a social network graph, where I want to build a "six degrees of separation" tree based on adjacency lists that I get from an API.

For each person, the API will return an array of friends in the form [id1, id2, id3...] which is exactly what I want. But the catch is that there are a huge number of people, and the API only allows 400 calls/15 minutes. I can save the data in a local db, but I don't want to flood the API with requests.

The pseudocode for what I am doing is something like this:

requestCharacter = function(id) {
    is this person in my db already? if true, return;
    else make api call(error, function(){loopFriends(character)}) {
       save character in database
    }
}

loopFriends(character){
   foreach(friend in character.friends) requestCharacter(friend);
}

And I've coded that, more or less, and it works ok, but since it keeps traversing the trees, and since people recur in one another's friend lists, it is grossly inefficient, and keeps busting the API limits

So what I want to do instead is to queue up the requests, check that something isn't in the queue yet before I add it, and run the queue in batches of 400 or fewer requests at a time. (So if the queue has 1200 in it it would run 400, wait 15 mins, run 400, wait 15 mins, run 400...)

I tried using async.js with its queue, and I was able to load a ton into the queue, but I don't think it ever actually ran. What's the best approach for a situation like this?

My actual non-queued code follows:

// Look a character up in the local db. If it is already cached we skip the
// API entirely and just walk its friends; otherwise we fetch and insert it
// first, walking friends from the insert callback.
var lookupAndInsertCharacter = function(id){
  Character.findOne({ 'id': id }, function (err, person) {
    if (err) {
      console.log(err);
      return;
    }
    if (person != null) {
      console.log('%s already exists in database, not saved', person.name);
      getCharacterFriends(id);
      return;
    }
    insertCharacter(id, function(){
      getCharacterFriends(id);
    });
  });
}

// Fetch one character from the remote API, persist it in the local db, and
// invoke `callback` once the save has completed.
//
// Fixes two defects in the original:
//  1. `callback` was declared but never called, so the continuation passed by
//     lookupAndInsertCharacter (the friend traversal) never ran for newly
//     inserted characters.
//  2. "Saved character" was logged before the async save finished; the log
//     (and now the callback) belong inside the save callback.
var insertCharacter = function(id, callback){
  var url = getCharacterURL(id);
  request(url, function (error, response, body) {
    if (error || response.statusCode != 200) {
      console.log(error);
      return;
    }
    var result = JSON.parse(body);
    if(result.status_code != 1 ) {
      // Non-1 status_code is the API telling us to back off.
      console.log("ERROR status_code: %s. Please wait 15 minutes", result.status_code);
      return;
    }
    var me = new Character(processCharacter(result));
    me.save(function(err){
      if (err) return handleError(err);
      console.log("Saved character "+me.name);
      if (callback) callback();
    });
  });
}

// Walk the friends and enemies lists of a character already saved in the db,
// scheduling a lookup/insert for each related id.
//
// Fix: the original dereferenced `person.name` without checking for null, so
// a missing character (save raced, failed, or was throttled) crashed with a
// TypeError instead of being reported.
var getCharacterFriends = function(id) {
  Character.findOne({ 'id': id }, function (err, person) {
    if (err) {
      console.log(err);
      return;
    }
    if (person == null) {
      console.log('Character %s not found in database, cannot get friends', id);
      return;
    }
    console.log("Getting friends for %s",person.name);
    _.each(person.character_friends, function(d){
      lookupAndInsertCharacter(d);
    });
    console.log("Getting enemies for %s",person.name);
    _.each(person.character_enemies, function(d){
      lookupAndInsertCharacter(d);
    });
  });
}
Charles Bandes
  • 795
  • 8
  • 21

2 Answers

1

What ended up working for me was rate limiting the API call. I used

https://github.com/wankdanker/node-function-rate-limit

And then I made a limited version of insertCharacter:

// function-rate-limit queues every call and releases at most 400 of them per
// 900000 ms (15 minute) window, matching the API's quota.
var rateLimit = require('function-rate-limit');

// Throttled wrapper around insertCharacter; call this instead of the raw one.
var insertLimited = rateLimit(400, 900000, function (characterId) {
  insertCharacter(characterId);
});
Charles Bandes
  • 795
  • 8
  • 21
0

In the below example I get all my groups on FaceBook, the posts on it and the public profile of their authors.

To slow this process down, I created a limited pool of 'scrapers' and retain each scraper for a certain time, so I "cannot overload the FaceBook server :)"

For the above example, you could

  • limit your pool size to 400 (`max : 400`) and retain your scrapers for 15 minutes: `setTimeout(function(){pool.release(scraper);}, 15*60*1000);`
  • or limit your pool size to 1 (`max : 1`) and retain each scraper for 2.25 seconds (900000 ms / 400 calls): `setTimeout(function(){pool.release(scraper);}, 2250);`

Here comes the code

// Persistence stub: swap the body for your own database write. The scraping
// example below calls this for groups, posts, and author profiles alike.
function saveData (anyJson) {
    // put your Db communication here.
    // console.log(anyJson);
}

// Timestamp helper for log lines, formatted HH:MM:SS.mmm.
//
// Fixes: the original assigned `instant` without `var` (an implicit global),
// and emitted unpadded fields ("1:5:3.7"), which makes interleaved log lines
// hard to compare. Fields are now zero-padded to fixed width.
function now() {
    // Left-pad a numeric field with zeros to the requested width.
    function pad(value, width) {
        var s = String(value);
        while (s.length < width) s = '0' + s;
        return s;
    }
    var instant = new Date();
    return pad(instant.getHours(), 2) + ':' + pad(instant.getMinutes(), 2) +
           ':' + pad(instant.getSeconds(), 2) + '.' + pad(instant.getMilliseconds(), 3);
}
// Facebook Graph API client; the access token is passed as the first CLI arg.
var graph = require('fbgraph');
console.log(process.argv[2]);
graph.setAccessToken(process.argv[2]);

// A generic-pool of lightweight "scraper" tokens. The pool itself does no
// work: holding a scraper for a while after each request (see pooledGraphGet)
// is what rate-limits the calls, since at most `max` requests can be in
// flight or cooling down at once.
var poolModule = require('generic-pool');
var pool = poolModule.Pool({
    name     : 'scraper',
    create   : function(callback) {
        console.log(now() +' created scraper');
        // parameter order: err, resource
        callback(null, {created:now()});
    },
    destroy  : function(scraper) { 
        console.log(now() +' released scraper created '+ scraper.created); 
    },
    max      : 10,   // at most 10 concurrent/cooling-down requests
    min      : 1, 
    idleTimeoutMillis : 60*60*1000,  // keep idle scrapers alive for an hour
    log : false
});

// Acquire a scraper from the pool (waiting while the pool is exhausted),
// perform one Graph API GET on `path`, then hand the result to `analyse`.
// A successful request holds its scraper for 60s before releasing it, which
// is what throttles the overall request rate.
//
// Fix: the original threw on a failed graph.get without releasing the
// scraper, leaking one pool slot per failure until the pool starved and all
// further requests hung.
function pooledGraphGet(path,analyse) {
    pool.acquire(function(err,scraper) {
        if (err) {
            console.log(now() +' Could not get a scraper for '+ path);
            throw err;
        }
        graph.get(path,function(err,res) {
            if (err) {
                console.log(now() +' Could not get '+ path +' using scraper created '+ scraper.created);
                // Return the scraper before bailing out so the slot is reusable.
                pool.release(scraper);
                throw err;
            } else {
                console.log(now() +' Got '+ path +' using scraper created '+ scraper.created);
                setTimeout(function(){pool.release(scraper);}, 60*1000);
                analyse(res);
            }
        });
    });
}

// Crawl: my groups -> each group's feed -> each post author's public profile,
// persisting every object along the way through the rate-limited getter.
pooledGraphGet('me?fields=friends,groups', function (profile) {
    profile.groups.data.forEach(function (group) {
        saveData (group);
        pooledGraphGet(group.id +'?fields=id,name,members,feed', function (groupDetail) {
            // Some groups expose no feed; skip those.
            if (!groupDetail.feed) return;
            groupDetail.feed.data.forEach(function (post) {
                saveData (post);
                pooledGraphGet(post.from.id +'?fields=id,name', function (author) {
                    saveData (author);
                });
            });
        });
    });
});
Dirk Horsten
  • 3,753
  • 4
  • 20
  • 37
  • Please forgive my utter noob-ness to node... If I were to use the generic-pool idea, would I put the API call function inside the pool as the 'client'? And is the 'wait a minute before releasing it' done by setting the idleTimeout to a minute? – Charles Bandes Jan 03 '15 at 17:54