1

I'm using nodejs with horseman to scrape a list of (known to be valid) urls, performing the same scraping operation on all of them in sequence. I try to catch the errors, but sometimes some exception still slips through and hangs the script. I can run the same urls 'batch' successfully once and get errors another time.

So far I've tried: - catching errors with catch; - using the promiseRetry module to retry the promise - using opening and closing a new horseman instance for every scrape operation

// users: [ { href: '?id=abc123' }, ... ]
// queuePromise: function which returns a chain of promises from an array of functions with callbacks
// promiseRetry: used to retry a promise :3, see promiseRetry documentation

fetch = function(users, options){
  var baseUrl = 'https://www.somedomain.com/Users'
  var user_queue = []
  var items = []
  var ci = 0

  users.forEach(function (u){

    u.url = baseUrl + u.href
    user_queue.push(function (next){
      promiseRetry(options, function (retry, number) {

        var error = true
        var horse = new Horseman()

        horse
        .open(u.url)
        .catch(retry)
        .evaluate(scrapeitems)
        .then(function (_items){
          error = false
          console.log('Scraped user', ++ci, ':', u.url, ':\n\tfound ' + _items.length + ' items')
          if (_items && _items.length) {
            items = items.concat(_items)
          }
        })
        .finally(function () {
          horse.close()
          if (error && number < options.retries) {
            retry()
          } else {
            next()
          }
        })

      })
    })
  })

  return _t.queuePromise(user_queue).then(function (){ return items })
}

here is the uncaught error:

[...]

Scraped user 22: https://www.somedomain.com/Users?id=abc123 :
  found 1 items

Unhandled rejection Error: Retrying
    at createError (/Users/andrea/src/userscraper/node_modules/err-code/index.js:4:44)
    at /Users/andrea/src/userscraper/node_modules/promise-retry/index.js:34:27
    at Horseman.<anonymous> (/Users/andrea/src/userscraper/modules/fetcher.js:179:110)
    at PassThroughHandlerContext.finallyHandler (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/finally.js:56:23)
    at PassThroughHandlerContext.tryCatcher (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/util.js:16:23)
    at Promise._settlePromiseFromHandler (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:512:31)
    at Promise._settlePromise (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:569:18)
    at Promise._settlePromise0 (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:614:10)
    at Promise._settlePromises (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:689:18)
    at Async._drainQueue (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:133:16)
    at Async._drainQueues (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:143:10)
    at Immediate.Async.drainQueues (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:17:14)
    at runCallback (timers.js:570:20)
    at tryOnImmediate (timers.js:550:5)
    at processImmediate [as _immediateCallback] (timers.js:529:5)

Unhandled rejection Error: Failed to load url
    at checkStatus (/Users/andrea/src/userscraper/node_modules/node-horseman/lib/index.js:292:16)
    at tryCatcher (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/util.js:16:23)
    at Function.Promise.attempt.Promise.try (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/method.js:39:29)
    at Object.loadFinishedSetup [as onLoadFinished] (/Users/andrea/src/userscraper/node_modules/node-horseman/lib/index.js:290:43)
    at /Users/andrea/src/userscraper/node_modules/node-phantom-simple/node-phantom-simple.js:636:30
    at Array.forEach (native)
    at IncomingMessage.<anonymous> (/Users/andrea/src/userscraper/node_modules/node-phantom-simple/node-phantom-simple.js:617:17)
    at emitNone (events.js:91:20)
    at IncomingMessage.emit (events.js:185:7)
    at endReadableNT (_stream_readable.js:975:12)
    at _combinedTickCallback (internal/process/next_tick.js:74:11)
    at process._tickCallback (internal/process/next_tick.js:98:9)
Zander Aze
  • 11
  • 3

0 Answers0