I'm using nodejs with horseman to scrape a list of (known to be valid) urls, performing the same scraping operation on all of them in sequence. I try to catch the errors, but sometimes some exception still slips through and hangs the script. I can run the same urls 'batch' successfully once and get errors another time.
So far I've tried: - catching errors with catch; - using the promiseRetry module to retry the promise - using opening and closing a new horseman instance for every scrape operation
// users: [ { href: '?id=abc123' }, ... ]
// queuePromise: function which returns a chain of promises from an array of functions with callbacks
// promiseRetry: used to retry a promise :3, see promiseRetry documentation
fetch = function(users, options){
var baseUrl = 'https://www.somedomain.com/Users'
var user_queue = []
var items = []
var ci = 0
users.forEach(function (u){
u.url = baseUrl + u.href
user_queue.push(function (next){
promiseRetry(options, function (retry, number) {
var error = true
var horse = new Horseman()
horse
.open(u.url)
.catch(retry)
.evaluate(scrapeitems)
.then(function (_items){
error = false
console.log('Scraped user', ++ci, ':', u.url, ':\n\tfound ' + _items.length + ' items')
if (_items && _items.length) {
items = items.concat(_items)
}
})
.finally(function () {
horse.close()
if (error && number < options.retries) {
retry()
} else {
next()
}
})
})
})
})
return _t.queuePromise(user_queue).then(function (){ return items })
}
here is the uncaught error:
[...]
Scraped user 22: https://www.somedomain.com/Users?id=abc123 :
found 1 items
Unhandled rejection Error: Retrying
at createError (/Users/andrea/src/userscraper/node_modules/err-code/index.js:4:44)
at /Users/andrea/src/userscraper/node_modules/promise-retry/index.js:34:27
at Horseman.<anonymous> (/Users/andrea/src/userscraper/modules/fetcher.js:179:110)
at PassThroughHandlerContext.finallyHandler (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/finally.js:56:23)
at PassThroughHandlerContext.tryCatcher (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/util.js:16:23)
at Promise._settlePromiseFromHandler (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:512:31)
at Promise._settlePromise (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:569:18)
at Promise._settlePromise0 (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:614:10)
at Promise._settlePromises (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:689:18)
at Async._drainQueue (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:133:16)
at Async._drainQueues (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:143:10)
at Immediate.Async.drainQueues (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:17:14)
at runCallback (timers.js:570:20)
at tryOnImmediate (timers.js:550:5)
at processImmediate [as _immediateCallback] (timers.js:529:5)
Unhandled rejection Error: Failed to load url
at checkStatus (/Users/andrea/src/userscraper/node_modules/node-horseman/lib/index.js:292:16)
at tryCatcher (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/util.js:16:23)
at Function.Promise.attempt.Promise.try (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/method.js:39:29)
at Object.loadFinishedSetup [as onLoadFinished] (/Users/andrea/src/userscraper/node_modules/node-horseman/lib/index.js:290:43)
at /Users/andrea/src/userscraper/node_modules/node-phantom-simple/node-phantom-simple.js:636:30
at Array.forEach (native)
at IncomingMessage.<anonymous> (/Users/andrea/src/userscraper/node_modules/node-phantom-simple/node-phantom-simple.js:617:17)
at emitNone (events.js:91:20)
at IncomingMessage.emit (events.js:185:7)
at endReadableNT (_stream_readable.js:975:12)
at _combinedTickCallback (internal/process/next_tick.js:74:11)
at process._tickCallback (internal/process/next_tick.js:98:9)