3

I'm using nightmare.js to scrape webpage content.

After authenticating the nightmare instance, I loop through multiple pages (that require a login) and then call the run method that executes all the page loads.

From each page I want to grab specific content, so I call the evaluate function, which ensures the code executes inside the browser scope. Whatever the evaluate function returns becomes the argument passed to the run method's callback.

However, I'm trying to run the evaluate function multiple times (once for each page), and the evaluate function can only return output to the run method once. I tried pushing content to a global variable (in the main scope), but I can't access it from inside the browser scope.

Can anyone advise on how to have multiple evaluate methods run on one nightmare instance and extract information from each?

var Nightmare = require("nightmare");

//Creates the authenticated nightmare instance

// Queue the sign-in steps and execute them right away with the first run().
var scraper = new Nightmare()
  .goto('https://www.example.com/signin')
  .type('#login', 'username')
  .type('#password', 'password')
  .click('#btn')
  .run(function(err, nightmare) {
    if (err) {
      console.log(err);
    }
    console.log('Done.');
  });

// Queue one visit per protected page (page0..page3) on the same instance.
// The function passed to evaluate() executes inside the browser scope, so it
// cannot see variables from this (main) scope.
for (var i = 0; i < 4; i++) {
  scraper
    .goto('https://www.example.com/page'+i)
    .wait(1000)
    .evaluate(function(){
      return $('#result > h3').text()
    })
}

// Execute everything queued by the loop. The callback receives a single
// result, which is the limitation described in the question above.
scraper.run(function(err, result) {
  console.log(result)
  if (err) {
    console.log(err);
  }
});
  • How does this relate to electron? (it has the electron tag) – justin.m.chase Feb 03 '16 at 15:40
  • @justin.m.chase Nightmare.js switched from PhantomJS to Electron recently. OP wants to say that he uses a newer version. I would say that the version information must be available in the question body. – Artjom B. Feb 03 '16 at 18:10
  • @ArtjomB. I see, interesting. I was actually just considering making a electron based runner like this, I will probably use this instead now. – justin.m.chase Feb 03 '16 at 22:59

1 Answer

9

I don't really know much about nightmare specifically but it seems like you may have a problem simply with asynchrony, which is a hard problem in js in general.

The good news is that you can just restructure your code and rely on generators to make it work fairly simply.

The key thing to realize is that putting a * before a function name turns that function into a generator function, which allows you to use the yield keyword. Whenever you yield, that line of code waits for the returned promise to finish before moving on to the next line, and the yield expression evaluates to the value the promise resolved to. You can use the vo library to run the generator function and deliver its final result to a Node-style callback.

// Load Nightmare from the installed package (same module the question
// requires) rather than from a path relative to Nightmare's own repository.
var Nightmare = require('nightmare');
var vo = require('vo');

// Wrap the `run` generator with vo and start it. The Node-style callback is
// invoked once the generator has finished or failed.
vo(run)(function(err, result) {
  // Rethrow so a failed sign-in or page load is not silently ignored.
  if (err) throw err;
  console.log('#result > h3: ', result);
});

/**
 * Signs in once, then visits page0..page3 in order and collects the text of
 * each page's `#result > h3` element.
 *
 * Intended to be driven by a generator runner such as `vo`: every `yield`
 * hands a Nightmare chain to the runner, and the runner resumes the generator
 * with the value that chain produced.
 *
 * @returns {Array} The value produced by `.evaluate()` for each page, in
 *   visit order (this is the generator's return value).
 */
function *run() {
  var nightmare = Nightmare();
  var headings = [];

  // Sign in first; the pages visited below require a login.
  yield nightmare
    .goto('https://www.example.com/signin')
    .type('#login', 'username')
    .type('#password', 'password')
    .click('#btn');

  for (var i = 0; i < 4; i++) {
    // The yield expression evaluates to whatever the evaluate() callback
    // returned inside the page, so keep it instead of discarding it.
    var heading = yield nightmare
      .goto('https://www.example.com/page'+i)
      .wait(1000)
      .evaluate(function(){
        return $('#result > h3').text()
      });
    headings.push(heading);
  }

  // Shut the browser down before handing the collected results back.
  yield nightmare.end();
  return headings;
}
justin.m.chase
  • 13,061
  • 8
  • 52
  • 100
  • I just struggled with the same problem and this is the way to go. However, I see 2 problems. 1) maybe it's a typo but `results` is not defined. 2) On the other hand, `result` will only contain the result from the last `.evaluate()`, not a list of all results. vo chains the yielded Promises one after another (Pipeline), so only the result of the last fulfilled Promise is available there. (vo version 4.0.2, maybe semantics of vo changed since then) – Insa Jun 04 '17 at 20:34
  • Thanks @Insa. I updated the code to reflect your comments. – justin.m.chase Jul 25 '17 at 21:24