I am trying to scrappe coursera webpage using PhantomJS. However when I try to do that it is unable to display the actual content instead it only shows loading. When we try to access coursera you can see an intermediate display showing loading and this is being shown. However since phantomJS is a headless browser should'nt it be able to retrieve the source code exactly as what a browser would do? I tried setting the timeouts, useragents but to no avail. Any pointers?
EDIT: Please find the code snippet for simple scrapping:
var webPage = require('webpage');
var system = require('system');
var page = webPage.create();
page.settings.resourceTimeout = 5000; // 5 seconds
var url = system.args[1];
page.open(url, function (status) {
if(status === 'success') {
var content = page.content;
console.log(content);
phantom.exit();
}
else
{console.log("Error!")
phantom.exit()
}
});
EDIT:
Been trying this a bit more but still no luck. (Just wondering if OP tried this further with any luck)
var page = require('webpage').create();
page.settings.resourceTimeout = 10000; // 5 seconds
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36';
var system = require('system');
var fs = require('fs');
if(system.args.length !== 3) {
console.log('Usage: phantomjs text-scraper.js <url> <output file>');
phantom.exit();
}
var url = system.args[1];
var outfile = system.args[2];
page.open(url);
//page.open(url, function(status) {
//var output = url + '\n';
//console.log(output);
//if(status === 'success') {
page.onLoadFinished = function(msg) {
var text = page.evaluate(function () {
return document.title + '\n' + document.body.innerText;
});
console.log(text);
//output += text;
//fs.write(outfile, output);
//phantom.exit()
//} else {
// console.log("Error!")
phantom.exit();
//}
//}
};