I'm writing a program that scrapes a site for links, then scrapes these links for information. In order to scrape the site, it is necessary to log in first. And so the order is: Log in -> Scrape the index for links -> Scrape the links for info
The callback to the login function prints an empty array { results: [], hasMore: true }
, so something is wrong with my code (the scraping part works):
var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
var cheerio = require('cheerio');
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";
var credentials = {
username: 'user1',
password: 'passpass'
};
login(function (result) {
console.log(result);
});
function login(callback) {
request.post({
uri: loginUrl,
headers: { 'content-type': 'application/x-www-form-urlencoded' },
body: require('querystring').stringify(credentials)
}, function(err, res, body){
if(err) {
console.log("Login error");
return;
}
scrapeTorrents(url1, function (result) {
callback(result);
});
});
}
function scrapeTorrents(url, callback) {
request(url, function(err, res, body) {
if(err) {
console.log("Main scrape error");
return;
}
var links = []
var $ = cheerio.load(body);
$('span.title').each(function(i, element){
var title = $(this);
var a = $(this).children().eq(0);
var detailsUrl = a.attr('href');
//console.log(detailsUrl);
links.push(detailsUrl);
});
scrapeTorrentDetails(links, function (result) {
callback(result);
});
});
}
function scrapeTorrentDetails(links, callback) {
var results = [];
function getDetails(url) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
});
}
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
callback( {
results: results,
hasMore: true
});
}
Maybe Q promises would be better. How would I implement that in the code above?
If you're wondering what the code is for, I'm planning to modify Popcorn-time to use another torrent-tracker (without an API).
Thanks