I want to do web scraping of this site. I have seen that the APIs are available but, as suggested by duraid in my previous question, it is not advisable to use them.
So I tried to use Node.js and Phantom.js with Phantom.
But the user Vaviloff pointed out to me that a headless browser is not necessary because it is sufficient to use the URL of search request.
So I changed my strategy and tried to use a normal request instead of Phantom:
var cheerio = require('cheerio');
var request = require('request-promise');
// Options for a single request-promise call against the data.un.org query handler.
// The individual filters inside the `Applied` parameter are joined with ';'.
// A '&' between them would end the `Applied` parameter after the first filter
// and turn the remaining filters into an unrelated query parameter.
var options = {
    uri: 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8;crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
    // Hand the response body to cheerio so the request resolves with a query function.
    transform: function(body) {
        return cheerio.load(body);
    }
};
/**
 * Requests the page described by `options` and logs the cheerio handle
 * produced by its `transform` callback.
 *
 * The request is awaited so the returned promise settles only after the
 * request has finished. A failed request is logged and not rethrown.
 *
 * @param {*} req - not used by this function
 * @param {*} res - not used by this function
 * @returns {Promise<void>}
 */
methods.download = async function(req, res) {
    try {
        var $ = await request(options);
        console.log('\n\nTHEN: ', $);
    } catch (err) {
        // `stack` is a string property, not a method; calling it would throw here.
        console.log('Error', err.stack);
    }
}
If I run this code I get:
THEN: function (selector, context, r, opts) {
if (!(this instanceof initialize)) {
return new initialize(selector, context, r, opts);
}
opts = _.defaults(opts || {}, options);
return Cheerio.call(this, selector, context, r || root, opts);
}
In this case I have other problems.
- I don't know how to build the URL.
In the example above I chose Albania (crID%3a8) and Austria (crID%3a40), and 2015 as the year (timeID%3a79). Yet if I go to the link just built, I get as a result the data on Albania from 2100 to 2095.
- I don't know how to select the years, how to select variants, or how to change pages.
I'm interested in information about:
// Maps each country name to its `crID` filter token (the numeric id prefixed
// with the URL-encoded form 'crID%3a').
var countries = [
    ['Albania', 8],
    ['Austria', 40],
    ['Belgium', 56],
    ['Bulgaria', 100],
    ['Croatia', 191],
    ['Cyprus', 196],
    ['Denmark', 208],
    ['Estonia', 233],
    ['Finland', 246],
    ['France', 250],
    ['Germany', 276],
    ['Greece', 300],
    ['Iceland', 352],
    ['Ireland', 372],
    ['Italy', 380],
    ['Latvia', 428],
    ['Netherlands', 528],
    ['Norway', 578],
    ['Poland', 616],
    ['Portugal', 620],
    ['Romania', 642],
    ['Slovakia', 703],
    ['Slovenia', 705],
    ['Spain', 724],
    ['Sweden', 752],
    ['Switzerland', 756],
    ['United Kingdom', 826]
].reduce(function(tokens, entry) {
    tokens[entry[0]] = 'crID%3a' + entry[1];
    return tokens;
}, {});
// `timeID` filter tokens for the years 2018 down to 1980, newest first.
// The numeric ids are not contiguous, so they are listed explicitly.
var years = [
    83, 82, 81, 79, 78, 77, 76, 75, 73, 72,
    71, 70, 69, 67, 66, 65, 64, 63, 61, 60,
    59, 58, 57, 55, 54, 53, 52, 51, 49, 48,
    47, 46, 45, 43, 42, 41, 40, 39, 37
].map(function(id) {
    return 'timeID%3a' + id;
});
// Filter token for the "medium" variant.
var variants = 'varID%3a' + 2;
Only for completeness: once the data is selected, I need to create an object like this:
var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom ', population: 56265.475}]
So I created a function like that:
/**
 * Bundles one population data point into a plain record.
 *
 * @param {*} year - stored under `year`
 * @param {*} country - stored under `country`
 * @param {*} population - stored under `population`
 * @returns {{year: *, country: *, population: *}} a new object holding the three values unchanged
 */
methods.createJsonObjectPop = function(year, country, population) {
    var record = { year, country, population };
    return record;
}
Any advice would be of great help to me!
EDIT 1
The content is divided into pages. How can we get all the data? By opening all the pages and merging the data, maybe? This was obvious. If X is the number of pages, I suppose I have to make X different requests.
And how does the site know which page is requested?
I think thanks to the URL, but I'm not sure (like http://...Page=3...).
I imagine this pseudocode:
// Query URL without the page parameter. The `Applied` value is the
// ';'-joined list of country filters, year filters and the variant filter.
var basicUrl = (function() {
    var countryIds = [8, 40, 56, 100, 191, 196, 208, 233, 246, 250, 276, 300, 352, 372, 380, 428, 528, 578, 616, 620, 642, 703, 705, 724, 752, 756, 826];
    var timeIds = [83, 82, 81, 79, 78, 77, 76, 75, 73, 72, 71, 70, 69, 67, 66, 65, 64, 63, 61, 60, 59, 58, 57, 55, 54, 53, 52, 51, 49, 48, 47, 46, 45, 43, 42, 41, 40, 39, 37];
    var filters = countryIds
        .map(function(id) { return 'crID%3a' + id; })
        .concat(timeIds.map(function(id) { return 'timeID%3a' + id; }))
        .concat('varID%3a2');
    return 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12' +
        '&Applied=' + filters.join(';') +
        '&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7' +
        '&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc' +
        '&RequestId=531';
})();
let promises = [];
let allData = [];
// Options shared by every page request. The per-page `uri` is added inside
// the loop, because the page URL does not exist yet when this object is built.
var options = {
    transform: function(body) {
        return cheerio.load(body);
    }
};
/**
 * Requests every result page in turn and collects the parsed data.
 *
 * `X` is the pseudocode placeholder for the number of pages and has to be
 * defined before this runs.
 *
 * @param {*} req - not used by this function
 * @param {*} res - not used by this function
 * @returns {Promise<Array>} the data of all pages, in page order
 */
methods.download = async function(req, res) {
    // Start from an empty list so a second call does not append to old results.
    allData = [];
    for (var i = 0; i < X; i++) {
        var url = basicUrl + '&Page=' + i;
        // request-promise takes a single options object; the URL goes in `uri`.
        var $ = await request(Object.assign({}, options, { uri: url }));
        // concat appends a single value as-is and flattens an array of records by one level.
        allData = allData.concat(elaborateData($));
    }
    return allData;
}
/**
 * Turns one downloaded page into population records.
 *
 * Assumes the result table has four cells per data row, in the order
 * country, year, variant, value (TODO confirm against the live markup).
 * Rows with fewer cells, or whose year or value is not numeric (such as a
 * header row), are skipped.
 *
 * @param {Function} res - the cheerio handle produced by the request's `transform`
 * @returns {Array} one record per data row, built with methods.createJsonObjectPop
 */
function elaborateData(res) {
    var records = [];
    res('tr').each(function(index, row) {
        var cells = res(row).find('td');
        if (cells.length < 4) {
            return;
        }
        var country = cells.eq(0).text().trim();
        var year = parseInt(cells.eq(1).text(), 10);
        var population = parseFloat(cells.eq(3).text());
        if (isNaN(year) || isNaN(population)) {
            return;
        }
        records.push(methods.createJsonObjectPop(year, country, population));
    });
    return records;
}
EDIT 2
// Query URL without the page parameter; methods.download appends '&Page=<n>' to it.
var basicUrl = 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8;crID%3a40;crID%3a56;crID%3a100;crID%3a191;crID%3a196;crID%3a208;crID%3a233;crID%3a246;crID%3a250;crID%3a276;crID%3a300;crID%3a352;crID%3a372;crID%3a380;crID%3a428;crID%3a528;crID%3a578;crID%3a616;crID%3a620;crID%3a642;crID%3a703;crID%3a705;crID%3a724;crID%3a752;crID%3a756;crID%3a826;timeID%3a83;timeID%3a82;timeID%3a81;timeID%3a79;timeID%3a78;timeID%3a77;timeID%3a76;timeID%3a75;timeID%3a73;timeID%3a72;timeID%3a71;timeID%3a70;timeID%3a69;timeID%3a67;timeID%3a66;timeID%3a65;timeID%3a64;timeID%3a63;timeID%3a61;timeID%3a60;timeID%3a59;timeID%3a58;timeID%3a57;timeID%3a55;timeID%3a54;timeID%3a53;timeID%3a52;timeID%3a51;timeID%3a49;timeID%3a48;timeID%3a47;timeID%3a46;timeID%3a45;timeID%3a43;timeID%3a42;timeID%3a41;timeID%3a40;timeID%3a39;timeID%3a37;varID%3a2&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=531';
// List of pending promises; nothing in this snippet adds to it.
let promises = [];
// Results collected across pages by methods.download.
let allData = [];
var pages = 22; // the data are split across 22 pages
/**
 * Requests each of the `pages` result pages in turn, passes the resulting
 * cheerio handle to elaborateData and collects what it returns.
 *
 * A failed request is logged and rethrown, so elaborateData is never called
 * with a missing handle.
 *
 * NOTE(review): the question reports that only the first page's data is ever
 * returned — confirm which paging parameter the site actually honours.
 *
 * @param {*} req - not used by this function
 * @param {*} res - not used by this function
 * @returns {Promise<Array>} the data of all pages, in page order
 */
methods.download = async function(req, res) {
    // Start from an empty list so a second call does not append to old results.
    allData = [];
    for (var i = 0; i < pages; i++) {
        var url = basicUrl + '&Page=' + i;
        var options = {
            uri: url,
            transform: function(html) {
                return cheerio.load(html);
            }
        };
        // Named `$` so it does not shadow the `res` parameter.
        var $;
        try {
            $ = await request(options);
        } catch (err) {
            // `stack` is a string property, not a method; calling it would throw here.
            console.log('Error', err.stack);
            throw err;
        }
        console.log('\n\nRES:', $);
        // concat appends a single value as-is and flattens an array of records by one level.
        allData = allData.concat(elaborateData($));
    }
    return allData;
}
/**
 * Logs the text of every table cell on the page.
 *
 * @param {Function} $ - cheerio handle for the downloaded page
 * @returns {undefined}
 */
function elaborateData($) {
    // 'td' matches <td> elements; '.td' would match elements with class="td".
    $('td').each(function() {
        console.log($(this).text());
    });
    // use javascript or jQuery to get data like:
    // var year = getElementByTag(...);
    // var country = getElementByTag(...);
    // var population = getElementByTag(...);
    //return createJsonObjectPop(year, country, population);
}
If I run this code, I get:
RES: function (selector, context, r, opts) {
if (!(this instanceof initialize)) {
return new initialize(selector, context, r, opts);
}
opts = _.defaults(opts || {}, options);
return Cheerio.call(this, selector, context, r || root, opts);
}
EDIT 3
var cheerioTableparser = require('cheerio-tableparser');
/**
 * Requests 22 result pages in turn, passes each resulting cheerio handle to
 * elaborateData and collects what it returns.
 *
 * A failed request is logged and rethrown, so elaborateData is never called
 * with a missing handle.
 *
 * @param {*} req - not used by this function
 * @param {*} res - not used by this function
 * @returns {Promise<Array>} the data of all pages, in page order
 */
methods.download = async function(req, res) {
    // Start from an empty list so a second call does not append to old results.
    allData = [];
    for (var i = 0; i < 22; i++) {
        // NOTE(review): reported as not working — every request returns the first
        // page. Confirm which paging parameter the site actually honours.
        var url = basicUrl + '&Page=' + i;
        var options = {
            uri: url,
            transform: function(html) {
                return cheerio.load(html);
            }
        };
        // Named `$` so it does not shadow the `res` parameter.
        var $;
        try {
            $ = await request(options);
        } catch (err) {
            // `stack` is a string property, not a method; calling it would throw here.
            console.log('Error', err.stack);
            throw err;
        }
        // concat appends a single value as-is and flattens an array of records by one level.
        allData = allData.concat(elaborateData($));
    }
    return allData;
}
/**
 * Parses the page's table into columns with cheerio-tableparser, logs the
 * four columns and returns one record per data row.
 *
 * Column order, as shown by the logged output: country, year, variant, value.
 * The first entry of each column is its header and is skipped.
 *
 * @param {Function} $ - cheerio handle for the downloaded page
 * @returns {Array<{year: number, country: string, population: number}>}
 *          the page's records; empty when no table columns were parsed
 */
function elaborateData($) {
    cheerioTableparser($);
    var data = $('table').parsetable(true, true, true);
    // Fall back to empty columns so a page without a table yields no records.
    var countries = data[0] || [];
    var years = data[1] || [];
    var variants = data[2] || [];
    var values = data[3] || [];
    console.log('\ncountries:', countries);
    console.log('\nyears:', years);
    console.log('\nvariants:', variants);
    console.log('\nvalues:', values);
    var records = [];
    // Index 0 holds the column headers.
    for (var row = 1; row < countries.length; row++) {
        records.push({
            year: parseInt(years[row], 10),
            country: countries[row],
            population: parseFloat(values[row])
        });
    }
    return records;
}
I get:
countries: [ 'Country or Area',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Albania',
'Austria',
'Austria',
'Austria',
'Austria',
'Austria',
'Austria',
'Austria',
'Austria',
'Austria',
'Austria',
'Austria' ]
years: [ 'Year(s)',
'2018',
'2017',
'2016',
'2015',
'2014',
'2013',
'2012',
'2011',
'2010',
'2009',
'2008',
'2007',
'2006',
'2005',
'2004',
'2003',
'2002',
'2001',
'2000',
'1999',
'1998',
'1997',
'1996',
'1995',
'1994',
'1993',
'1992',
'1991',
'1990',
'1989',
'1988',
'1987',
'1986',
'1985',
'1984',
'1983',
'1982',
'1981',
'1980',
'2018',
'2017',
'2016',
'2015',
'2014',
'2013',
'2012',
'2011',
'2010',
'2009',
'2008' ]
variants: [ 'Variant',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium',
'Medium' ]
values: [ 'Value',
'2934.363',
'2930.187',
'2926.348',
'2923.352',
'2920.775',
'2918.978',
'2920.039',
'2926.659',
'2940.525',
'2962.635',
'2991.651',
'3023.907',
'3054.331',
'3079.179',
'3097.747',
'3111.005',
'3119.029',
'3122.408',
'3121.970',
'3115.576',
'3103.759',
'3093.041',
'3092.228',
'3106.736',
'3140.595',
'3189.583',
'3240.587',
'3275.431',
'3281.454',
'3253.656',
'3197.067',
'3121.336',
'3041.007',
'2966.798',
'2901.592',
'2842.624',
'2788.314',
'2735.329',
'2681.239',
'8751.820',
'8735.453',
'8712.137',
'8678.657',
'8633.220',
'8577.782',
'8517.548',
'8459.864',
'8409.949',
'8370.038',
'8338.453' ]
It works, but I can only get the data from the first page.