1

I want to do web scraping of this site. I have seen that the APIs are available but, as suggested by duraid in my previous question, it is not advisable to use them.

So I tried to use Node.js and Phantom.js with Phantom.

This is my code:

var phantom = require('phantom');

// object of methods
var methods = {};
var loadInProgress = false;
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';

// Opens `url` in a PhantomJS page, logs resource/console/load events, tries to
// apply three filters and submit them, then logs the page's HTML.
// `req` and `res` are accepted but not used anywhere in this function.
methods.download = async function(req, res) {
    const instance = await phantom.create();
    const page = await instance.createPage();

    // Log the URL of every resource the page requests.
    await page.on('onResourceRequested', function(requestData) {
        console.info('Requesting', requestData.url);
    });
    // Forward the page's own console output to the Node console.
    await page.on('onConsoleMessage', function(msg) {
        console.info(msg);
    });
    // `loadInProgress` is written by the two handlers below but never read in this function.
    await page.on('onLoadStarted', function() {
        loadInProgress = true;
        console.log('Load started...');
    });
    await page.on('onLoadFinished', function() {
        loadInProgress = false;
        console.log('Load end');
    });

    const status = await page.open(url);
    console.log('STATUS:', status);

    // HTML of the page as loaded, before any filter is touched.
    const content = await page.property('content');
    console.log('CONTENT:', content);

    // submit
    await page.evaluate(function() {
        // Each assignment writes the checkbox's `value`; per the markup quoted later in
        // this post that is the value the element already has, and nothing here changes
        // its checked state.
        document.getElementById('crID%3a250').value = 'crID%3a250'; // France
        document.getElementById('timeID%3a79').value = 'timeID%3a79'; // 2015
        document.getElementById('varID%3a2').value = 'varID%3a2'; // Medium
        // NOTE(review): the markup quoted later in this post shows this id on an <a>
        // element with an onclick handler; anchors do not have a submit() method.
        document.getElementById('ctl00_main_filters_anchorApplyBottom').submit(); // submit button
    });

    // Read back the whole document; this runs right after the evaluate above,
    // with no wait in between.
    var result = await page.evaluate(function() {
        return document.querySelectorAll('html')[0].outerHTML; 
    });
    console.log('RESULT:', result);

    // Only reached when none of the steps above throws.
    await instance.exit();
};

module.exports = methods;

(How can they select more countries and more years?)

I tried to select France as the Country or Area, 2015 as the Year and Medium as the Variant.

So crID%3a250 is id of element:

<input type="checkbox" id="crID%3a250" value="crID%3a250" name="France" />
<label for="crID%3a250">France</label><br />

timeID%3a79 is id of element:

<input type="checkbox" id="timeID%3a79" value="timeID%3a79" name="2015" />
<label for="timeID%3a79">2015</label><br />

varID%3a2 is id of element:

<input type="checkbox" id="varID%3a2" value="varID%3a2" name="Medium" />
<label for="varID%3a2">Medium</label><br />

And then ctl00_main_filters_anchorApplyBottom is id of button element:

<div class="All">
    <img src="_Images/IconUpdateResults.png" alt="Update" width="11px" height="11px" title="Apply filters" />&nbsp;<a href="javascript:;" id="ctl00_main_filters_anchorApplyBottom" title="Apply filters" onclick="ApplyFilters(SendFilterRequest);">Apply Filters</a>
</div>

But what I got is the web page itself (in HTML), not the data that interest me. So it's as if I had not selected any parameters. Why?


EDIT 1

After the advice of @Vaviloff I tried to change the code but without success. My server-side language is Node.js.

Using Phantom I modified the code like this:

/**
 * Opens `url` in a PhantomJS page, ticks the country / year / variant
 * checkboxes, clicks "Apply filters", waits 1.5 seconds and passes the HTML of
 * the first results table to `elaborateResult`.
 *
 * Only the table currently shown in the page is read, so paged results beyond
 * the first page are not collected here.
 *
 * @param {object} req - Not used by this function.
 * @param {object} res - Not used by this function.
 * @returns {Promise<void>} Settles once the phantom instance has been asked to exit.
 */
methods.download = async function(req, res) {
    const instance = await phantom.create();

    // try/finally so the phantom instance is exited even when a step below throws.
    try {
        const page = await instance.createPage();

        await page.on('onResourceRequested', function(requestData) {
            console.log('Requesting', requestData.url);
        });
        await page.on('onConsoleMessage', function(msg) {
            console.log(msg);
        });

        const status = await page.open(url);
        console.log('\n\nSTATUS:', status);

        // submit (this callback is evaluated inside the page, where `document` exists)
        await page.evaluate(function() {
            var countries = {
                'Albania': 'crID%3a8',
                'Austria': 'crID%3a40',
                'Belgium': 'crID%3a56',
                'Bulgaria': 'crID%3a100',
                'Croatia': 'crID%3a191',
                'Cyprus': 'crID%3a196',
                'Denmark': 'crID%3a208',
                'Estonia': 'crID%3a233',
                'Finland': 'crID%3a246',
                'France': 'crID%3a250',
                'Germany': 'crID%3a276',
                'Greece': 'crID%3a300',
                'Iceland': 'crID%3a352',
                'Ireland': 'crID%3a372',
                'Italy': 'crID%3a380',
                'Latvia': 'crID%3a428',
                'Netherlands': 'crID%3a528',
                'Norway': 'crID%3a578',
                'Poland': 'crID%3a616',
                'Portugal': 'crID%3a620',
                'Romania': 'crID%3a642',
                'Slovakia': 'crID%3a703',
                'Slovenia': 'crID%3a705',
                'Spain': 'crID%3a724',
                'Sweden': 'crID%3a752',
                'Switzerland': 'crID%3a756',
                'United Kingdom': 'crID%3a826'
            };
            // 2018 - 1980
            var years = [
                'timeID%3a83', 'timeID%3a82', 'timeID%3a81', 'timeID%3a79', 'timeID%3a78',
                'timeID%3a77', 'timeID%3a76', 'timeID%3a75', 'timeID%3a73', 'timeID%3a72',
                'timeID%3a71', 'timeID%3a70', 'timeID%3a69', 'timeID%3a67', 'timeID%3a66',
                'timeID%3a65', 'timeID%3a64', 'timeID%3a63', 'timeID%3a61', 'timeID%3a60',
                'timeID%3a59', 'timeID%3a58', 'timeID%3a57', 'timeID%3a55', 'timeID%3a54',
                'timeID%3a53', 'timeID%3a52', 'timeID%3a51', 'timeID%3a49', 'timeID%3a48',
                'timeID%3a47', 'timeID%3a46', 'timeID%3a45', 'timeID%3a43', 'timeID%3a42',
                'timeID%3a41', 'timeID%3a40', 'timeID%3a39', 'timeID%3a37'
            ];

            // Marks the checkbox with the given id as checked.
            function check(id) {
                document.getElementById(id).setAttribute('checked', true);
            }

            // select countries
            Object.keys(countries).forEach(function(name) {
                check(countries[name]);
            });
            // select years: iterate the array's elements directly; for...in walks
            // enumerable property names, which is not meant for arrays
            years.forEach(function(id) {
                check(id);
            });
            // select variants
            check('varID%3a2'); // medium
            // click button
            document.getElementById('ctl00_main_filters_anchorApplyBottom').click();
        });

        console.log('\nWaiting 1.5 seconds...');
        await timeout(1500);

        // get only the table contents
        var result = await page.evaluate(function() {
            return document.querySelectorAll('.DataContainer table')[0].outerHTML;
        });
        console.log('\n\nRESULT:', result);

        elaborateResult(result);
    } finally {
        await instance.exit();
    }
};

/**
 * Loads the scraped table markup into a detached element and logs its <td> cells.
 *
 * Depends on a global `document`. When this runs in plain Node.js there is no
 * such global, so the first statement throws
 * "ReferenceError: document is not defined".
 *
 * @param {string} res - HTML markup of the results table.
 */
function elaborateResult(res) {
    var el = document.createElement('html'); // ** ERROR HERE **
    // Use the `res` parameter: no variable named `result` exists in this scope.
    el.innerHTML = res;
    console.log('\n\nTD ELEMENTS:', el.getElementsByTagName('td'));
    //var obj = utilFunc.createJsonObjectPop(year, country, population);
    //console.log(obj);
}

There are two errors:

  1. result contains only the values that are on the first page of the results, but with the selections made you get 22 pages of results and I don't understand how I can get all the values that interest me and link them in the variable result.
  2. assuming to have solved the problem in point (1), now I should elaborate the results obtained and create an object like this:

var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom ', population: 56265.475}]

This is what the elaborateResult(res) function should do (of course, the function is not complete, I have to finish it but I get an error at the first line), but I get the error:

ReferenceError: document is not defined

So I changed my strategy and I tried not to use Phantom but a normal request:

// Options object passed to `request(options)` further down: the handler URL to fetch
// plus a `transform` function.
// NOTE(review): inside the query string, `Applied=crID%3a8&crID%3a40;timeID%3a79` contains
// an `&`, which starts a new query parameter, so only `crID%3a8` is part of `Applied`.
// Confirm the separator the handler expects between applied filters.
var options = {
    uri: 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8&crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
    // Returns the result of cheerio.load(body) instead of the raw `body`
    // (presumably `body` is the response text; confirm against the request library in use).
    transform: function(body) {
        return cheerio.load(body);
    }
};

/**
 * Fetches the URL described by `options` and logs what the request resolves to.
 *
 * The request is awaited, so the returned promise settles only after the
 * response (or the failure) has been logged. Failures are logged, not rethrown.
 *
 * @param {object} req - Not used by this function.
 * @param {object} res - Not used by this function.
 * @returns {Promise<void>}
 */
methods.download = async function(req, res) {
    try {
        const $ = await request(options);
        console.log('\n\nTHEN: ', $);
    } catch (err) {
        // `stack` is a string property of Error objects, not a method.
        console.log('Error', err.stack);
    }
};

If I run this code I get:

THEN:  function (selector, context, r, opts) {
    if (!(this instanceof initialize)) {
      return new initialize(selector, context, r, opts);
    }
    opts = _.defaults(opts || {}, options);
    return Cheerio.call(this, selector, context, r || root, opts);
  }

In this case I have other problems.

  1. I don't know how to build the URL. In the example above I chose Albania (crID%3a8) and Austria (crID%3a40), with 2015 as the year (timeID%3a79). Yet if I open the link I just built, I get the data for Albania from 2100 to 2095 instead.
  2. I don't know how to select the years or how to select variants or how to change pages.

I feel a bit stupid but I can't get what I want... I'm stuck. Help would be very welcome!

  • You should create a new question for this **Edit 1**, because now you want to parse the HTML table and it is a different issue. But the initial problem — getting those data — was solved with my answer and I would appreciate you honoring my time and accepting the answer. – Vaviloff Apr 09 '18 at 05:05
  • @Vaviloff You are right. Thank you very much :-) –  Apr 09 '18 at 07:12

1 Answer

1

There are several issues with your script that prevent successful scrape.

To check a checkbox, you don't set its value again (it's already set in HTML!), you set its checked attribute to true:

document.getElementById('crID%3a250').setAttribute("checked", true); // France

The button that submits the form is a hyperlink <a> which doesn't have a submit method, it should be clicked (it even has onClick function in the code)

 document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // submit the form

**The search request** is sent through ajax and takes time to complete, so your script should wait for at least a second before trying to fetch the data. I'll show how to wait in the full working code below.

Next, you can fetch only the table data; there is no need to sift through all the HTML:

// Runs inside the page and returns the markup of the first <table> found under
// `.DataContainer`. If nothing matches, `[0]` is undefined and reading `.outerHTML` throws.
var result = await page.evaluate(function() {
    return document.querySelectorAll('.DataContainer table')[0].outerHTML; 
});

Here's a slightly trimmed-down version of your script with the issues corrected:

var phantom = require('phantom');

var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';

// Returns a promise that resolves (with no value) once `ms` milliseconds have
// elapsed, so async code can pause with `await timeout(ms)`.
const timeout = (ms) => {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
};

/**
 * Opens `url` in a PhantomJS page, ticks the France / 2015 / Medium filters,
 * clicks "Apply Filters", waits 1.5 seconds for the ajax search request and
 * logs the HTML of the results table.
 *
 * The phantom instance is exited in a `finally` block so it is not left
 * running when a step fails; any failure is logged and reflected in the
 * process exit code.
 */
(async function() {
    const instance = await phantom.create();

    try {
        const page = await instance.createPage();

        await page.on('onResourceRequested', function(requestData) {
            console.info('Requesting', requestData.url);
        });
        await page.on('onConsoleMessage', function(msg) {
            console.info(msg);
        });

        const status = await page.open(url);
        console.log('STATUS:', status);

        // submit (this callback is evaluated inside the page)
        await page.evaluate(function() {
            document.getElementById('crID%3a250').setAttribute("checked", true); // France
            document.getElementById('timeID%3a79').setAttribute("checked", true); // 2015
            document.getElementById('varID%3a2').setAttribute("checked", true); // Medium
            document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // click submit button
        });

        console.log('Waiting 1.5 seconds..');
        await timeout(1500);

        // Get only the table contents; null when the table is not in the page
        // (for example because the results have not arrived yet).
        var result = await page.evaluate(function() {
            var table = document.querySelector('.DataContainer table');
            return table ? table.outerHTML : null;
        });
        console.log('RESULT:', result);
    } finally {
        await instance.exit();
    }
})().catch(function(err) {
    // Report failures instead of leaving the promise unhandled.
    console.error(err);
    process.exitCode = 1;
});

Last but not least, you could simply try to replay the ajax request made by the form. You will find that the URL of the search request works quite well on its own when opened in another tab:

search result is HTML

You don't even need a headless browser to get it; just fetch it with cURL/requests and process the response. This happens with a lot of sites, so it's useful to check the network tab in your browser devtools before scraping.

Update

And if there are so many results that they are scattered over several pages, there is one more parameter to be used in request: Page:

data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=variableID:12&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=461

Vaviloff
  • 16,282
  • 6
  • 48
  • 56
  • Thank you! Could you be more specific about how to use the second proposed solution? That is the one without using Phantom. Sorry but it's the first time I do web scraping so I have a lot of difficulties. I also saw that you used `(async function(req, res) {` instead of `methods.download = async function(req, res) {`. I used `methods.download = async function(req, res) {` because I need to do this process when I call a certain method (the `download()` function). But I do not think there are problems if I do `methods.download = async function(req, res) {`, no? –  Apr 08 '18 at 08:12
  • Furthermore there is also another problem. The table is split across multiple pages, so `result` contains only the first page of data, not the complete data.. –  Apr 08 '18 at 08:59
  • Updated the answer to address your latest comment, but I gotta say you could have figured this one out yourself :) the initial phantom/node script was quite good! – Vaviloff Apr 08 '18 at 10:18
  • `How to use second proposed solution` Just get the table directly from [URL](http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a903;timeID%3a180&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302) and scrape your data from there. What's you server-side language of choice? – Vaviloff Apr 08 '18 at 10:20
  • `instead of methods.download` Just simplified a code sample a little. Of course you do the same inside of a module, the initial way your script was written. – Vaviloff Apr 08 '18 at 10:21
  • Thanks for your help. I modified the main message, I hope you can help me. –  Apr 08 '18 at 11:59
  • You shouldn't change questions. Your initial issue is solved; if you've got another problem, feel free to create a new question (you're allowed to create as many of them as you want). – Vaviloff Apr 09 '18 at 05:07