1

I have to scrape HTML documents from a given URL. On my localhost the PhantomJS script returns the page content fine, but on the live server I get a 403 Forbidden status.

scraper.js

// scraper.js - PhantomJS script that opens the URL passed as the first
// command-line argument and prints the resulting page HTML to stdout.
//
// Usage:       phantomjs scraper.js <url>
// Exit status: 0 when the page was opened, 1 when the URL argument is
//              missing or page.open reports a status other than "success".
var system = require('system');
var page = require('webpage').create();

// Declared with `var` so the script no longer assigns to an undeclared name.
var url = system.args[1];

if (!url) {
    system.stderr.writeLine('Usage: phantomjs scraper.js <url>');
    phantom.exit(1);
} else {
    page.open(url, function (status) {
        if (status !== 'success') {
            // Report the failure instead of exiting silently with no output.
            system.stderr.writeLine('Failed to load ' + url + ' (status: ' + status + ')');
            phantom.exit(1);
            return;
        }

        console.log(page.content);
        phantom.exit(0);
    });
}

PhantomJS command:

phantomjs scraper.js http://www.submarino.com.br/produto/126862765/

The scraper works fine on other pages, but the domains www.submarino.com.br and www.americanas.com.br don't work. I know it has something to do with Akamai. The response with error output is:

Response (#1, stage "start"): {"body":"","bodySize":300,"contentType":"text/html","headers":[{"name":"Server","value":"AkamaiGHost"},{"name":"Mime-Version","value":"1.0"},{"name":"Content-Type","value":"text/html"},{"name":"Content-Length","value":"300"},{"name":"Expires","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Connection","value":"close"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"start","status":403,"statusText":"Forbidden","time":"2016-08-10T00:38:13.540Z","url":"http://www.submarino.com.br/produto/126862765/"}
Response (#1, stage "end"): {"contentType":"text/html","headers":[{"name":"Server","value":"AkamaiGHost"},{"name":"Mime-Version","value":"1.0"},{"name":"Content-Type","value":"text/html"},{"name":"Content-Length","value":"300"},{"name":"Expires","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Connection","value":"close"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"end","status":403,"statusText":"Forbidden","time":"2016-08-10T00:38:13.541Z","url":"http://www.submarino.com.br/produto/126862765/"}

When it works fine it returns:

Response (#1, stage "start"): {"body":"","bodySize":30076,"contentType":"text/html;charset=UTF-8","headers":[{"name":"Content-Encoding","value":"gzip"},{"name":"Content-Type","value":"text/html;charset=UTF-8"},{"name":"Server","value":"Apache-Coyote/1.1"},{"name":"X-Powered-By","value":"JSF/1.2"},{"name":"x-tid","value":"CATALOGO-0d4d336f-c0f1-4b71-9663-28fa89b5c123"},{"name":"Cache-Control","value":"max-age=1800"},{"name":"Expires","value":"Wed, 10 Aug 2016 01:10:18 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:40:18 GMT"},{"name":"Connection","value":"keep-alive"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"start","status":200,"statusText":"OK","time":"2016-08-10T00:40:18.388Z","url":"http://www.submarino.com.br/produto/126862765/"}
Response (#1, stage "end"): {"contentType":"text/html;charset=UTF-8","headers":[{"name":"Content-Encoding","value":"gzip"},{"name":"Content-Type","value":"text/html;charset=UTF-8"},{"name":"Server","value":"Apache-Coyote/1.1"},{"name":"X-Powered-By","value":"JSF/1.2"},{"name":"x-tid","value":"CATALOGO-0d4d336f-c0f1-4b71-9663-28fa89b5c123"},{"name":"Cache-Control","value":"max-age=1800"},{"name":"Expires","value":"Wed, 10 Aug 2016 01:10:18 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:40:18 GMT"},{"name":"Connection","value":"keep-alive"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"end","status":200,"statusText":"OK","time":"2016-08-10T00:40:18.390Z","url":"http://www.submarino.com.br/produto/126862765/"}

I attempted cURLing this site from hurl.it and other cURL services and they can access the url. Is there something I can do? This is driving me crazy!

Vaviloff
  • 16,282
  • 6
  • 48
  • 56
pcezar91
  • 111
  • 11

1 Answer

2

Most likely it's a geographical or suspicious-IP-range limitation. I tried to open the URL just now and was also denied the page, then accessed it via an American proxy and was able to open it. Just use an American or Brazilian proxy.

Also, when scraping it's important to mimic real browser behaviour as closely as possible, so I'd suggest you add user agent and viewport emulation to your script:

// Emulate a regular desktop browser: set a 1280x800 viewport on the page.
page.viewportSize = { width: 1280, height: 800 };
// Send a Chrome-on-Windows user agent string with the page's requests.
page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";

Also be sure to subscribe to error and console messages to be aware of any errors and messages from the target page.

// Forward every console message emitted by the target page to this
// script's own console, tagged with a "CONSOLE: " prefix.
page.onConsoleMessage = function relayConsoleMessage(text) {
  var line = 'CONSOLE: ' + text;
  console.log(line);
};

// Log JavaScript errors raised inside the target page: the error message
// first, then one line per stack frame showing its file and line number.
//
// msg   - the error message reported by the page.
// trace - array of stack frames; each frame's `file` and `line` are printed.
page.onError = function (msg, trace) {
    console.log(msg);

    // Only walk the trace when frames were actually supplied, so a missing
    // or empty trace cannot make the error handler itself throw.
    if (trace && trace.length) {
        trace.forEach(function (item) {
            console.log(' ', item.file, ':', item.line);
        });
    }
};
Vaviloff
  • 16,282
  • 6
  • 48
  • 56
  • 1
    Which proxy service did you use? I attempted to use my pc as a proxy but it didn't work... – pcezar91 Aug 10 '16 at 15:22
  • Your PC still has the same IP address that Akamai is not happy with, no matter what program you use. Try to get a Digital Ocean droplet in NY, for example, and set up an SSH tunnel as a proxy. – Vaviloff Aug 10 '16 at 18:28
  • 1
    Actually, I couldn't even set up the proxy with my localhost. cURL and PhantomJS both work great on my localhost; I was trying to test it without having to pay for a proxy first. I stumbled upon this link https://www.digitalocean.com/community/questions/http-1-1-403-forbidden-server-akamaighost when I was trying to figure out what to do about this. Maybe the Digital Ocean IP range is also blocked? – pcezar91 Aug 10 '16 at 18:51
  • If you're having trouble working with proxies you can open another question. As for the question if DO or any other provider is also blocked there is no other way than to try and check. Here's an [invite](https://m.do.co/c/2ff6cab4a36b) that will give you $10 coupon to try D.O. out. You can destroy and recreate droplets time and again to get new IP addresses. – Vaviloff Aug 11 '16 at 04:47