
I've been working on a single-threaded web crawler in Python that groups the assets of each page and outputs a JSON array of the form:

[
  {
    "url": "http://url.com/",
    "assets": [
      "http://url.com/imgs/img1.jpg",
      "http://url.com/css/style.css"
    ]
  },
  {
    "url": "http://url.com/services",
    "assets": [
      "http://url.com/imgs/services.jpg",
      "http://url.com/css/style.css"
    ]
  },
  ...
]

To quickly summarise the functionality:

  • Using BeautifulSoup to parse the HTML and extract links
  • Using urlparse to (see the sketch after this list):

    • Build absolute URLs from relative URLs
    • Check whether a URL is local by comparing netlocs
    • Record visited URLs/assets in dictionaries, keyed by their paths
  • Using robotparser to check whether I'm allowed to crawl each page I find, by looking at the site's robots.txt file

    • In order to do this I pass the root of the website to the crawler, i.e. ./crawl.py http://sitename.com/ (including the final slash)
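To make those pieces concrete, here is a minimal standalone sketch of the urlparse and robotparser calls involved (sitename.com and the is_local helper are illustrative, not part of the script below):

import urlparse
import robotparser

# Resolve a relative URL against the page it appeared on
print(urlparse.urljoin('http://sitename.com/services', 'imgs/services.jpg'))
# -> http://sitename.com/imgs/services.jpg

def is_local(url, root_netloc):
    # A URL is local when its netloc matches the root's, ignoring any 'www.'
    netloc = urlparse.urlparse(url).netloc
    if netloc.startswith('www.'):
        netloc = netloc[4:]
    return netloc == root_netloc

# One robots.txt parser per site, queried once per page
parser = robotparser.RobotFileParser()
parser.set_url('http://sitename.com/robots.txt')
parser.read()
print(parser.can_fetch('*', 'http://sitename.com/services'))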

I've made the assumption that if the URL ends in .html, or the resource path doesn't contain a ., then I will be able to crawl it as an HTML page (a rough sketch of this heuristic follows).
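As a sketch, that assumption boils down to a check like this (looks_like_html is a hypothetical name; the script below does the same test inside checkifhtml):

import urlparse

def looks_like_html(url):
    # Treat as crawlable HTML if the URL ends in .html, or the path has no extension
    path = urlparse.urlparse(url).path
    return url.endswith('.html') or '.' not in path

Note that this misses pages ending in .htm or .php and treats extensionless downloads as pages, so it is only a heuristic.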

I've been having problems with a few things, including:

  • locales - Is there a smart way to detect and avoid crawling the same pages in different locales? (See the first sketch after this list.)

    • When trying to crawl particular sites I'll end up with a maximum recursion depth exceeded error from Python.
    • I tried to avoid this by checking whether a link's rel attribute contains alternate, but this doesn't seem to have much impact.
    • An example of this is crawling http://url.com/ but also having to crawl http://url.com/en-us, http://url.com/en-au, etc.
  • angular/react - Is it possible to crawl sites that are built with Angular/React/similar frameworks? (See the second sketch after this list.)

    • I've been searching for useful resources to help with this, but so far haven't found anything concrete.
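For the locale problem, one possible approach might be to key the visited dictionary on a locale-normalised path rather than the raw path. A minimal sketch, assuming locale prefixes look like xx or xx-yy (LOCALE_RE and normalised_path are hypothetical names):

import re
import urlparse

# Treat a leading /xx/ or /xx-yy/ segment as a locale prefix and strip it
LOCALE_RE = re.compile(r'^/[a-z]{2}(-[a-z]{2})?(?=/|$)', re.IGNORECASE)

def normalised_path(url):
    path = urlparse.urlparse(url).path
    return LOCALE_RE.sub('', path, count=1) or '/'

# normalised_path('http://url.com/en-au/services') -> '/services'
# normalised_path('http://url.com/services')       -> '/services'

Beware that this would also strip genuine two-letter first segments such as /js/, so building a whitelist of locales from <link rel="alternate" hreflang="..."> tags may be more reliable.

For the angular/react question: urllib2 only ever sees the initial HTML payload, and sites built with those frameworks typically construct the DOM client-side, so a plain fetch returns a near-empty shell. The usual workaround is to let a real browser engine render the page first, for example via Selenium (a sketch, assuming a browser driver is installed; this is not wired into the script below):

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Firefox()    # any installed browser driver works here
driver.get('http://url.com/')
rendered = driver.page_source   # the DOM after client-side JavaScript has run
driver.quit()

parsedhtml = BeautifulSoup(rendered, 'lxml')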

Any info/feedback is greatly appreciated.

Code below:

#!/usr/bin/python

import sys
import json
import urlparse
import robotparser
import urllib2
from collections import deque  # simple FIFO; the crawler is single-threaded
from bs4 import BeautifulSoup

class Crawler:

    def gethtml(self, url):
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            if e.code == 404:
                print('404 File Not Found: ' + url)
            else:
                print('Request failed with error code %s.' % e.code)
            return None

    def __init__(self):
        url = sys.argv[1]
        sys.setrecursionlimit(100000)  # workaround for the recursive checkqueue/crawlhtml design
        parsedurl = urlparse.urlparse(url)

        print('Crawling from URL: ' + url)

        self.parser = robotparser.RobotFileParser()
        self.parser.set_url(urlparse.urljoin(url, 'robots.txt'))
        self.parser.read()

        if parsedurl.netloc.startswith('www.'): # compare netlocs without www.
            self.netloc = parsedurl.netloc[4:]
        else:
            self.netloc = parsedurl.netloc

        html = self.gethtml(url)

        if html is not None:
            self.visited = {}
            self.current = {}
            self.currentassets = {}
            self.output = []
            self.queue = deque()
            if len(parsedurl.path) < 1:
                self.visited['/index.html'] = True
            self.crawlhtml(url, html)
        else:
            print("Sorry, couldn't find HTML at that URL!")

    def isabsolute(self, url):
        return bool(urlparse.urlparse(url).netloc)

    def checkifhtml(self, url):
        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path

        if url.endswith('.html') or '.' not in path:  # treat as an HTML page
            if path not in self.visited:
                self.queue.append(url)
            return True
        else:
            return False

    def getasseturl(self, current_url, url):
        if not self.isabsolute(url):  # make our relative url absolute
            url = urlparse.urljoin(current_url, url)

        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path
        netloc = parsedurl.netloc
        local = False

        if netloc.startswith('www.'):  # check if is local url
            netloc = netloc.replace('www.', '', 1)

        if netloc == self.netloc:
            local = True

        if path not in self.currentassets:
            self.currentassets[path] = True
            if local:
                if not self.checkifhtml(url):
                    self.current['assets'].append(url)

    def checkqueue(self):
        # NOTE: checkqueue and crawlhtml call each other recursively, once per
        # page, which is what eventually exhausts the stack on larger sites.
        print('Checking queue. Queue Size: ' + str(len(self.queue)))
        if len(self.queue) == 0:
            print('\n------------------------------------------------------\n')
            print(json.dumps(self.output, indent=4))
            print('\n------------------------------------------------------\n')
            print(self.visited)
        else:
            url = self.queue.popleft()
            parsedurl = urlparse.urlparse(url)
            path = parsedurl.path
            if path not in self.visited:
                self.visited[path] = True
                html = self.gethtml(url)
                if html is not None:
                    self.crawlhtml(url, html)
                else:
                    self.checkqueue()
            else:
                self.checkqueue()

    def crawlhtml(self, url, html):
        print('---------------------------------------\nLooking at url: ' + url)
        if self.parser.can_fetch('*', url):
            self.current['url'] = url
            self.current['assets'] = []

            parsedhtml = BeautifulSoup(html, 'lxml')  # use lxml for speed

            for link in parsedhtml.find_all(['a', 'link', 'area', 'base', 'image']):
                if link.get('href') is not None:
                    if link.get('rel') is None:
                        self.getasseturl(url, link.get('href'))
                    else:
                        if 'alternate' not in link.get('rel'):
                            self.getasseturl(url, link.get('href'))
            for link in parsedhtml.find_all(['script', 'img', 'frame', 'iframe', 'input', 'audio', 'embed', 'source', 'video']):
                if link.get('src') is not None:
                    self.getasseturl(url, link.get('src'))

            self.output.append(self.current)
            self.current = {}
            self.currentassets = {}

        self.checkqueue()

if __name__ == '__main__':
    Crawler()
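Incidentally, the maximum recursion depth error mentioned above comes from checkqueue and crawlhtml calling each other once per crawled page. Rewriting checkqueue as a loop would remove the need for sys.setrecursionlimit entirely; a sketch of the loop shape (crawlhtml would no longer call checkqueue, and __init__ would call checkqueue once after the first crawlhtml):

    def checkqueue(self):
        # Iterative form: one loop iteration per queued URL, no recursion
        while self.queue:
            url = self.queue.popleft()
            path = urlparse.urlparse(url).path
            if path in self.visited:
                continue
            self.visited[path] = True
            html = self.gethtml(url)
            if html is not None:
                self.crawlhtml(url, html)
        print(json.dumps(self.output, indent=4))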