I've been working on a single-threaded web crawler in Python that groups the assets of each page and outputs a JSON array of the form:
[
    {
        "url": "http://url.com/",
        "assets": [
            "http://url.com/imgs/img1.jpg",
            "http://url.com/css/style.css"
        ]
    },
    {
        "url": "http://url.com/services",
        "assets": [
            "http://url.com/imgs/services.jpg",
            "http://url.com/css/style.css"
        ]
    },
    ...
]
To quickly summarise the functionality:

- Using BeautifulSoup to parse HTML and extract links
- Using urlparse to (see the sketch just after this list):
    - build absolute urls from relative urls
    - check if a url is local using netloc
    - add visited urls/assets to dictionaries via their paths
- Using robotparser to check if I can crawl each page I find by looking at the robots.txt file
    - In order to do this I pass the root of the website to the crawler, i.e. ./crawl.py http://sitename.com/ (including the final slash)
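To make the urlparse part concrete, here is roughly what it boils down to (a standalone sketch with made-up example urls, not the exact code from the crawler below):

import urlparse

base = 'http://sitename.com/'                 # root passed on the command line
root_netloc = urlparse.urlparse(base).netloc
if root_netloc.startswith('www.'):            # compare netlocs without www.
    root_netloc = root_netloc[4:]

# build an absolute url from a relative one found on a page
href = 'imgs/img1.jpg'
absolute = urlparse.urljoin('http://sitename.com/services', href)
# -> 'http://sitename.com/imgs/img1.jpg'

# check whether a url is local by comparing netlocs (ignoring any leading www.)
netloc = urlparse.urlparse(absolute).netloc
if netloc.startswith('www.'):
    netloc = netloc[4:]
is_local = (netloc == root_netloc)

# visited pages/assets are keyed by their path, e.g. '/imgs/img1.jpg'
visited = {}
visited[urlparse.urlparse(absolute).path] = True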
I've made the assumption that if the url ends in .html, or the resource path doesn't contain a ., I will be able to crawl it as an HTML page.
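In code, that assumption boils down to something like this (the helper name is just for illustration; the real check lives in checkifhtml below):

import urlparse

def looks_like_html(url):
    # assumption: crawlable pages either end in .html or have an extensionless path
    path = urlparse.urlparse(url).path
    return url.endswith('.html') or '.' not in path

# looks_like_html('http://url.com/services')      -> True
# looks_like_html('http://url.com/about.html')    -> True
# looks_like_html('http://url.com/css/style.css') -> False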
I've been having some problems with a few things, including:

- locales - Is there a smart way to detect and avoid crawling the same pages in different locales? (a rough sketch of the kind of normalisation I mean is just after this list)
    - When trying to crawl particular sites I'll end up with a maximum recursion depth exceeded message from Python.
    - I tried to avoid this by checking if a link's rel attribute contained alternate, but this doesn't seem to have a big impact.
    - An example of this is crawling http://url.com/ but also having to crawl http://url.com/en-us, http://url.com/en-au, etc.
- angular/react - Is it possible to crawl sites that are using angular/react/similar frameworks?
    - I've been trying to search for useful resources to help me with this, but so far haven't found anything concrete.
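To make the locales question concrete, the kind of normalisation I have in mind (not something the code below does, and the locale list is only a guess) would be roughly:

# Hypothetical helper: treat a known locale segment at the start of the path
# as a prefix and strip it, so '/en-us/services' and '/services' map to the
# same visited key. The locale list here is a guess, not exhaustive.
LOCALES = {'en-us', 'en-au', 'en-gb', 'fr', 'de'}

def strip_locale(path):
    parts = path.split('/')
    if len(parts) > 1 and parts[1].lower() in LOCALES:
        return '/' + '/'.join(parts[2:])
    return path

# strip_locale('/en-us/services') -> '/services'
# strip_locale('/services')       -> '/services'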
Any info/feedback is greatly appreciated.
Code below:
#!/usr/bin/python
import sys
import json
import urlparse
import robotparser
import urllib2

from Queue import Queue  # Python 2 stdlib FIFO queue
from bs4 import BeautifulSoup


class Crawler:

    def gethtml(self, url):
        # fetch a url, returning the response (file-like) or None on HTTP errors
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print('We failed with error code - %s.' % e.code)
            if e.code == 404:
                print('404 File Not Found: ' + url)
            else:
                print('e code not 404')
            return None

    def __init__(self):
        url = sys.argv[1]
        sys.setrecursionlimit(100000)
        parsedurl = urlparse.urlparse(url)
        print('Crawling from URL: ' + url)

        self.parser = robotparser.RobotFileParser()
        self.parser.set_url(url + 'robots.txt')  # relies on the trailing slash
        self.parser.read()

        if parsedurl.netloc.startswith('www.'):  # compare netlocs without www.
            self.netloc = parsedurl.netloc[4:]
        else:
            self.netloc = parsedurl.netloc

        html = self.gethtml(url)

        if html is not None:
            self.visited = {}
            self.current = {}
            self.currentassets = {}
            self.output = []
            self.queue = Queue()
            if len(parsedurl.path) < 1:
                self.visited['/index.html'] = True
            self.crawlhtml(url, html)
        else:
            print("Sorry, couldn't find HTML at that URL!")

    def isabsolute(self, url):
        return bool(urlparse.urlparse(url).netloc)

    def checkifhtml(self, url):
        # True if the url looks like an HTML page; unseen pages are also queued
        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path

        if url.endswith('.html') or '.' not in path:  # path is a html file
            if path not in self.visited:
                self.queue.put(url)
            return True
        return False

    def getasseturl(self, current_url, url):
        if not self.isabsolute(url):  # make our relative url absolute
            url = urlparse.urljoin(current_url, url)

        parsedurl = urlparse.urlparse(url)
        path = parsedurl.path
        netloc = parsedurl.netloc
        local = False

        if netloc.startswith('www.'):  # check if is local url
            netloc = netloc.replace('www.', '', 1)
        if netloc == self.netloc:
            local = True

        if self.currentassets.get(path) is None:
            self.currentassets[path] = True
            if local:
                if not self.checkifhtml(url):
                    self.current['assets'].append(url)

    def checkqueue(self):
        print('Checking queue. Queue Size: ' + str(self.queue.qsize()))
        if self.queue.qsize() == 0:
            print('\n------------------------------------------------------\n')
            print(json.dumps(self.output, indent=4))
            print('\n------------------------------------------------------\n')
            print(self.visited)
        else:
            url = self.queue.get()
            parsedurl = urlparse.urlparse(url)
            path = parsedurl.path
            if self.visited.get(path) is None:
                self.visited[path] = True
                html = self.gethtml(url)
                if html is not None:
                    self.crawlhtml(url, html)
                else:
                    self.checkqueue()
            else:
                self.checkqueue()

    def crawlhtml(self, url, html):
        print('---------------------------------------\nLooking at url: ' + url)
        if self.parser.can_fetch('*', url):
            self.current['url'] = url
            self.current['assets'] = []

            parsedhtml = BeautifulSoup(html, 'lxml')  # use lxml for speed

            # elements whose href may point at pages or assets
            for link in parsedhtml.find_all(['a', 'link', 'area', 'base', 'image']):
                if link.get('href') is not None:
                    if link.get('rel') is None:
                        self.getasseturl(url, link.get('href'))
                    elif 'alternate' not in link.get('rel'):
                        self.getasseturl(url, link.get('href'))

            # elements whose src points at assets
            for link in parsedhtml.find_all(['script', 'img', 'frame', 'iframe',
                                             'input', 'audio', 'embed', 'source', 'video']):
                if link.get('src') is not None:
                    self.getasseturl(url, link.get('src'))

            self.output.append(self.current)
            self.current = {}
            self.currentassets = {}

        self.checkqueue()


c = Crawler()
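On the maximum recursion depth problem mentioned above: I suspect the underlying cause is that crawlhtml() and checkqueue() call each other, so the call stack grows with every page. Would driving the queue with a loop along these lines be the right direction? (a sketch against the class above, not yet integrated)

    def run(self):
        # hypothetical replacement for the mutual recursion between
        # crawlhtml() and checkqueue(): pull from the queue in a loop instead
        # (crawlhtml() would then no longer call checkqueue() at the end)
        while not self.queue.empty():
            url = self.queue.get()
            path = urlparse.urlparse(url).path
            if self.visited.get(path) is None:
                self.visited[path] = True
                html = self.gethtml(url)
                if html is not None:
                    self.crawlhtml(url, html)
        print(json.dumps(self.output, indent=4))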