I'm using the following to get all external Javascript references from a web page. How can I modify the code to search not only the url, but all pages of the website?
import httplib2
from BeautifulSoup import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('https://stackoverflow.com')
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('script')):
if link.has_key('src'):
if 'http' in link['src']:
print link['src']
First attempt to make it scrape two pages deep below. Any advice on how to make it return only unique urls? As is, most are duplicates. (note that all internal links contain the word "index" on the sites I need to run this on.)
import httplib2
from BeautifulSoup import BeautifulSoup, SoupStrainer
site = 'http://www.stackoverflow.com/'
http = httplib2.Http()
status, response = http.request(site)
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
if link.has_key('href'):
if 'index' in link['href']:
page = site + link['href']
status, response = http.request(page)
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('script')):
if link.has_key('src'):
if 'http' in link['src']:
print "script" + " " + link['src']
for iframe in BeautifulSoup(response, parseOnlyThese=SoupStrainer('iframe')):
print "iframe" + " " + iframe['src']
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
if link.has_key('href'):
if 'index' in link['href']:
page = site + link['href']
status, response = http.request(page)
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('script')):
if link.has_key('src'):
if 'http' in link['src']:
print "script" + " " + link['src']
for iframe in BeautifulSoup(response, parseOnlyThese=SoupStrainer('iframe')):
print "iframe" + " " + iframe['src']