import urllib
import urlparse

import mechanize
from bs4 import BeautifulSoup

url = "http://www.wholefoodsmarket.com/forums"
br = mechanize.Browser()
urls = [url]      # queue of pages still to crawl
visited = [url]   # pages already seen, so nothing is enqueued twice

while len(urls) > 0:
    current = urls.pop(0)          # pop once, before anything can fail
    try:
        br.open(current)
        for link in br.links():
            newurl = urlparse.urljoin(link.base_url, link.url)
            # normalise to scheme + host + path (drops query string and fragment)
            parsed = urlparse.urlparse(newurl)
            newurl = "http://" + parsed.hostname + parsed.path
            # follow only unseen links that stay on the same host
            if newurl not in visited and urlparse.urlparse(url).hostname in newurl:
                urls.append(newurl)
                visited.append(newurl)
                ur = urllib.urlopen(newurl)
                soup = BeautifulSoup(ur.read(), "html.parser")
                print newurl       # progress output
                f = open('content.txt', 'a')
                f.write(newurl)
                f.write("\n")
                if soup.title and soup.title.string:
                    f.write(soup.title.string.encode("utf-8"))
                f.write("\n")
                # soup.head / soup.body are Tag objects; convert to strings before writing
                f.write(str(soup.head))
                f.write("\n")
                f.write(str(soup.body))
                f.write("\n")
                print >>f, "Next Link\n"
                f.close()
    except Exception as e:
        print "error:", e
I am trying to recursively crawl HTML pages until I have collected about 1 GB of data, and then extract the relevant text, i.e. discard all code and HTML tags. Can someone suggest a link or an approach I can follow?
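For the text-extraction part, something along these lines is what I had in mind: a minimal sketch using BeautifulSoup's get_text(), with decompose() used first to drop the <script> and <style> blocks so their contents don't end up mixed into the text. (The URL is just the forum page from my crawler above; the function name is my own.)

    import urllib
    from bs4 import BeautifulSoup

    def extract_text(html):
        soup = BeautifulSoup(html, "html.parser")
        # drop code-bearing tags so their contents are not treated as text
        for tag in soup(["script", "style"]):
            tag.decompose()
        # get_text() concatenates the remaining visible text nodes
        return soup.get_text(separator="\n", strip=True)

    html = urllib.urlopen("http://www.wholefoodsmarket.com/forums").read()
    print extract_text(html)

For the 1 GB cap I was planning to check os.path.getsize('content.txt') after each write and break out of the crawl loop once it passes 1024 ** 3 bytes, but I am not sure whether that is the right way to bound the crawl.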