I'm new to Scrapy, and I'm trying to build a spider that must do this kind of work:
Extract all links from a generic web page, recursively and up to a specific depth.
I'm trying to do this with the following code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.conf import settings

from myproject.items import NewsItem  # my item class (project module name shortened here)

class MySpider(CrawlSpider):
    # Try to limit the crawl to one level below the start URL
    settings.overrides['DEPTH_LIMIT'] = 1

    name = "cnet"
    allowed_domains = ["cnet.com"]
    start_urls = ["http://www.cnet.com/"]

    # Follow every cnet.com link and run parse_items on each response
    rules = (
        Rule(SgmlLinkExtractor(allow_domains=('cnet.com',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        print ""
        print "PARSE ITEMS"
        print ""
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//a')
        items = []
        for title in titles:
            item = NewsItem()
            item["title"] = title.select("text()").extract()
            item["link"] = title.select("@href").extract()
            # Keep only links that point back to the allowed domain
            if len(item["link"]) > 0 and self.allowed_domains[0] in item["link"][0]:
                print ""
                print response.meta['depth']
                print item["title"]
                print item["link"]
                print ""
                items.append(item)
        return items
But it seems to go into an INFINITE loop. Any suggestions?
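In case it matters, my understanding is that the depth limit is usually configured through Scrapy's DEPTH_LIMIT setting in the project's settings.py (or passed with -s DEPTH_LIMIT=1 on the command line) rather than overridden inside the spider class. A minimal sketch of what I mean, with the file name taken from the standard project layout:

# settings.py -- the project's settings module
# Follow links at most one level below the start URLs
DEPTH_LIMIT = 1

I'm not sure whether setting it from the class body, as in my code above, has the same effect.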
Thanks a lot!