I have a list of about 250 website URLs, and for each site I need to collect the URLs of every page on that site. The problem is that some sites are so large that my spider effectively crawls them forever. I'm trying to put a limit on this with the following code, but the limit isn't being enforced:
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import IgnoreRequest
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from scrapy import Spider
class MySpider(Spider):
    name = "spider"
    allowed_domains = [
        MY_250_DOMAINS_GO_HERE
    ]
    start_urls = []
    for domain in allowed_domains:
        start_urls.append('http://%s' % domain)
    output_file = open("iterable_links.txt", "w+")

    LIMIT = 10
    count = 0

    def parse(self, response):
        if self.count >= self.LIMIT:
            raise IgnoreRequest()
            # raise CloseSpider(f"Scraped {self.LIMIT} items. Eject!")
        self.count += 1
        le = LinkExtractor()
        domain = response.url.replace("http://", "").replace("https://", "").split("/")[0]
        links = le.extract_links(response)
        links = [k for k in links if domain in k.url]
        output_file = open("iterable_links.txt", "a+")
        for link in links:
            output_file.write("'" + link.url + "',\n")
            yield Request(link.url, callback=self.parse)
"""
REFERENCE:
https://stackoverflow.com/questions/9561020/how-do-i-use-the-python-scrapy-module-to-list-all-the-urls-from-my-website
"""