I'm trying to implement errbacks to catch exceptions during request processing, following the documentation, but I can't get it to work. The difference between the documentation example and my code is that mine is a broad crawl. I also tried the Stack Overflow answer found here, but I couldn't get that working either. My end goal is to catch those exceptions and pass them into Items.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from crawler.items import DomainItem
from scrapy.loader import ItemLoader
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError
denylist = [
'google.com',
]
class Crawler(CrawlSpider):
name = "crawler"
    rules = (Rule(LinkExtractor(allow=[r'quotes.toscrape1.com'], deny=denylist),
                  follow=True, callback='parse_item', errback='error_back'),)
start_urls = [
"http://quotes.toscrape1.com/",
]
    def parse_item(self, response):
        # successful responses: store the crawled URL in a DomainItem
        self.logger.info('LOGGER %s', response.url)
        l = ItemLoader(item=DomainItem(), response=response)
        l.add_value('domain', response.url)
        return l.load_item()
def error_back(self, failure):
# log all failures
self.logger.error(repr(failure))
if failure.check(HttpError):
# these exceptions come from HttpError spider middleware
# you can get the non-200 response
response = failure.value.response
self.logger.error('HttpError on %s', response.url)
elif failure.check(DNSLookupError):
# this is the original request
request = failure.request
self.logger.error('DNSLookupError on %s', request.url)
elif failure.check(TimeoutError, TCPTimedOutError):
request = failure.request
self.logger.error('TimeoutError on %s', request.url)