I have written my own scrapy download middleware to simply check db for exist request.url, if so raise IgnoreRequestf
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
sql = """SELECT url FROM domain_sold WHERE url = %s;"""
try:
cursor = spider.db_connection.cursor()
cursor.execute(sql, (request.url,))
is_seen = cursor.fetchone()
cursor.close()
if is_seen:
raise IgnoreRequest('duplicate url {}'.format(request.url))
except (Exception, psycopg2.DatabaseError) as error:
self.logger.error(error)
return None
if IgnoreRequest is raised I expect the spider would continue onto another request but in my case the spider would still continue scraping that request and pipe through the item through my custom pipeline.
I currently have my setting for the dl mw as below
'DOWNLOADER_MIDDLEWARES' : { 'realestate.middlewares.RealestateDownloaderMiddleware': 99
could anyone suggest to why this is happening. Thanks