I have a list of data objects each of them containing a url to be scraped. Some of these urls are not valid but I still want the data object to fall through to reach item pipelines.
After @tomáš-linhart reply I understood that using a middleware will not work in this case as scrapy will not allow me to create request object in the first place.
An alternative is to return item instead of request if url is not valid.
Following is my code:
def start_requests(self):
rurls = json.load(open(self.data_file))
for data in rurls[:100]:
url = data['Website'] or ''
rid = data['id']
# skip creating requests for invalid urls
if not (url and validators.url(url)):
yield self.create_item(rid, url)
continue
# create request object
request_object = scrapy.Request(url=url, callback=self.parse, errback=self.errback_httpbin)
# populate request object
request_object.meta['rid'] = rid
self.logger.info('REQUEST QUEUED for RID: %s', rid)
yield request_object
The above code is throwing an error as shown. More than the error , I am not sure how to trace the issue's origin. :(
2017-09-22 12:44:38 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method RefererMiddleware.request_scheduled of <scrapy.spidermiddlewares.referer.RefererMiddleware object at 0x10f603ef0>>
Traceback (most recent call last):
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
*arguments, **named)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/pydispatch/robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/spidermiddlewares/referer.py", line 343, in request_scheduled
redirected_urls = request.meta.get('redirect_urls', [])
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/item.py", line 74, in __getattr__
raise AttributeError(name)
AttributeError: meta
Unhandled Error
Traceback (most recent call last):
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/commands/crawl.py", line 58, in run
self.crawler_process.start()
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/crawler.py", line 285, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/twisted/internet/base.py", line 1243, in run
self.mainLoop()
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/twisted/internet/base.py", line 1252, in mainLoop
self.runUntilCurrent()
--- <exception caught here> ---
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/twisted/internet/base.py", line 878, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/utils/reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/engine.py", line 135, in _next_request
self.crawl(request, spider)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/engine.py", line 210, in crawl
self.schedule(request, spider)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/engine.py", line 216, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/scheduler.py", line 54, in enqueue_request
if not request.dont_filter and self.df.request_seen(request):
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/item.py", line 74, in __getattr__
raise AttributeError(name)
builtins.AttributeError: dont_filter
2017-09-22 12:44:38 [twisted] CRITICAL: Unhandled Error
Traceback (most recent call last):
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/commands/crawl.py", line 58, in run
self.crawler_process.start()
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/crawler.py", line 285, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/twisted/internet/base.py", line 1243, in run
self.mainLoop()
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/twisted/internet/base.py", line 1252, in mainLoop
self.runUntilCurrent()
--- <exception caught here> ---
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/twisted/internet/base.py", line 878, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/utils/reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/engine.py", line 135, in _next_request
self.crawl(request, spider)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/engine.py", line 210, in crawl
self.schedule(request, spider)
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/engine.py", line 216, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/core/scheduler.py", line 54, in enqueue_request
if not request.dont_filter and self.df.request_seen(request):
File "/myhome/.virtualenvs/myproj/lib/python3.5/site-packages/scrapy/item.py", line 74, in __getattr__
raise AttributeError(name)
builtins.AttributeError: dont_filter