I'm trying to scrape data from a site using Scrapy, but the site is protected by Cloudflare. I found a possible solution — cloudscraper — and it really can get around the protection, but I don't understand how to use it together with Scrapy.
I tried writing something like this:
import scrapy
from scrapy.xlib.pydispatch import dispatcher
import cloudscraper
import requests
from scrapy.http import Request, FormRequest
class PycoderSpider(scrapy.Spider):
name = 'armata_exper'
start_urls = ['https://arma-models.ru/catalog/sbornye_modeli/?limit=48']
def start_requests(self):
url = "https://arma-models.ru/catalog/sbornye_modeli/?limit=48"
scraper = cloudscraper.CloudScraper()
cookie_value, user_agent = scraper.get_tokens(url)
yield scrapy.Request(url, cookies=cookie_value, headers={'User-Agent': user_agent})
def parse(self, response):
....
but I'm getting this error:
Traceback (most recent call last):
File "/usr/lib/python3.6/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
*arguments, **named)
File "/usr/lib/python3.6/site-packages/pydispatch/robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "/usr/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 343, in request_scheduled
redirected_urls = request.meta.get('redirect_urls', [])
AttributeError: 'Response' object has no attribute 'meta'
Unhandled Error
Traceback (most recent call last):
File "/usr/lib/python3.6/site-packages/scrapy/commands/crawl.py", line 58, in run
self.crawler_process.start()
File "/usr/lib/python3.6/site-packages/scrapy/crawler.py", line 309, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/usr/lib64/python3.6/site-packages/twisted/internet/base.py", line 1283, in run
self.mainLoop()
File "/usr/lib64/python3.6/site-packages/twisted/internet/base.py", line 1292, in mainLoop
self.runUntilCurrent()
--- <exception caught here> ---
File "/usr/lib64/python3.6/site-packages/twisted/internet/base.py", line 913, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python3.6/site-packages/scrapy/utils/reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw)
File "/usr/lib/python3.6/site-packages/scrapy/core/engine.py", line 135, in _next_request
self.crawl(request, spider)
File "/usr/lib/python3.6/site-packages/scrapy/core/engine.py", line 210, in crawl
self.schedule(request, spider)
File "/usr/lib/python3.6/site-packages/scrapy/core/engine.py", line 216, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "/usr/lib/python3.6/site-packages/scrapy/core/scheduler.py", line 91, in enqueue_request
if not request.dont_filter and self.df.request_seen(request):
builtins.AttributeError: 'Response' object has no attribute 'dont_filter'
Please tell me how to do this correctly.