I am using Scrapy together with Selenium and the Chrome driver to scrape a website. I cannot scrape the site with Scrapy alone, because some protection mechanism on the site returns a 404; when I use Selenium with Scrapy, I can access the page HTML. The problem is that the session cookies Selenium establishes are not retained on the links that Scrapy then requests, and I need that session to set parameters like country, language, etc.
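What I am trying to achieve is roughly the hand-off sketched below: copy whatever cookies the Chrome session has accumulated into the Scrapy request. This is only a sketch of the idea, and get_driver_cookies is an illustrative helper name, not an existing API:

import scrapy
from selenium import webdriver

def get_driver_cookies(driver):
    # Selenium's get_cookies() returns a list of dicts such as
    # {'name': ..., 'value': ..., 'domain': ..., 'path': ...};
    # scrapy.Request's cookies argument accepts a plain {name: value} mapping.
    return {c['name']: c['value'] for c in driver.get_cookies()}

driver = webdriver.Chrome()
driver.get('http://www.example.com/')
# Hand the browser's session cookies to Scrapy so that, hopefully,
# the site treats the follow-up request as the same session.
request = scrapy.Request('http://www.example.com/',
                         cookies=get_driver_cookies(driver))

My current spider, which does not do any of this yet, is below: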
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver


class SettingSpider(scrapy.Spider):
    name = 'setting'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def start_requests(self):
        url = 'http://www.example.com/'
        # Load the page in Selenium first to get past the protection,
        # then hand the URL to Scrapy (this is where the session is lost).
        self.driver.get(url)
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        csrf = response.xpath('//input[@name="CSRFToken"]/@value').extract_first().strip()
        print('------------->' + csrf)
        url = 'http://www.example.com/settings'
        form_data = {'shippingCountry': 'ARE', 'language': 'en', 'billingCurrency': 'USD',
                     'indicativeCurrency': '', 'CSRFToken': csrf}
        yield scrapy.FormRequest(url, formdata=form_data, callback=self.after_post)

    def getShippingCountry(self, response):
        country = response.css("a#shipping-country::text").extract_first().strip()
        return country

    def after_post(self, response):
        country = self.getShippingCountry(response)
        print('------------->' + country)
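If that cookie hand-off is the right direction, I assume the same cookies argument could also be passed to the FormRequest that posts the settings form, something like:

yield scrapy.FormRequest(url, formdata=form_data,
                         cookies=get_driver_cookies(self.driver),
                         callback=self.after_post)

(get_driver_cookies being the illustrative helper from the sketch above). But so far I have not found a way to make the session stick across all the links the spider follows.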