We can set a proxy on a per-request basis through the request's meta attribute:
    # inside a spider callback; Request comes from the scrapy package
    from scrapy import Request

    request = Request(url="http://example.com")
    request.meta['proxy'] = "http://host:port"  # replace with your proxy
    yield request
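If the proxy requires authentication, the credentials can be embedded in the proxy URL itself and Scrapy's HttpProxyMiddleware will pick them up from there. A minimal sketch, where user, password, host and port are all placeholders:

    # a minimal sketch, assuming a proxy that accepts basic auth;
    # user, password, host and port are placeholders
    request.meta['proxy'] = "http://user:password@host:port"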
A simple spider implementation looks like this:
    import scrapy

    class MySpider(scrapy.Spider):
        name = "examplespider"
        allowed_domains = ["somewebsite.com"]
        start_urls = ['http://somewebsite.com/']

        def parse(self, response):
            # http://example.com is a placeholder; in practice you would
            # extract this URL from the page you are scraping
            request = scrapy.Request(url='http://example.com', callback=self.parse_url)
            request.meta['proxy'] = "http://host:port"  # replace with your proxy
            yield request

        def parse_url(self, response):
            # do the rest of the parsing work here
            pass
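To try the spider out from a plain script, one option is CrawlerProcess, which is part of Scrapy's public API. A minimal sketch, assuming the spider class above is defined in (or importable into) the same file:

    # a minimal sketch for running MySpider standalone
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes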
If you want to use the proxy for the initial requests generated from start_urls, add the following as a spider class attribute:
    class MySpider(scrapy.Spider):
        name = "examplespider"
        allowed_domains = ["somewebsite.com"]
        start_urls = ['http://somewebsite.com/']
        custom_settings = {
            'HTTPPROXY_ENABLED': True
        }
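As an aside, HttpProxyMiddleware also honours the standard proxy environment variables, so a process-wide proxy can be set without touching request meta at all. A minimal sketch, with host:port as a placeholder; note the variables need to be set before the crawler process starts, since the middleware reads them at initialization:

    # a minimal sketch: set these before the crawler process is created;
    # host:port is a placeholder for your proxy
    import os
    os.environ['http_proxy'] = 'http://host:port'
    os.environ['https_proxy'] = 'http://host:port'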
And then override the start_requests() method as below:
    def start_requests(self):
        urls = ['http://example.com']
        for url in urls:
            proxy = 'http://host:port'  # replace with your proxy
            yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': proxy})

    def parse(self, response):
        item = StatusCheckerItem()  # your scrapy.Item subclass (sketched below)
        item['url'] = response.url
        return item
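For completeness, StatusCheckerItem is not defined in the snippet above; a minimal sketch of what it could look like, assuming it lives in your project's items.py:

    import scrapy

    # a minimal sketch of the item used in parse() above;
    # only the 'url' field is needed for that callback
    class StatusCheckerItem(scrapy.Item):
        url = scrapy.Field()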