I was trying to crawl amazon grocery uk, and to get the grocery categories, I was using the Associate Product Advertising api. My requests get enqueued however as the requests have an expiry of 15 mins, some requests are crawled after 15 mins of being enqueued which means they get expired by the time they are crawled and yield a 400 error. I was thinking of a solution of enqueueing requests in a batch, but even that will fail if the implementation controls processing them in batches as the problem is preparing the request in batches as opposed to processing them in batches. Unfortunately, Scrapy has little documentation for this use case, so how can requests be prepared in batches?
from scrapy.spiders import XMLFeedSpider
from scrapy.utils.misc import arg_to_iter
from scrapy.loader.processors import TakeFirst
from crawlers.http import AmazonApiRequest
from crawlers.items import (AmazonCategoryItemLoader)
from crawlers.spiders import MySpider
class AmazonCategorySpider(XMLFeedSpider, MySpider):
name = 'amazon_categories'
allowed_domains = ['amazon.co.uk', 'ecs.amazonaws.co.uk']
marketplace_domain_name = 'amazon.co.uk'
download_delay = 1
rotate_user_agent = 1
grocery_node_id = 344155031
# XMLSpider attributes
iterator = 'xml'
itertag = 'BrowseNodes/BrowseNode/Children/BrowseNode'
def start_requests(self):
return arg_to_iter(
AmazonApiRequest(
qargs=dict(Operation='BrowseNodeLookup',
BrowseNodeId=self.grocery_node_id),
meta=dict(ancestor_node_id=self.grocery_node_id)
))
def parse(self, response):
response.selector.remove_namespaces()
has_children = bool(response.xpath('//BrowseNodes/BrowseNode/Children'))
if not has_children:
return response.meta['category']
# here the request should be configurable to allow batching
return super(AmazonCategorySpider, self).parse(response)
def parse_node(self, response, node):
category = response.meta.get('category')
l = AmazonCategoryItemLoader(selector=node)
l.add_xpath('name', 'Name/text()')
l.add_value('parent', category)
node_id = l.get_xpath('BrowseNodeId/text()', TakeFirst(), lambda x: int(x))
l.add_value('node_id', node_id)
category_item = l.load_item()
return AmazonApiRequest(
qargs=dict(Operation='BrowseNodeLookup',
BrowseNodeId=node_id),
meta=dict(ancestor_node_id=node_id,
category=category_item)
)