Can't figure out why CrawlSpider in scrapy doesn't do pagination despite setting rules.
However, if change start_url to http://bitcoin.travel/listing-category/bitcoin-hotels-and-travel/ and comment out the parse_start_url i get more items scraped for the above page.
My target is to scrape all categories. Please any idea what am doing wrong?
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bitcointravel.items import BitcointravelItem
class BitcoinSpider(CrawlSpider):
name = "bitcoin"
allowed_domains = ["bitcoin.travel"]
start_urls = [
"http://bitcoin.travel/categories/"
]
rules = (
# Extract links matching 'item.php' and parse them with the spider's method parse_item
Rule(LinkExtractor(allow=('.+/page/\d+/$'), restrict_xpaths=('//a[@class="next page-numbers"]'),),
callback='parse_items', follow=True),
)
def parse_start_url(self, response):
for sel in response.xpath("//ul[@class='maincat-list']/li"):
url = sel.xpath('a/@href').extract()[0]
if url == 'http://bitcoin.travel/listing-category/bitcoin-hotels-and-travel/':
# url = 'http://bitcoin.travel/listing-category/bitcoin-hotels-and-travel/'
yield scrapy.Request(url, callback=self.parse_items)
def parse_items(self, response):
self.logger.info('Hi, this is an item page! %s', response.url)
for sel in response.xpath("//div[@class='grido']"):
item = BitcointravelItem()
item['name'] = sel.xpath('a/@title').extract()
item['website'] = sel.xpath('a/@href').extract()
yield item
This is the result
{'downloader/request_bytes': 574,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 98877,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'dupefilter/filtered': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 2, 15, 13, 44, 17, 37859),
'item_scraped_count': 24,
'log_count/DEBUG': 28,
'log_count/INFO': 8,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2016, 2, 15, 13, 44, 11, 250892)}
2016-02-15 14:44:17 [scrapy] INFO: Spider closed (finished)
Item count is suppose to be 55 not 24