Scrapy-Selenium Pagination

Question

Can anyone help me? I'm practicing and can't figure out what I did wrong with the pagination. Sometimes the error below comes up, and even when the spider runs without it, only the first page is returned.

"The source list for the Content Security Policy directive 'frame-src' contains an invalid source '*trackcmp.net' It will be ignored", source: https://naturaldaterra.com.br/hortifruti.html?page=2"

import scrapy
from scrapy_selenium import SeleniumRequest

class ComputerdealsSpider(scrapy.Spider):
    """Spider for the paginated hortifruti listing on naturaldaterra.com.br.

    Every page is requested through ``SeleniumRequest`` and handled by
    ``parse``, which emits the products it finds and then tries to schedule
    the next listing page.
    """

    name = 'produtos'

    def start_requests(self):
        """Yield the ``SeleniumRequest`` for page 1 of the listing."""
        first_page = SeleniumRequest(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            wait_time=3,
            callback=self.parse,
        )
        yield first_page

    def parse(self, response):
        """Yield one dict per product tile, then a request for the next page.

        Each item carries ``nome_produto`` (first matching text node, or
        ``None``) and ``valor_produto`` (list of all matching text nodes).
        A follow-up request is only yielded when the pagination XPath
        produces a non-empty value.
        """
        tiles = response.xpath("//div[@class='gallery-items-1IC']/div")
        for tile in tiles:
            nome = tile.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get()
            valores = tile.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall()
            yield {'nome_produto': nome, 'valor_produto': valores}

        # The text of the matched pagination button is used as the page number.
        next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()
        if not next_page:
            return

        yield SeleniumRequest(
            url=f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}",
            wait_time=3,
            callback=self.parse,
        )

score 2 · Accepted Answer · answered Oct 19 '21 at 09:12

The problem is that your xpath selector returns None instead of the next page number. Consider changing it from

next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()

to

next_page = response.xpath("//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()

For future projects, consider using scrapy-playwright to scrape JS-rendered websites. It is faster and simpler to use. Below is a sample implementation of your scraper using scrapy-playwright:

import scrapy
from scrapy.crawler import CrawlerProcess


class ComputerdealsSpider(scrapy.Spider):
    """Spider for the paginated hortifruti listing, fetched via scrapy-playwright.

    Requests carry ``meta={"playwright": True}``; no explicit callback is
    passed, so responses are handled by ``parse``.
    """

    name = 'produtos'

    def start_requests(self):
        """Yield the request for page 1 of the listing."""
        yield scrapy.Request(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            meta={"playwright": True}
        )

    def parse(self, response):
        """Yield one dict per product tile, then a request for the next page.

        Each item carries ``nome_produto`` (first matching text node, or
        ``None``) and ``valor_produto`` (list of all matching text nodes).
        Pagination stops when no page number can be read from the button
        that follows the active pagination button.
        """
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        # The page number is the text of the button right after the active one.
        next_page = response.xpath(
            "//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
        # .get() returns None when nothing matches (presumably on the last
        # page); concatenating None into the URL would raise TypeError, so
        # end the crawl here instead.
        if not next_page:
            return

        yield scrapy.Request(
            url='https://naturaldaterra.com.br/hortifruti.html?page=' + next_page,
            meta={"playwright": True}
        )


if __name__ == "__main__":
    # Settings that route https requests through the scrapy-playwright
    # download handler and select the asyncio-based Twisted reactor.
    playwright_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    # Run the spider directly as a script.
    crawler = CrawlerProcess(settings=playwright_settings)
    crawler.crawl(ComputerdealsSpider)
    crawler.start()

Scrapy-Selenium Pagination

1 Answer