-1

I am trying to scrape from the website https://tonaton.com/en/ads/ghana/electronics. There is a "next" button that I want to click and scrape the contents. The problem is the xpath or css selector of that button does not return any value in neither the scrapy shell nor splash and I am stuck. I can't get in to scrape what I need to. Please any help? This is how far I have been able to come but I'm not getting the right results.

# -*- coding: utf-8 -*-

import scrapy import scrapy_selenium from scrapy_selenium import SeleniumRequest

class VisionSpider(scrapy.Spider): name = 'vision'

def start_requests(self):
    yield SeleniumRequest(
        url= 'https://tonaton.com',
        wait_time=3,
        screenshot=True,
        callback=self.parse
    )


def parse(self, response): 
    businesses = response.xpath(
        "//a[@class='link--1t8hM gtm-home-category-link-click']")
    for business in businesses:
        link = business.xpath(".//@href").get()
        category = business.xpath(".//div[2]/p/text()").get()

        yield response.follow(url=link, callback=self.parse_business, meta={'business_category': category})


def parse_business(self, response):
    
    category = response.request.meta['business_category']
    rows = response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']")
    for row in rows:
        new_link = row.xpath(".//@href").get()

        yield response.follow(url=new_link, callback=self.next_parse, meta={'business_category': category})

    next_page = response.xpath("//div[@class = 'action-button--1O8tU']")
    if next_page:
        button = next_page.click()
        yield SeleniumRequest(
            url=button,
            wait_time=3,
            callback=self.parse
        )



def next_parse(self, response):
    category = response.request.meta['business_category']
    lines = response.xpath("//a[@class='member-link--IzDly gtm-visit-shop']")
    for line in lines:
        next_link = line.xpath(".//@href").get()

        yield response.follow(url=next_link, callback=self.another_parse, meta={'business_category': category})

def another_parse(self, response):
    category = response.request.meta['business_category']
    button = response.xpath("//button[@class = 'contact-section--1qlvP gtm-show-number']").click()
    
    yield response.follow(url=button, callback=self.new_parse, meta={'business_category': category})


def new_parse(self, response):
    category = response.request.meta['business_category']
    times = response.xpath("//div[@class='info-container--3pMhK']")
    for time in times:
        name = time.xpath(".//div/span/text()").get()
        location = time.xpath(".//div/div/div/span/text()").get()
        phone = time.xpath(".//div[3]/div/button/div[2]/div/text()").get()

        yield {
            'business_category': category,
            'business_name': name,
            'phone': phone,
            'location': location
        }
  • 1
    What do you need to scrape? I clicked the "next" button and it changed the entire URL of the website in a predictable way: `?page=2`. Could you not grab the information needed on page one and keep changing URL until you reach the last page? – Isolated Aug 30 '21 at 16:21
  • I understand I can do that. But is that not going to be a lot of work? Or is there a way to automate it to iterate through all the next pages? – Danny Stringz Aug 30 '21 at 19:29

1 Answers1

0

I have tried this but the pagination is still not working. Also where I click the call button to scrape, it takes quite a long time to return the desired output. Is there a way to make it faster?

class VisionSpider(scrapy.Spider):
    name = 'vision'
    main_domains = ['tonaton.com']
    start_urls =['https://tonaton.com']

def parse(self, response):   
    businesses = response.xpath("//a[@class='link--1t8hM gtm-home-category-link-click'][1]")
    for business in businesses:
        link = business.xpath(".//@href").get()
        category = business.xpath(".//div[2]/p/text()").get()

        yield response.follow(url=link, callback=self.parse_business, meta={'business_category': category})


def parse_business(self, response):
    category = response.request.meta['business_category']
    rows = response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']")
    for row in rows:
        new_link = row.xpath(".//@href").get()
        if new_link:

            yield response.follow(url=new_link, callback=self.new_parse, meta={'business_category': category, 'newlink':new_link})

    chrome_options = Options()
    chrome_options.add_argument("--headless")

    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
    driver.get(response.url)
    driver.maximize_window

    next_page = wait(driver, 300).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//div[@class='icon--3D09z extra-small--_AIuZ arrow-right--17oRn']"))) 
    if  next_page:
        next_page.click()

        yield SeleniumRequest(callback=self.parse_business)
    
    driver.close()



def new_parse(self, response):
    category = response.request.meta['business_category']
    chrome_options = Options()
    chrome_options.add_argument("--headless")
# options=chrome_options
    chrome_path = which("chromedriver")  
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
    driver.get(response.url)
    driver.maximize_window
    category = response.request.meta['business_category']

    call_button = wait(driver, 500).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='call-button--3uvWj']")))
    call_button.click()
    
    html = driver.page_source
    resp = Selector(text=html)

    driver.close()

    contacts = resp.xpath("//div[@class='call-button--3uvWj']/div[1]")
    for contact in contacts:
        phone = contact.xpath(".//text()").get()
    times = resp.xpath("//div[@class='details-section--2ggRy']")
    for time in times:
        name = time.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/text()").get()
        if name is None:
            name =time.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/div/text()").get()

        location = time.xpath(".//div/div/div/span/a/span/text()[1]").get()
        region = time.xpath(".//div/div/div/span/a[2]/span/text()").get()

        yield {
            'business_category': category,
            'business_name': name,
            'phone': phone,
            'region':region,
            'location': location
        }