
I can't get my scraper to follow the links into `parse_jobs`: I only get a single result, and it won't iterate over each href I've grabbed. I get a lot of output that isn't meaningful, and while the request returns a 200, I don't actually get much info back. I suspect it may be my XPaths, or how I've set up the requests for my scraper.

UPDATE: I have fixed the single-result issue; I was missing the parentheses on a `.get()` call. However, I can only scrape a single page, and the scraper won't go into the next pages to scrape any info.
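
For illustration, this is the difference that caused the single-result bug (a generic selector, not this site's actual markup):

    # Forgetting the parentheses binds the method object instead of calling it:
    title = response.xpath("//h1/text()").get    # bound method, always truthy
    title = response.xpath("//h1/text()").get()  # the extracted string, or None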

Here is my scraper:

import hashlib
from pathlib import Path
from typing import Generator, Optional

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
from scrapy_playwright.page import PageCoroutine


class JobSpider(scrapy.Spider):
    name = 'job_play'
    start_urls = ['https://jobsite.co.uk/jobs/Degree-Accounting-and-Finance']

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15'
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                meta=dict(
                    playwright=True,
                    playwright_include_page=True,
                    playwright_page_coroutines=[
                        # wait until the results grid has rendered
                        PageCoroutine('wait_for_selector', 'div.row.job-results-row')
                    ],
                ),
            )
    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        # read the highest page number from the pagination bar
        last_page = response.xpath('//div[@class="row job-results-row"]//a[5]//text()').get()
        last_page = int(last_page)
        for page in range(2, last_page + 1):
            yield response.follow(
                f"https://jobsite.co.uk/jobs?page={page}&action=paging_next.html",
                cb_kwargs={"current_page": page},
            )

        current_page = current_page or 1
        # follow each job link on the current results page
        for job in response.xpath("//article//div//div[position() mod 7 = 6]/a//@href"):
            yield response.follow(
                job,
                callback=self.parse_jobs,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": f"page-{current_page}",
                },
            )

    async def parse_jobs(self, response):
        # hash the URL for a stable, filesystem-safe screenshot filename
        url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
        page = response.meta["playwright_page"]
        await page.screenshot(
            path=Path(__file__).parent / "job_test" / f"{url_sha256}.png", full_page=True
        )
        await page.close()
        yield {
            "url": response.url,
            "title": response.xpath("//h1[@class='brand-font']//text()").get(),
            "price": response.xpath("//li[@class='salary icon']//div//text()").get(),
            "organisation": response.xpath("//a[@id='companyJobsLink']//text()").get(),
            "image": f"job_test/{url_sha256}.png",
        }


if __name__ == "__main__":
    process = CrawlerProcess(
        settings={
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "DOWNLOAD_HANDLERS": {
                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
                "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            },
            "CONCURRENT_REQUESTS": 32,
            "CLOSESPIDER_ITEMCOUNT": 100,
            "FEED_URI":'jobs.jl',
            "FEED_FORMAT":'jsonlines',
        }
    )
    process.crawl(JobSpider)
    process.start()
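
For reference, one asymmetry worth noting in the code above: the request built in `start_requests` carries the `playwright` meta, but the pagination `response.follow` in `parse` does not, so pages 2 and up would be fetched without browser rendering. A minimal sketch of a follow that carries the same meta (untested against this site; reusing the wait selector here is an assumption):

    for page in range(2, last_page + 1):
        yield response.follow(
            f"https://jobsite.co.uk/jobs?page={page}&action=paging_next.html",
            cb_kwargs={"current_page": page},
            meta={
                "playwright": True,
                "playwright_page_coroutines": [
                    # same wait as the initial request; assumed to apply here too
                    PageCoroutine("wait_for_selector", "div.row.job-results-row"),
                ],
            },
        )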
  • There are many issues with your code. First, the default `parse` method only takes a response object (see the `cb_kwargs` sketch after these comments). Second, the start_url is loaded by JavaScript, so you should use `playwright` to load it as well. Third, you have not enabled the `https` playwright download handler, yet you are crawling an `https` site. Fourth, you should not set logging to WARNING before you confirm that your code is working. Fifth, your XPath selectors are incorrect, and you should not use dynamic classes for selecting content. etc – msenior_ Jan 06 '22 at 03:51
  • @msenior_ I've deleted the previous comments to save space. I've updated the script with better XPaths, and I now get 21/23 results returned per page. However, I still cannot get to the next page; any ideas on how I can implement that? – joe_bill.dollar Jan 06 '22 at 16:31
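
On the first point in the comments above, for illustration: extra callback arguments in Scrapy are passed via `cb_kwargs` (a generic sketch with hypothetical names, unrelated to this site):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = "example"

        def start_requests(self):
            # extra arguments for the callback travel in cb_kwargs
            yield scrapy.Request(
                "https://example.com",
                callback=self.parse,
                cb_kwargs={"current_page": 1},
            )

        def parse(self, response, current_page):
            # current_page arrives from cb_kwargs; the default signature
            # would otherwise only accept the response object
            self.logger.info("page %d: %s", current_page, response.url)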

0 Answers