I can't seem to get my scraper to follow the links through to parse_jobs: I only get a single result, and it won't iterate over each href that I've grabbed. I get a lot of output that isn't meaningful, and although the request returns a 200, I don't actually get much info back. I'm thinking it may be my XPaths, or how I've set up the requests for my scraper?
UPDATE:
I have fixed the issue with the single result; I was missing the parentheses on a get() call. However, I can still only scrape a single page, and the scraper won't go on to the next pages to scrape any info.
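For reference, the fix was just adding the call parentheses (the selector here is only illustrative, not the exact one from my spider):

# .get without () returns the bound method instead of the extracted text
title = response.xpath("//h1//text()").get
# .get() actually extracts the first matching text node
title = response.xpath("//h1//text()").get()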
Here is my scraper:
import hashlib
from pathlib import Path
from typing import Generator, Optional

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
from scrapy_playwright.page import PageCoroutine

class JobSpider(scrapy.Spider):
    name = 'job_play'
    start_urls = ['https://jobsite.co.uk/jobs/Degree-Accounting-and-Finance']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Safari/605.1.15'
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                meta=dict(
                    playwright=True,
                    playwright_include_page=True,
                    playwright_page_coroutines=[
                        PageCoroutine('wait_for_selector', 'div.row.job-results-row')
                    ],
                ),
            )
    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        # Read the last page number from the pagination links.
        last_page = response.xpath('//div[@class="row job-results-row"]//a[5]//text()').get()
        last_page = int(last_page)
        # Queue up the remaining result pages.
        for page in range(2, last_page + 1):
            yield response.follow(
                f"https://jobsite.co.uk/jobs?page={page}&action=paging_next.html",
                cb_kwargs={"current_page": page},
            )
        current_page = current_page or 1
        # Follow every job link on this results page.
        for job in response.xpath("//article//div//div[position() mod 7 = 6]/a//@href"):
            yield response.follow(
                job,
                callback=self.parse_jobs,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": f"page-{current_page}",
                },
            )
    async def parse_jobs(self, response):
        url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
        # Screenshot the rendered job page, then release the Playwright page.
        page = response.meta["playwright_page"]
        await page.screenshot(
            path=Path(__file__).parent / "job_test" / f"{url_sha256}.png", full_page=True
        )
        await page.close()
        yield {
            "url": response.url,
            "title": response.xpath("//h1[@class='brand-font']//text()").get(),
            "price": response.xpath("//li[@class='salary icon']//div//text()").get(),
            "organisation": response.xpath("//a[@id='companyJobsLink']//text()").get(),
            "image": f"job_test/{url_sha256}.png",
        }

if __name__ == "__main__":
    process = CrawlerProcess(
        settings={
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "DOWNLOAD_HANDLERS": {
                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
                "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            },
            "CONCURRENT_REQUESTS": 32,
            "CLOSESPIDER_ITEMCOUNT": 100,
            "FEED_URI": 'jobs.jl',
            "FEED_FORMAT": 'jsonlines',
        }
    )
    process.crawl(JobSpider)
    process.start()