I'm a beginner learning how to webscrape using Scrapy in Python. Can someone point out what's wrong? My goal is to scrape all the subsequent pages.
from indeed.items import IndeedItem
import scrapy
class IndeedSpider(scrapy.Spider):
name = "ind"
allowed_domains = ["https://www.indeed.com"]
start_urls = ['https://www.indeed.com/jobs?q=analytics+intern&start=']
def parse(self, response):
job_card = response.css('.jobsearch-SerpJobCard')
for job in job_card:
item = IndeedItem()
job_title = job.css('.jobtitle::attr(title)').extract()
company_name = job.css('.company .turnstileLink::text').extract()
if not company_name:
company_name = job.css('span.company::text').extract()
item['job_title'] = job_title
item['company_name'] = company_name
yield item
next_page_extension = response.css('ul.pagination-list a::attr(href)').get()
if next_page_extension is not None:
next_page = response.urljoin(next_page_extension)
yield scrapy.Request(next_page, callback=self.parse)