So I have a Scrapy spider as follows:
import scrapy

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        for subject in response.css('subject'):
            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()
            if subject_link is not None:
                # this Request is what ends up verbatim in the exported JSON
                subject_data = scrapy.Request(subject_link, callback=self.parse_course)
                yield {
                    'subject_name': subject_name,
                    'subject_link': subject_link,
                    'subject_id': subject_id,
                    'subject_data': subject_data,
                }

    def parse_course(self, response):
        subject_id = response.css('::attr(id)').extract_first().strip()
        for course in response.css('course'):
            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first()
            course_id = course.css('course::attr(id)').extract_first().strip()
            if course_link is not None:
                course_data = scrapy.Request(course_link, callback=self.parse_class)
                yield {
                    'course_name': course_name,
                    'course_link': course_link,
                    'course_id': subject_id + " " + course_id,
                    'course_data': course_data,
                }

    def parse_class(self, response):
        course_id = response.css('::attr(id)').extract_first().strip()
        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first()
            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
            }
I'd like to get an output JSON file with a tree structure like so:
{"subject_id": "...", "subject_name": "...", "subject_link": "...", "subject_data":
{"course_id": "...", "course_link": "...", "course_name": "...", "course_data":
{"course_id": "...", "section_link": "...", "section_name": "..."}
}
}
However, I only get this:
{"subject_id": "...", "subject_data": "<Request GET http://example.com/something>", "subject_name": "...", "subject_link": "..."}
From my understanding, this is because the nested yield code hasn't been executed yet. How would I go about calling an equivalent of "scrapy crawl courses -o courses.json" that fully resolves all the requests? If that's not possible out of the box, how can I do this myself? Can I later import the JSON in a Python file and fetch http://example.com/something and the following ones somehow?
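For the first question, this is roughly what I imagine calling from a plain Python script instead of the command line (just a sketch; I'm assuming CrawlerProcess and the FEEDS setting are the right tools here, and courses.json is only an example output path):

# sketch: run the spider from a script, equivalent to
# "scrapy crawl courses -o courses.json" (assumes a recent Scrapy
# with the FEEDS setting; older versions used FEED_URI/FEED_FORMAT)
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"courses.json": {"format": "json"}},
})
process.crawl(CoursesSpider)
process.start()  # blocks until the crawl finishes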
I know this is a lot of code, but it should clarify the problem. Thanks for your help!
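P.S. To clarify the last question, this is the kind of post-processing I had in mind (an untested sketch; it assumes the URL can be pulled back out of the serialized Request string, and it uses the requests library only as an example):

# sketch: reload courses.json after the crawl and fetch the URLs
# that were left behind as "<Request GET ...>" strings
import json
import re

import requests

with open("courses.json") as f:
    subjects = json.load(f)

for subject in subjects:
    # pull the URL back out of the serialized Request repr
    match = re.search(r"GET (\S+)>", str(subject["subject_data"]))
    if match is None:
        continue
    response = requests.get(match.group(1))
    # ...parse response.text here and attach the parsed courses
    # back onto subject["subject_data"] somehow?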