
So I have a Scrapy spider as follows:

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        for subject in response.css('subject'):

            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()

            if subject_link is not None:
                subject_data = scrapy.Request(subject_link, callback=self.parse_course)

            yield {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
                'subject_data': subject_data,
            }

    def parse_course(self, response):

        subject_id = response.css('::attr(id)').extract_first().strip()

        for course in response.css('course'):

            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first().strip()
            course_id = course.css('course::attr(id)').extract_first().strip()

            if course_link is not None:
                course_data = scrapy.Request(course_link, callback=self.parse_class)

            yield {
                'course_name': course_name,
                'course_link': course_link,
                'course_id': subject_id + " " + course_id,
                'course_data': course_data,
            }

    def parse_class(self, response):

        course_id = response.css('::attr(id)').extract_first().strip()

        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first().strip()

            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
            }

I'd like to get an output JSON file that has a tree structure like so:

{"subject_id": "...", "subject_name": "...", "subject_link": "...", "subject_data": 
  {"course_id": "...", "course_link": "...", "course_name": "...", "course_data": 
    {"course_id": "...", "section_link": "...", "section_name": "..."}
  }
}

However, I only get this:

{"subject_id": "...", "subject_data": "<Request GET http://example.com/something>", "subject_name": "...", "subject_link": "..."}

From my understanding, this is because the yield code hasn't been executed yet. How would I go about calling an equivalent of "scrapy crawl courses -o courses.json" that fully follows all the requests? If that's not possible out of the box, how can I do this myself? Can I later import the JSON in a Python file and somehow run http://example.com/something and the requests that follow?

I know it's a lot of code, but it should clarify things. Thanks for your help!

Max Smith

1 Answer


I see two ways of doing this:

  1. either build the data incrementally, and pass the data to each callback using the Request.meta dict (see "Passing additional data to callback functions" in the Scrapy docs),

or

  2. use something like scrapy-inline-requests (to be tested).

Method 1.

import scrapy


class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        for subject in response.css('subject'):

            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()

            if subject_link is not None:
                subject_data = scrapy.Request(subject_link, callback=self.parse_course)

            # build a dict with the info we have so far
            subject_info = {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
            }
            # add this to the Request's meta dict
            subject_data.meta['subject_info'] = subject_info

            # ask Scrapy to fetch additional data
            yield subject_data

    def parse_course(self, response):

        # get back the data that was passed previously
        subject_info = response.request.meta['subject_info']

        subject_id = response.css('::attr(id)').extract_first().strip()

        for course in response.css('course'):

            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first().strip()
            course_id = course.css('course::attr(id)').extract_first().strip()

            if course_link is not None:
                course_data = scrapy.Request(course_link, callback=self.parse_class)

            # build a dict with the data in this page
            # + the data scraped previously
            course_info = {
                'course_name': course_name,
                'course_link': course_link,
                'course_id': subject_id + " " + course_id,
                'subject_info': subject_info,
            }

            # pass that data to the next callback
            course_data.meta['course_info'] = course_info

            # fetch the class page
            yield course_data

    def parse_class(self, response):

        # get course data from previous callbacks
        course_info = response.request.meta['course_info']

        course_id = response.css('::attr(id)').extract_first().strip()

        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first().strip()

            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
                'course_info': course_info
            }

So you will not get subjects containing courses, which in turn contain sections; instead you will get section items, each carrying info about the course it belongs to, which in turn carries info about the subject it relates to.
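
For illustration, with placeholder values, each item written out by Method 1 would then look roughly like this (the keys follow the code above):

{
  "section_name": "...",
  "section_link": "...",
  "course_id": "...",
  "course_info": {
    "course_name": "...",
    "course_link": "...",
    "course_id": "...",
    "subject_info": {
      "subject_name": "...",
      "subject_link": "...",
      "subject_id": "..."
    }
  }
}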

Method 2. (Warning: I have not tested this in practice but it may work)

import scrapy

from inline_requests import inline_requests

class CoursesSpider(scrapy.Spider):
    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    # this decorator is important
    @inline_requests
    def parse(self, response):

        for subject in response.css('subject'):

            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first().strip()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()

            # this list will collect information on courses for this subject
            subject_data = []

            if subject_link is not None:
                try:
                    # you ask scrapy to fetch the page
                    # but you do not set a callback
                    subject_response = yield scrapy.Request(subject_link)
                    # and you get a Response to work on when it's fetched,
                    # without going through a callback

                    subject_id = subject_response.css('::attr(id)').extract_first().strip()

                    for course in subject_response.css('course'):

                        course_name = course.css('course::text').extract_first().strip()
                        course_link = course.css('course::attr(href)').extract_first().strip()
                        course_id = course.css('course::attr(id)').extract_first().strip()

                        # this list will collect information on sections for this course
                        course_data = []
                        if course_link is not None:
                            try:
                                # same thing here, you ask Scrapy to fetch a Response
                                course_response = yield scrapy.Request(course_link)

                                course_id = course_response.css('::attr(id)').extract_first().strip()

                                for section in course_response.css('section'):
                                    section_name = section.css('section::text').extract_first().strip()
                                    section_link = section.css('section::attr(href)').extract_first().strip()

                                    # add each section item
                                    course_data.append(
                                        {
                                            'section_name': section_name,
                                            'section_link': section_link,
                                            'course_id': course_id,
                                        }
                                    )

                            except:
                                raise

                        # add each course item
                        subject_data.append(
                            {
                                'course_name': course_name,
                                'course_link': course_link,
                                'course_id': subject_id + " " + course_id,
                                'course_data': course_data,
                            }
                        )

                except:
                    raise


            yield {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
                'subject_data': subject_data,
            }
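
As for the other part of your question, running the equivalent of "scrapy crawl courses -o courses.json" from a plain Python script: here is a minimal sketch (untested here) using Scrapy's CrawlerProcess, assuming the classic FEED_FORMAT/FEED_URI feed-export settings:

from scrapy.crawler import CrawlerProcess

# CoursesSpider is the spider class defined above (same file, or imported)
process = CrawlerProcess(settings={
    'FEED_FORMAT': 'json',       # serialize items as JSON
    'FEED_URI': 'courses.json',  # same effect as -o courses.json
})
process.crawl(CoursesSpider)
process.start()  # this call blocks until the crawl is finished
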
paul trmbrth
  • Awesome! I've tested both; however, I noticed that the first approach produces the wrong format (i.e. {"course_id": "...", "course_info": {"subject_id": "...", "subject_name": "...", "subject_link": "..."}, "section_link": "...", "section_name": "..."}) and that there's only one course per subject, which shouldn't be the case. The second approach works well, but it is very slow. I was drawn to Scrapy because of its asynchronous features; it is fast because it runs requests concurrently. The second code doesn't seem to do this. How can this be improved? Thanks! – Max Smith Dec 06 '16 at 21:29
  • With inline-requests, you indeed process responses serially within each callback call. You're basically back to a sequential processing model (which you'd have with python-requests for example) and not using the idiomatic asynchronous/callback way to use scrapy. But you build your items in a more natural way. If you want the "speed" of asynchronous networking (having multiple HTTP calls at the same time), then you'll have to build sub-items, each with some ID key, and reconstruct the whole tree as a post-processing step (at least, that's what I would do) – paul trmbrth Dec 14 '16 at 11:47
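
As a rough sketch of that post-processing step (the file names sections.json and courses_tree.json are just examples, and the field names follow the Method 1 items above), a small standalone script could regroup the flat section items into the nested structure the question asks for:

import json

# Load the flat items produced by the Method 1 spider, e.g. via:
#   scrapy crawl courses -o sections.json
with open('sections.json') as f:
    sections = json.load(f)

# Regroup section items under their course, and courses under their subject,
# keyed by the ids carried in each item.
subjects = {}
for item in sections:
    course_info = item['course_info']
    subject_info = course_info['subject_info']

    subject = subjects.setdefault(subject_info['subject_id'], {
        'subject_id': subject_info['subject_id'],
        'subject_name': subject_info['subject_name'],
        'subject_link': subject_info['subject_link'],
        'subject_data': {},  # temporarily a dict keyed by course_id
    })

    course = subject['subject_data'].setdefault(course_info['course_id'], {
        'course_id': course_info['course_id'],
        'course_name': course_info['course_name'],
        'course_link': course_info['course_link'],
        'course_data': [],
    })

    course['course_data'].append({
        'section_name': item['section_name'],
        'section_link': item['section_link'],
        'course_id': item['course_id'],
    })

# Turn the intermediate dicts into lists for the final tree.
tree = []
for subject in subjects.values():
    subject['subject_data'] = list(subject['subject_data'].values())
    tree.append(subject)

with open('courses_tree.json', 'w') as f:
    json.dump(tree, f, indent=2)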