Learning about Scrapy and XPaths and running into a few problems. Most recently, that: scrapy crawl GCSpider -o items.csv -t csv returns a CSV file that contains (most) of the data I want, but with scrapy selector code muddled in with the data (see screenshot).
Another point of confusion is that the column headers are ordered differently from the order set for scraping & in items. Why is this?
The spider code is as follows:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scraper_app.items import GenomeCanadaGrants
class GCSpider(Spider):
"""Spider for Genome Canada Awards page."""
name = 'GCSpider'
allowed_domains = ["http://genomereports.ca"]
start_urls= ["http://genomereports.ca/section.php?Action=List2&Lang=En&addnew=&Report=consolidated_commitments.php&Report_Text=Funding+Commitments&Nav=Section&ID=3&Login=&Password=&Consolidated_Centre=ALL&Consolidated_Category=ALL&Consolidated_Sector=ALL&Consolidated_Competition=ALL&Consolidated_FY=ALL&Consolidated_Status=ALL"]
def parse(self, response):
sel = Selector(response)
grants = sel.css('div#content_frame table tr')
items = []
for response in grants:
item = GenomeCanadaGrants()
item['Province'] = response.xpath('.//th[1]//text()').extract() # Column Header: CENTRE
item['Sector'] = response.xpath('.//th[2]//text()').extract() # Column Header: SECTOR
item['Fund'] = response.xpath('.//th[3]//text()').extract() # Column Header: PROGRAM & Fiscal Yr Awarded
item['Date'] = response.xpath('.//th[3]//text()').re('\d+\d-\d+\d') # Column Header: PROGRAM & Fiscal Yr Awarded
item['Status'] = response.xpath('.//th[4]//text()').extract # Column Header: STATUS
item['Principal_Investigator'] = response.xpath('.//th[5]//text()').extract() # Column Header: PROJECT LEADER(S)
item['Project_Title'] = response.xpath(".//th[6]//text()").extract # Column Header: PROJECT TITLE
item['Additional_Information'] = response.xpath(".//th[6]//a[@href='url']").extract # Link to PDF with Project Details
item['Amount'] = response.xpath('.//th[7]//text()').extract # Column Header: APPROVED BUDGET
item['GC_Contribution'] = response.xpath('.//th[8]//text()').extract # Column Header: GC CONTRIBUTION
# Perhaps this line should be yield item?
items.append(item)
return items