I've written a Scrapy spider that I am trying to run from a python script located in another directory. The code I'm using from the docs seems to run the spider, but when I check the postgresql table, it hasn't been created. The spider only properly pipelines the scraped data if I use the scrapy crawl command. I've tried placing the script in the directory right above the scrapy project and also in the same directory as the config file and neither seem to be creating the table.
The code for the script is below followed by the code for the spider. I think the problem involves the directory in which the script should be place and/or the code that I use within the spider file to enable the spider to be ran from a script, but I'm not sure. Does it look like there is a problem with the function that is being called in the script or is there something that needs to be changed within the settings file? I can provide the code for the pipelines file if necessary, thanks.
Script file (only 3 lines)
from ticket_city_scraper import *
from ticket_city_scraper.spiders import tc_spider
tc_spider.spiderCrawl()
Spider file
import scrapy
import re
import json
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider , Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from ticket_city_scraper.items import ComparatorItem
from urlparse import urljoin
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging
bandname = raw_input("Enter bandname\n")
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
class MySpider3(CrawlSpider):
handle_httpstatus_list = [416]
name = 'comparator'
allowed_domains = ["www.ticketcity.com"]
start_urls = [tc_url]
tickets_list_xpath = './/div[@class = "vevent"]'
def create_link(self, bandname):
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
self.start_urls = [tc_url]
#return tc_url
tickets_list_xpath = './/div[@class = "vevent"]'
def parse_json(self, response):
loader = response.meta['loader']
jsonresponse = json.loads(response.body_as_unicode())
ticket_info = jsonresponse.get('B')
price_list = [i.get('P') for i in ticket_info]
if len(price_list) > 0:
str_Price = str(price_list[0])
ticketPrice = unicode(str_Price, "utf-8")
loader.add_value('ticketPrice', ticketPrice)
else:
ticketPrice = unicode("sold out", "utf-8")
loader.add_value('ticketPrice', ticketPrice)
return loader.load_item()
def parse_price(self, response):
print "parse price function entered \n"
loader = response.meta['loader']
event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
eventCity = ''.join(event_City)
loader.add_value('eventCity' , eventCity)
event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
eventState = ''.join(event_State)
loader.add_value('eventState' , eventState)
event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
eventDate = ''.join(event_Date)
loader.add_value('eventDate' , eventDate)
ticketsLink = loader.get_output_value("ticketsLink")
json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
json_id= "".join(json_id_list)
json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)
def parse(self, response):
"""
# """
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
loader.add_xpath('eventName' , './/span[@class="summary listingEventName"]/text()')
loader.add_xpath('eventLocation' , './/div[@class="divVenue location"]/text()')
loader.add_xpath('ticketsLink' , './/a[@class="divEventDetails url"]/@href')
#loader.add_xpath('eventDateTime' , '//div[@id="divEventDate"]/@title') #datetime type
#loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')
print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
#sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
ticketsURL = urljoin(response.url, ticketsURL)
yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
def spiderCrawl():
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(MySpider3)
process.start()