
I created a project with Scrapy that saves data to my MongoDB. It works.

Here is my code:

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import time

# scrapy api imports
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# I import a lot of spider files here.
from Tainan.Madou import Madou
# from ... import ...
# from ... import ...

# Spider array: add each spider to this array
CrawlersArray = [ Madou ]

class MoviesSpider(scrapy.Spider):
    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']


process = CrawlerProcess(get_project_settings())

for spider in CrawlersArray:
    process.crawl(spider)

process.start()

Here is my Madou spider. I have a lot of spiders just like Madou; if I don't add `if __name__ == '__main__':`, I can run all of them.

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request

from TainanItem import Tainan, MovieReleased
# Madou Theater (麻豆戲院)
class Madou(scrapy.Spider):
    name = 'Madou'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['https://movies.yahoo.com.tw/theater_result.html/id=68']

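    # module-level state shared between the parse and parse_page callbacks below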
    global tainan, movieReleased
    tainan = Tainan()
    movieReleased = MovieReleased()

    global detailDict
    detailDict = {}
    global locationDetail
    locationDetail = {}
    global lonAndLatArray
    global cnNameArray, enNameArray, releasedTimeArray, versionTypeArray, movieStyleArray, moviePhotoArray, movieContentArray, nextPageHrefArray
    global movieDateArray, movieTimeArray, movieStillsArray, movieActorCnArray, movieActorPhotoArray
    cnNameArray = []
    enNameArray = []
    versionTypeArray = []
    movieStyleArray = []
    releasedTimeArray = []
    moviePhotoArray = []
    movieContentArray = []
    nextPageHrefArray = []
    movieDateArray = []
    movieTimeArray = []
    movieStillsArray = []
    movieActorCnArray = []
    movieActorPhotoArray = []
    lonAndLatArray = []
    global dataLen, countLen
    dataLen = 0
    countLen = 0

    def parse(self, response):

        global tainan
        global movieReleased, detailDict, locationDetail
        global lonAndLatArray
        global cnNameArray, enNameArray, versionTypeArray, movieStyleArray, releasedTimeArray, moviePhotoArray, movieContentArray
        global movieDateArray, movieTimeArray, movieStillsArray, movieActorCnArray, movieActorPhotoArray
        global nextPageHrefArray
        global dataLen
        tainan['theater'] = 'Madou'
        tainan['theaterCn'] = '麻豆戲院'
        tainan['address'] = '台南縣麻豆鎮興中路106號3樓'
        tainan['phone'] = '06-5722159'
        lonAndLatArray = [float(120.251206), float(23.183880)]

        htmlNodes = response.xpath('//div[@class="release_info_text"]')
        for htmlNode in htmlNodes:
            cnName = htmlNode.xpath('.//div[@class="theaterlist_name"]/a/text()').extract_first()
            enName = htmlNode.xpath('.//div[@class="en"]/a/text()').extract_first()
            versionType = htmlNode.xpath('.//div[@class="tapR"]/text()').extract_first()
            releasedTime = htmlNode.xpath('.//ul[@class="theater_time"]/li/text()').extract()

            cnNameArray.append(cnName)
            enNameArray.append(enName)
            versionTypeArray.append(versionType)
            releasedTimeArray.append(releasedTime)

        i = 1000
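        # priorities count down from 1000 so detail pages are processed roughly in page order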
        dataLen = len(response.xpath('//div[@class="release_foto"]'))
        photoNodes = response.xpath('//div[@class="release_foto"]')
        for photoNode in photoNodes:
            contentHref = photoNode.xpath('.//a/@href').extract_first()
            yield Request(contentHref, callback=self.parse_page, priority = i, dont_filter=True)
            i -= 1
            photoHref = photoNode.xpath('.//a/img/@src').extract_first()
            moviePhotoArray.append(photoHref)

        detailDict.update({
             'cnName': cnNameArray,
             'enName': enNameArray,
             'movieContent': movieContentArray,
             'versionType': versionTypeArray,
             'movieStyle': movieStyleArray,
             'releasedTime': releasedTimeArray,
             'moviePhoto': moviePhotoArray,
             'movieDate': movieDateArray,
             'movieTime': movieTimeArray,
             'movieStills': movieStillsArray,
             'movieActorCn': movieActorCnArray,
             'movieActorPhoto': movieActorPhotoArray})

        locationDetail.update({
            'type': "Point",
            'coordinates': lonAndLatArray
        })

        movieReleased['film'] = dict(detailDict)
        tainan['geometry'] = dict(locationDetail)
        tainan['movie'] = dict(movieReleased)

    def parse_page(self, response):

        global movieContentArray, countLen, dataLen
        global movieDateArray, movieTimeArray, movieStillsArray, movieStyleArray, movieActorCnArray, movieActorPhotoArray
        movieContent = response.xpath('//div[@class="gray_infobox_inner"]/span/text()').extract_first()
        movieDate = response.xpath('//*[@class="movie_intro_info_r"]/span/text()')[0].extract()
        movieTime = response.xpath('//*[@class="movie_intro_info_r"]/span/text()')[1].extract()
        movieStills = response.xpath('//ul[@class="trailer_list imglist"]//div[@class="foto"]/img/@src').extract()
        movieStyle = response.xpath('//div[@class="level_name_box"]//div[@class="level_name"]/a/text()').extract()
        movieActorCn = response.xpath('//ul[@class="trailer_list alist starlist"]/li/a//div[@class="fotoinner"]/img/@title').extract()
        movieActorPhoto = response.xpath('//ul[@class="trailer_list alist starlist"]/li/a//div[@class="fotoinner"]/img/@src').extract()
        movieContentArray.append(movieContent)
        movieDateArray.append(movieDate)
        movieTimeArray.append(movieTime)
        movieStillsArray.append(movieStills)
        movieStyleArray.append(movieStyle)
        movieActorCnArray.append(movieActorCn)
        movieActorPhotoArray.append(movieActorPhoto)

        countLen += 1
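        # emit the aggregated item only after every detail page has been parsed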
        if countLen == dataLen:
            yield tainan

But when I deploy my project to Scrapinghub, I get this error:

Exceeded container timeout 60s

I found a solution on GitHub: https://github.com/scrapinghub/shub/issues/273

I am not sure how to apply the first solution, so I tried the second one, just like the questioner did.

I changed the code to this:

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())

    for spider in CrawlersArray:
        process.crawl(spider)

    process.start()

Now the project deploys to Scrapinghub successfully, but when I run it, none of the spiders run.

Why? I can't figure it out.

Any help would be appreciated. Thanks in advance.

Here is the terminal output when I run the project:

File "/Library/Python/2.7/site-packages/scrapy/spiders/__init__.py", line 90, in parse
    raise NotImplementedError
NotImplementedError
2018-03-18 10:40:25 [scrapy.core.engine] INFO: Closing spider (finished)
2018-03-18 10:40:25 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 828,
 'downloader/request_count': 3,
 'downloader/request_method_count/GET': 3,
 'downloader/response_bytes': 87445,
 'downloader/response_count': 3,
 'downloader/response_status_count/200': 1,
 'downloader/response_status_count/301': 2,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2018, 3, 18, 2, 40, 25, 666163),
 'log_count/DEBUG': 4,
 'log_count/ERROR': 1,
 'log_count/INFO': 7,
 'memusage/max': 53428224,
 'memusage/startup': 53424128,
 'response_received_count': 1,
 'scheduler/dequeued': 3,
 'scheduler/dequeued/memory': 3,
 'scheduler/enqueued': 3,
 'scheduler/enqueued/memory': 3,
 'spider_exceptions/NotImplementedError': 1,
 'start_time': datetime.datetime(2018, 3, 18, 2, 40, 18, 487308)}
2018-03-18 10:40:25 [scrapy.core.engine] INFO: Spider closed (finished) 

My attempted fix:

class MoviesSpider(scrapy.Spider):
    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

    def parse(self, response):
        print("inside parse")

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())

    for spider in CrawlersArray:
        process.crawl(spider)

    process.start()

log:

2018-03-18 17:31:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://movies.yahoo.com.tw/movie_thisweek.html/> (referer: None)
inside parse
2018-03-18 17:31:34 [scrapy.core.engine] INFO: Closing spider (finished)
2018-03-18 17:31:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:

1 Answer


From the logs it's clear that you're missing a parse callback:

class MoviesSpider(scrapy.Spider):
    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

    def parse(self, response):
        print("inside parse")

In the parse callback you parse the response (web page) and return dicts with extracted data, Item objects, Request objects, or an iterable of these objects. Those Requests also carry a callback (possibly the same one); Scrapy downloads them and passes each response to the callback you specified.
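As a minimal sketch of that contract (the XPath selectors here are illustrative placeholders, not taken from your site):

import scrapy
from scrapy import Request

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://movies.yahoo.com.tw/movie_thisweek.html']

    def parse(self, response):
        # Yield extracted data directly as dicts...
        for title in response.xpath('//h1/text()').extract():
            yield {'title': title.strip()}
        # ...and/or yield follow-up Requests, each handled by the
        # callback you name (here a second method on the same spider).
        for href in response.xpath('//a/@href').extract():
            yield Request(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        yield {'url': response.url}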

UPDATE: complete code

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import time

# scrapy api imports
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# I import a lots of spider file here.
# from myfile import project

# Spider Array: add spider into array
CrawlersArray = [ ... ] 

class MoviesSpider(scrapy.Spider):
    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

    def parse(self, response):
        print("inside parse")

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())

    for spider in CrawlersArray:
        process.crawl(spider)

    process.start()
– Ami Hollander
  • But that's just my spider entry point; what really runs the spiders is process.start(). I don't know why, after following the `if __name__ == '__main__':` solution, my process no longer runs. – Morton Mar 18 '18 at 06:52
  • You must have a main to run the spider; I will update my answer. – Ami Hollander Mar 18 '18 at 08:39
  • Thank you again. I modified the code; I just see `inside parse`, but the process still won't work. (I updated the log in my question.) – Morton Mar 18 '18 at 09:42
  • Which process? The Scrapy spider ran and ended successfully. `inside parse` is called from inside the spider process. – Ami Hollander Mar 18 '18 at 10:59
  • I mean my `process.start()` still doesn't work. `CrawlersArray` includes all of my spiders, and they all have a `parse(self, response):` function. – Morton Mar 18 '18 at 13:09
  • What is the error? If you got that output, the `parse` function ran and finished. Can you show the other spiders' code and the new errors? – Ami Hollander Mar 18 '18 at 13:29
  • Thanks for paying attention to my question. I have added one of my spiders, `Madou`, to the question; please take a look. – Morton Mar 18 '18 at 14:06
  • I am not familiar with Scrapinghub... maybe you need to tell their workers where your main is? Another option is to use multiprocessing to run your spiders instead of relying on the Scrapy process runner; a minimal sketch follows this thread. – Ami Hollander Mar 19 '18 at 08:30
  • It's kind of difficult for me to use multiprocessing, and I am not sure it's a technical problem, but I will try asking the Scrapinghub team. Thank you. – Morton Mar 19 '18 at 14:15
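A minimal sketch of the multiprocessing alternative mentioned in the last comments, assuming each spider may run in its own OS process so that every CrawlerProcess gets a fresh Twisted reactor (the spider list is a placeholder; fill in the rest of CrawlersArray):

# -*- coding: utf-8 -*-
import multiprocessing

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Tainan.Madou import Madou  # plus the other spider imports

def run_spider(spider_cls):
    # Each child process owns its CrawlerProcess and reactor,
    # so process.start() can block here without affecting the others.
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_cls)
    process.start()

if __name__ == '__main__':
    for spider_cls in [Madou]:  # add the rest of CrawlersArray here
        p = multiprocessing.Process(target=run_spider, args=(spider_cls,))
        p.start()
        p.join()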