
Main spider file:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from bloggerx.items import BloggerxItem
from scrapy.spider import BaseSpider

class BloggerxSpider(BaseSpider):
    name = 'bloggerx'
    allowed_domains = ['abcr.com']
    start_urls = ['http://www.abcr.com/profile/07372831905432746031']
    def parse(self,response):
        hxs = HtmlXPathSelector(response)
        item = BloggerxItem()
        item['gender'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Gender")]/following-sibling::node()/text()').extract()
        item['blogger_since'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[2]/text()').re('\d+')
        item['profile_views'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[3]/text()').re('\d+')
        item['industry'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Industry")]/following-sibling::node()/span/a/text()').extract()
        item['occupation'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Occupation")]/following-sibling::node()/span/a/text()').extract()
        item['locality'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="locality"]/a/text()').extract()
        item['region'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="region"]/a/text()').extract()
        item['country'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="country-name"]/a/text()').extract()
        item['introduction'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Introduction")]/following-sibling::node()/text()').extract()
        item['interests'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Interests")]/following-sibling::node()/span/a/text()').extract()
        item['email1'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li/script/text()').re('[\w.]+@[\w.]+[com]')
        item['email2'] = hxs.select('/html/body/div[2]/div/div[2]/div/ul/li[3]/div/text()').extract()
        item['website'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li[2]/a/@href').extract()
        item['films'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Films")]/following-sibling::node()/span/a/text()').extract()
        item['music'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Music")]/following-sibling::node()/span/a/text()').extract()
        item['books'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Books")]/following-sibling::node()/span/a/text()').extract()
        item['blogs_follow'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/text()').extract()
        item['blogs_follow_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/@href').extract()
        item['author_blogs'] =  hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/text()').extract()
        item['author_blogs_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/@href').extract()
        return item

Item file:

from scrapy.item import Item, Field

class BloggerxItem(Item):
    # define the fields for your item here like:
    # name = Field()
    gender = Field()
    blogger_since = Field()
    profile_views = Field()
    industry = Field()
    occupation = Field()
    locality = Field()
    introduction = Field()
    interests = Field()
    email1 = Field()
    website = Field()
    films = Field()
    music = Field()
    books = Field()
    region = Field()
    country = Field()
    email2 = Field()
    blogs_follow = Field()
    blogs_follow_link = Field()
    author_blogs = Field()
    author_blogs_link = Field()
    pass

Output when I run: scrapy crawl bloggerx -o items.json -t json

2013-03-07 16:39:24+0530 [scrapy] INFO: Scrapy 0.16.4 started (bot: bloggerx)
2013-03-07 16:39:24+0530 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddleware, CookiesMiddleware, HttpCompressionMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Enabled item pipelines: 
2013-03-07 16:39:25+0530 [bloggerx] INFO: Spider opened
2013-03-07 16:39:25+0530 [bloggerx] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6028
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Web service listening on 0.0.0.0:6085
2013-03-07 16:39:27+0530 [bloggerx] DEBUG: Crawled (200) <GET http://www.abcr.com/profile/07372831905432746031> (referer: None)
2013-03-07 16:39:27+0530 [bloggerx] INFO: Closing spider (finished)
2013-03-07 16:39:27+0530 [bloggerx] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 249,
     'downloader/request_count': 1,
     'downloader/request_method_count/GET': 1,
     'downloader/response_bytes': 13459,
     'downloader/response_count': 1,
     'downloader/response_status_count/200': 1,
     'finish_reason': 'finished',
     'finish_time': datetime.datetime(2013, 3, 7, 11, 9, 27, 320389),
     'log_count/DEBUG': 7,
     'log_count/INFO': 4,
     'response_received_count': 1,
     'scheduler/dequeued': 1,
     'scheduler/dequeued/memory': 1,
     'scheduler/enqueued': 1,
     'scheduler/enqueued/memory': 1,
     'start_time': datetime.datetime(2013, 3, 7, 11, 9, 25, 967450)}
2013-03-07 16:39:27+0530 [bloggerx] INFO: Spider closed (finished)

The generated output file is empty, yet the individual hxs.select statements work fine when tried in the Scrapy shell. Is there something silly I am doing?

Harshit
  • I just upgraded to scrapy 0.16.4 and the code above still works. What does your settings.py file look like? – Talvalin Mar 09 '13 at 13:04

4 Answers


It seems to be late, but I have recently learnt Scrapy, and as far as my research goes...

You are importing CrawlSpider in the header but declaring the class with BaseSpider, which is the error:

from scrapy.contrib.spiders import CrawlSpider, Rule
class BloggerxSpider(BaseSpider):

After correction:

from scrapy.contrib.spiders import CrawlSpider, Rule

class BloggerxSpider(CrawlSpider):

OR

from scrapy.spider import BaseSpider

class BloggerxSpider(BaseSpider):
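
For completeness: if you go the CrawlSpider route, you also need a rules attribute, otherwise there is nothing for the crawler to follow. A minimal sketch against the Scrapy 0.16-era imports used in the question; the allow pattern and callback name are illustrative, not taken from the original code:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from bloggerx.items import BloggerxItem

class BloggerxCrawlSpider(CrawlSpider):
    name = 'bloggerx_crawl'
    allowed_domains = ['blogger.com']
    start_urls = ['http://www.blogger.com/profile/07372831905432746031']

    # CrawlSpider only follows links matched by its rules; note the callback
    # is NOT named 'parse', which CrawlSpider reserves for its own routing.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/profile/\d+',)),
             callback='parse_profile', follow=True),
    )

    def parse_profile(self, response):
        item = BloggerxItem()
        # ... populate item fields with hxs.select(...) as in the question ...
        return item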
Hanu

Instead of def parse_blogger you need to use def parse.

def parse is the default callback the framework uses for parsing; if you want to name it differently, you need to route your responses to that new method explicitly.

To use your own parse method, you need to pass it as a callback; this is an example of doing so when you create your own Request:

from scrapy.http import Request

request = Request("http://something", callback=self.parse_blogger)
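
For context, here is a minimal sketch of how such a callback fits into a whole spider, assuming the Scrapy 0.16-era API used in the question; the spider name and URLs are placeholders, not taken from the original code:

from scrapy.http import Request
from scrapy.spider import BaseSpider

class CallbackExampleSpider(BaseSpider):
    name = 'callback_example'  # placeholder name
    start_urls = ['http://www.blogger.com/']  # placeholder listing page

    def parse(self, response):
        # parse() is the default callback for start_urls; any other method
        # only runs if it is wired up explicitly via Request's callback.
        yield Request('http://www.blogger.com/profile/07372831905432746031',
                      callback=self.parse_blogger)

    def parse_blogger(self, response):
        # Runs only because it was passed as the callback above.
        self.log("parse_blogger got %s" % response.url)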
iblazevic
  • I changed it to def parse, again the same issue, and when I test it using a print statement in the parse function, nothing is printed; it seems like the parse function is never called, don't understand why!!! – Harshit Mar 07 '13 at 11:34

If you're not explicitly defining rules and don't care about following links, then use a BaseSpider instead, but keep your callback named parse.

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from bloggerx.items import BloggerxItem

class BloggerxSpider(BaseSpider):
    ...

Note that for CrawlSpiders, the documentation explicitly states that you should not name your callback parse, as that will override CrawlSpider's parse method and the spider will not crawl correctly.
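
To make that note concrete, a contrived sketch of the difference (the class and callback names are made up):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class BrokenSpider(CrawlSpider):
    name = 'broken'
    rules = (Rule(SgmlLinkExtractor(), callback='parse_item'),)

    def parse(self, response):
        # Overriding parse() on a CrawlSpider shadows the method it uses
        # internally to dispatch its rules, so links stop being followed.
        pass

class WorkingSpider(CrawlSpider):
    name = 'working'
    rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),)

    def parse_item(self, response):
        # Extraction lives in a differently named callback instead.
        pass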

Talvalin
  • Made the changes as specified, not using any rules either; still the same problem, it's not going inside the parse function – Harshit Mar 07 '13 at 11:58
  • I'm trying to test this and I get a 404 message. Is it possible to provide a link that can be accessed within a login? – Talvalin Mar 07 '13 at 18:31
  • The above link is wrong because I can't disclose the link here, as it might be illegal to crawl those links; replace abcr with blogger – Harshit Mar 08 '13 at 04:42
  • Okay, so I tested the link and your code (using a BaseSpider) works for me. Could you edit the question above to show the exact code that you're running, so that I can test it please? :) – Talvalin Mar 08 '13 at 08:28

Your log output seems weird to me, as there is no entry for your start_urls, to which the server responds with a 404 that Scrapy ignores by default, so no Items will be returned. Also, your spider does not declare BaseSpider, which means this code will not even compile, so it seems there are some copy/paste issues going on here.
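
As an aside, if you want the 404 responses to reach your callback instead of being dropped silently by HttpErrorMiddleware, you can whitelist the status code on the spider via handle_httpstatus_list; a minimal debugging sketch (the spider itself is just illustrative):

from scrapy.spider import BaseSpider

class DebugSpider(BaseSpider):
    name = 'debug'
    start_urls = ['http://www.abcr.com/profile/07372831905432746031']

    # HttpErrorMiddleware normally filters out non-2xx responses before they
    # reach the callback; listing 404 here lets parse() see them.
    handle_httpstatus_list = [404]

    def parse(self, response):
        self.log("Got %s for %s" % (response.status, response.url))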

EDIT:

I changed the domain to blogger.com and now it returns one Item:

2013-03-08 09:02:28-0600 [scrapy] INFO: Scrapy 0.17.0 started (bot: oneoff)
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'oneoff.spiders', 'SPIDER_MODULES': ['oneoff.spiders'], 'USER_AGENT': 'Chromium OneOff 24.0.1312.56 Ubuntu 12.04 (24.0.1312.56-0ubuntu0.12.04.1)', 'BOT_NAME': 'oneoff'}
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled item pipelines:
2013-03-08 09:02:28-0600 [bloggerx] INFO: Spider opened
2013-03-08 09:02:28-0600 [bloggerx] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6024
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Web service listening on 0.0.0.0:6081
2013-03-08 09:02:28-0600 [bloggerx] DEBUG: Crawled (200) <GET http://www.blogger.com/profile/07372831905432746031> (referer: None)
2013-03-08 09:02:28-0600 [bloggerx] DEBUG: Scraped from <200 http://www.blogger.com/profile/07372831905432746031>
    {'author_blogs': [u'Inserire comunicati stampa per il turismo',
                      u'Inserire Comunicati stampa e Article Marketing',
                      u'Video Quacos'],
     'author_blogs_link': [u'http://comunicati-stampa-per-il-turismo.blogspot.com/',
                           u'http://comunicati-stampa-vendita-online.blogspot.com/',
                           u'http://quacos.blogspot.com/'],
     'blogger_since': [u'2008'],
     'blogs_follow': [u'Abandonware Time',
                      u'AltroSeo.com',
                      u'ANSIMA notizie',
                      u'Cinnamon Girl',
                      u'enigmamigarun',
                      u'Fake Books - Libri di una riga.',
                      u'FM - COSMETICA E NON SOLO ',
                      u'GS BARBARIANS',
                      u'Il Disinformatico',
                      u'Linus&#39; blog',
                      u'Montefeltro Nuoto Master',
                      u'Nella Tana del Coniglio',
                      u'PHP and tips'],
     'blogs_follow_link': [u'http://squakenet.blogspot.com/',
                           u'http://www.altroseo.com/',
                           u'http://ansima.blogspot.com/',
                           u'http://cinnamongirl82.blogspot.com/',
                           u'http://enigmaamigarun.blogspot.com/',
                           u'http://fake-books.blogspot.com/',
                           u'http://valeriacosmeticafm.blogspot.com/',
                           u'http://gsbarbarians.blogspot.com/',
                           u'http://attivissimo.blogspot.com/',
                           u'http://torvalds-family.blogspot.com/',
                           u'http://montefeltronuotomaster.blogspot.com/',
                           u'http://anonimoconiglio.blogspot.com/',
                           u'http://phpntips.blogspot.com/'],
     'books': [],
     'country': [],
     'email1': [u'bloggiovanni.cappellini@gmail.com'],
     'email2': [u'cappogio@hotmail.com'],
     'films': [],
     'gender': [],
     'industry': [],
     'interests': [],
     'introduction': [],
     'locality': [],
     'music': [],
     'occupation': [],
     'profile_views': [u'553'],
     'region': [],
     'website': [u'http://www.quacos.com']}
2013-03-08 09:02:28-0600 [bloggerx] INFO: Closing spider (finished)
2013-03-08 09:02:28-0600 [bloggerx] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 288,
     'downloader/request_count': 1,
     'downloader/request_method_count/GET': 1,
     'downloader/response_bytes': 13615,
     'downloader/response_count': 1,
     'downloader/response_status_count/200': 1,
     'finish_reason': 'finished',
     'finish_time': datetime.datetime(2013, 3, 8, 15, 2, 28, 948533),
     'item_scraped_count': 1,
     'log_count/DEBUG': 9,
     'log_count/INFO': 4,
     'response_received_count': 1,
     'scheduler/dequeued': 1,
     'scheduler/dequeued/memory': 1,
     'scheduler/enqueued': 1,
     'scheduler/enqueued/memory': 1,
     'start_time': datetime.datetime(2013, 3, 8, 15, 2, 28, 379242)}
2013-03-08 09:02:28-0600 [bloggerx] INFO: Spider closed (finished)

Spider:

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from bloggerx.items import BloggerxItem

class BloggerxSpider(BaseSpider):
    name = 'bloggerx'
    allowed_domains = ['blogger.com']
    start_urls = ['http://www.blogger.com/profile/07372831905432746031']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = BloggerxItem()
        item['gender'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Gender")]/following-sibling::node()/text()').extract()
        item['blogger_since'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[2]/text()').re('\d+')
        item['profile_views'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[3]/text()').re('\d+')
        item['industry'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Industry")]/following-sibling::node()/span/a/text()').extract()
        item['occupation'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Occupation")]/following-sibling::node()/span/a/text()').extract()
        item['locality'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="locality"]/a/text()').extract()
        item['region'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="region"]/a/text()').extract()
        item['country'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="country-name"]/a/text()').extract()
        item['introduction'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Introduction")]/following-sibling::node()/text()').extract()
        item['interests'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Interests")]/following-sibling::node()/span/a/text()').extract()
        item['email1'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li/script/text()').re('[\w.]+@[\w.]+[com]')
        item['email2'] = hxs.select('/html/body/div[2]/div/div[2]/div/ul/li[3]/div/text()').extract()
        item['website'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li[2]/a/@href').extract()
        item['films'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Films")]/following-sibling::node()/span/a/text()').extract()
        item['music'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Music")]/following-sibling::node()/span/a/text()').extract()
        item['books'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Books")]/following-sibling::node()/span/a/text()').extract()
        item['blogs_follow'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/text()').extract()
        item['blogs_follow_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/@href').extract()
        item['author_blogs'] = hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/text()').extract()
        item['author_blogs_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/@href').extract()

        return item
Steven Almeroth
  • I didn't make the changes in the above code to include BaseSpider; it does compile after including BaseSpider, so the output is after making those changes. Also, the 404 is because I haven't provided the actual link; replace abcr with blogger.com and run it. Thanks for your time – Harshit Mar 08 '13 at 04:43