0

I know why i have this problem, but i don't know how to resolve it. Same problem is described here.

My spider:

    import scrapy

    from esc.items import StackItem


    class StackSpider(scrapy.Spider):
        name = "stack"
        allowed_domains = ["www.exemple.com"]
        start_urls = [
            "http://www.exemple.com/somethin/?pag=1&pagesize=1000",
        ]

        def parse(self, response):
            for href in response.xpath('//td[@class="item_txt"]/a[@class="item_title"]/@href'):
                url = response.urljoin(href.extract())
                yield scrapy.Request(url, callback=self.mainpage)

        def mainpage(self, response):
            for sel in response.xpath('//td[@class="item_txt"]'):
                item = StackItem()
                item['title'] = sel.xpath('a[@class="item_title"]/text()').extract()
                item['url'] = sel.xpath('a[@class="item_title"]/@href').extract()
                yield item

            for poz in response.xpath('//div[@class="ImageLimiter"]'):
                item = StackItem()
                item['poza'] = poz.xpath('img/@src').extract()
                yield item

            for poz2 in response.xpath('//div[@id="col_details"]/div[2]'):
                item = StackItem()
                item['poza2'] = poz2.xpath('img[@id="previewDiv"]/@src').extract()
                yield item

            for tel in response.xpath('//div[@id="contact_by_phone"]/div'):
                item = StackItem()
                item['tel'] = tel.xpath('img[@id="imgPhone"]/@src').extract()
                yield item

            for desc in response.xpath('//div[@id="col_details"]'):
                item = StackItem()
                item['desc'] = desc.xpath('span[@id="ad_description"]/text()').extract()
                yield item

            for loc in response.xpath('//div[@id="col_details"]'):
                item = StackItem()
                item['judet'] = loc.xpath('div[6]/strong/a/text()').extract()
                item['oras'] = loc.xpath('div[7]/strong/a/text()').extract()
                yield item

My pipeline:

import sys
import MySQLdb
import hashlib
import time
import datetime
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import DropItem
from scrapy.http import Request
from esc.items import StackItem


# atime = time.strptime('my date', "%b %d %Y %H:%M")
timestamp = datetime.datetime.now().strftime("%s")
class MySQLStorePipeline(object):

    def __init__(self):
        try:
            self.conn= MySQLdb.connect(user='foo', passwd='bar', host='exemple.com', db='foo', use_unicode=True, charset='utf8')
            self.cursor = self.conn.cursor()
            self.cursor.execute("CREATE TABLE IF NOT EXISTS crawler( idscrapedData INT NOT NULL AUTO_INCREMENT PRIMARY KEY, title VARCHAR(200), url VARCHAR(200), judet VARCHAR(200), oras VARCHAR(200), descriere VARCHAR(100), poza VARCHAR(100), descriere2 VARCHAR(5000), telefon VARCHAR(100), telefon2 VARCHAR(200)) ")
            self.conn.commit()
        except (AttributeError, MySQLdb.OperationalError), e:
            raise e

    def process_item(self, item, spider):
        try:
            self.cursor.execute( "INSERT INTO crawler ( url, title, created_at, telefon, oras, judet, poza, poza2, descriere ) VALUES (%s0, %s, %s, %s, %s, %s, %s, %s, %s)",
            (item['url'],
            item['title'],
            timestamp,
            item['tel'],
            item['oras'],
            item['judet'],
            item['poza'],
            item['poza2'],
            item['desc'] ))
            # self.cursor.execute( "INSERT INTO crawler ( url, title, telefon, oras, judet, poza, poza2, descriere ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)", (item['url'], item['title'], item['tel'], item['oras'], item['judet'], item['poza'], item['poza2'], item['desc'] ))
            self.conn.commit()

        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])

            return item

the result in items.json:

[{"url": ["example.com"], "title": ["some title"]},
{"url": ["example.com"], "title": ["some title"]},
{"url": ["http:example.com"], "title": ["some title"]},
{"url": ["exeample.com"], "title": ["some title"]},
{"foo": ["example"]},
{"foo": ["example"]},
{"foo": ["example"]},
....

So the problem is because of {} i know that is because of the spider when i call for, but i don't know what i need to do in order to get the info. I hope i was explicit enough. I don't know what to do.

EmbargoLacuna
  • 15
  • 1
  • 1
  • 3
  • PS basically i need to get the info scraped info in the database, but the info i get is not formatted the way it need. Is there a way to change the spider or maybe to change the pipeline so that it might reformat the date in way that mysql wold accept – EmbargoLacuna Dec 16 '15 at 16:28
  • Take a look at http://stackoverflow.com/questions/23894139/scrapy-item-loader-return-list-not-single-value – Ricardo Silva Dec 16 '15 at 19:14

2 Answers2

1

your items are lists, just change them to string:

different options: .extract_first(), .extract()[0], ''.join(...), string(mylist)

eLRuLL
  • 18,488
  • 9
  • 73
  • 99
0

as an example:

item['title'] = sel.xpath('a[@class="item_title"]/text()').extract()[0]
item['url'] = sel.xpath('a[@class="item_title"]/@href').extract()[0]
kmicks
  • 3
  • 3