
I'm trying to create a Scrapy pipeline that exports the scraped data to a MySQL database. I've written my script (pipeline.py):

from datetime import datetime
from hashlib import md5
from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi

class mySQLStorePipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['HIDDEN'],
            db=settings['parsedjobs'],
            user=settings['scrapinghub'],
            passwd=settings['HIDDEN'],
            charset='utf8',
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d

    def _do_upsert(self, conn, item, spider):
        """Perform an insert or update."""
        sn = spider.name
        guid = self._get_guid(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')

        conn.execute("""SELECT EXISTS(
            SELECT 1 FROM masterjobs WHERE guid = %s
        )""", (guid, ))
        ret = conn.fetchone()[0]

        if ret:
            conn.execute("""
                UPDATE masterjobs
                SET name=%s, website=%s, description=%s, url=%s, updated=%s
                WHERE guid=%s
            """, (item['name'], sn, item['description'], item['url'], now, guid))
            spider.log("Item updated in db: %s %r" % (guid, item))
        else:
            conn.execute("""
                INSERT INTO masterjobs (guid, website, name, description, url, updated)
                VALUES (%s, %s, %s, %s, %s, %s)
            """, (guid, sn, item['name'], item['description'], item['url'], now))
            spider.log("Item stored in db: %s %r" % (guid, item))

    def _handle_error(self, failure, item, spider):
        """Handle errors raised during the db interaction."""
        log.err(failure)

    def _get_guid(self, item):
        """Generate a unique identifier for a given item."""
        return md5(item['url']).hexdigest()
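
For context, the pipeline is wired up through Scrapy's ITEM_PIPELINES setting. A minimal sketch, assuming the file above lives in a package I'll call myproject (the package name is a placeholder, not necessarily what my project uses):

# settings.py -- sketch only; 'myproject' is a placeholder package name
ITEM_PIPELINES = {
    'myproject.pipeline.mySQLStorePipeline': 300,
}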

I'd like to turn all this into an egg so that it can be uploaded to Scrapinghub. How would I go about this? I've written a setup.py file and tried packaging it, but I always get an error saying it can't find the package.
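
For reference, this is the setup.py I'm using (also quoted in the comments below), together with the layout I assume setuptools needs for find_packages() to pick anything up; the directory and package names here are placeholders, not necessarily what's on disk:

# setup.py (same as quoted in the comments below)
from setuptools import setup, find_packages

setup(
    name='SQLPipeline',
    version='1.0',
    packages=find_packages(),
)

# Assumed layout (placeholder names): find_packages() only discovers
# directories containing an __init__.py, and `python setup.py bdist_egg`
# has to be run from the directory that contains setup.py:
#
# SQLPipeline/
#     setup.py
#     sqlpipeline/
#         __init__.py
#         pipeline.py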

NickT
  • You'll need to post the `setup.py` file contents if that's where the error is. Also, please post the full error. – ChrisP Apr 19 '16 at 16:31
  • The setup.py file is as follows: `from setuptools import setup, find_packages setup(name='SQLPipeline', version='1.0', packages=find_packages(), )` – NickT Apr 19 '16 at 16:33
  • @ChrisP, the error is this: `running bdist_egg running egg_info writing SQLPipeline.egg-info/PKG-INFO writing top-level names to SQLPipeline.egg-info/top_level.txt writing dependency_links to SQLPipeline.egg-info/dependency_links.txt warning: manifest_maker: standard file 'setup.py' not found reading manifest file 'SQLPipeline.egg-info/SOURCES.txt' writing manifest file 'SQLPipeline.egg-info/SOURCES.txt' installing library code to build/bdist.macosx-10.6-intel/egg running install_lib warning: install_lib: 'build/lib' does not exist -- no Python modules to install` – NickT Apr 19 '16 at 16:37
  • `copying SQLPipeline.egg-info/PKG-INFO -> build/bdist.macosx-10.6-intel/egg/EGG-INFO copying SQLPipeline.egg-info/SOURCES.txt -> build/bdist.macosx-10.6-intel/egg/EGG-INFO copying SQLPipeline.egg-info/dependency_links.txt -> build/bdist.macosx-10.6-intel/egg/EGG-INFO copying SQLPipeline.egg-info/top_level.txt -> build/bdist.macosx-10.6-intel/egg/EGG-INFO zip_safe flag not set; analyzing archive contents... creating 'dist/SQLPipeline-1.0-py2.7.egg' and adding 'build/bdist.macosx-10.6-intel/egg' to it removing 'build/bdist.macosx-10.6-intel/egg' (and everything under it)` – NickT Apr 19 '16 at 16:38

0 Answers