I have a spider that reads its start_urls from a MySQL database and scrapes an unknown number of links from each page. I want to use pipelines.py to update the database with the scraped links, but I don't know how to get the start_url back into the pipeline for the SQL UPDATE statement.
Here is the spider code, which works:
import scrapy
import MySQLdb
import MySQLdb.cursors
from scrapy.http.request import Request

from youtubephase2.items import Youtubephase2Item


class youtubephase2(scrapy.Spider):
    name = 'youtubephase2'

    def start_requests(self):
        conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
        cursor = conn.cursor()
        cursor.execute('SELECT resultURL FROM SearchResults;')
        rows = cursor.fetchall()

        for row in rows:
            if row:
                yield Request(row[0], self.parse)
        cursor.close()

    def parse(self, response):
        for sel in response.xpath('//a[contains(@class, "yt-uix-servicelink")]'):
            item = Youtubephase2Item()
            item['pageurl'] = sel.xpath('@href').extract()
            yield item
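One idea I had is to carry the originating URL along with each item so the pipeline can see it. Assuming I add a start_url field to Youtubephase2Item, the parse method might look roughly like this (response.url should be the start URL as long as there are no redirects), but I don't know if this is the right approach:

    def parse(self, response):
        for sel in response.xpath('//a[contains(@class, "yt-uix-servicelink")]'):
            item = Youtubephase2Item()
            # assumes Youtubephase2Item declares an extra start_url field
            item['start_url'] = response.url
            item['pageurl'] = sel.xpath('@href').extract()
            yield item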
And here is the pipelines.py, where I want to update the database with the scraped links, using the start_url as the WHERE criterion for the SQL UPDATE statement. The start_url in the SQL statement below is a placeholder for what I would like to accomplish.
import MySQLdb
import MySQLdb.cursors
import hashlib
import re

from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi

from youtubephase2.items import Youtubephase2Item


class MySQLStorePipeline(object):

    def __init__(self):
        self.conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""UPDATE SearchResults SET PageURL = %s WHERE ResultURL = start_url""",
                (item['pageurl'],))
            self.conn.commit()
        except MySQLdb.Error, e:
            log.msg("Error %d: %s" % (e.args[0], e.args[1]))
        return item
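If carrying the start URL on the item like that is reasonable, I imagine process_item could then use it directly in the WHERE clause, something along these lines (again assuming the extra start_url field exists):

    def process_item(self, item, spider):
        try:
            # item['start_url'] would hold the URL the request started from
            self.cursor.execute(
                """UPDATE SearchResults SET PageURL = %s WHERE ResultURL = %s""",
                (item['pageurl'], item['start_url']))
            self.conn.commit()
        except MySQLdb.Error, e:
            log.msg("Error %d: %s" % (e.args[0], e.args[1]))
        return item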
Hopefully my question is clear enough. I have successfully used pipelines.py in the past to insert items into a database.