I have a spider that grabs URL's from a MySQL DB and uses those URL's as the start_urls to scrape, which in turn grabs any number of new links from the scraped pages. When I set the pipeline to INSERT both the start_url and new scraped url to a new DB or when I set the pipeline to UPDATE the existing DB with the newly scraped URL's using the start_url as the WHERE criteria, I get a SQL syntax error.
When I INSERT just one or the other, I don't get the error.
Here is the spider.py
import scrapy
import MySQLdb
import MySQLdb.cursors
from scrapy.http.request import Request
from youtubephase2.items import Youtubephase2Item
class youtubephase2(scrapy.Spider):
name = 'youtubephase2'
def start_requests(self):
conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
cursor = conn.cursor()
cursor.execute('SELECT resultURL FROM SearchResults;')
rows = cursor.fetchall()
for row in rows:
if row:
yield Request(row[0], self.parse, meta=dict(start_url=row[0]))
cursor.close()
def parse(self, response):
for sel in response.xpath('//a[contains(@class, "yt-uix-servicelink")]'):
item = Youtubephase2Item()
item['newurl'] = sel.xpath('@href').extract()
item['start_url'] = response.meta['start_url']
yield item
Here is the pipeline.py which shows all three self.cursor.execute statements
import MySQLdb
import MySQLdb.cursors
import hashlib
from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
from youtubephase2.items import Youtubephase2Item
class MySQLStorePipeline(object):
def __init__(self):
self.conn = MySQLdb.connect(user='uname', passwd='password', db='YouTubeScrape', host='localhost', charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
try:
#self.cursor.execute("""UPDATE SearchResults SET NewURL = %s WHERE ResultURL = %s VALUES (%s, %s)""",(item['newurl'], item['start_url']))
#self.cursor.execute("""UPDATE SearchResults SET NewURL = %s WHERE ResultURL = %s""",(item['newurl'], item['start_url']))
self.cursor.execute("""INSERT INTO TestResults (NewURL, StartURL) VALUES (%s, %s)""",(item['newurl'], item['start_url']))
self.conn.commit()
except MySQLdb.Error, e:
log.msg("Error %d: %s" % (e.args[0], e.args[1]))
return item
The topmost SQL execute statement returns this error:
2017-04-13 18:29:34 [scrapy.core.scraper] ERROR: Error processing {'newurl': [u'http://www.tagband.co.uk/'],
'start_url': u'https://www.youtube.com/watch?v=UqguztfQPho'}
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/root/scraping/youtubephase2/youtubephase2/pipelines.py", line 18, in process_item
self.cursor.execute("""UPDATE SearchResults SET AffiliateURL = %s WHERE ResultURL = %s VALUES (%s, %s)""",(item['affiliateurl'], item['start_url']))
File "/usr/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 159, in execute
query = query % db.literal(args)
TypeError: not enough arguments for format string
The middle SQL execute statement returns this error:
2017-04-13 18:33:18 [scrapy.log] INFO: Error 1064: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near ') WHERE ResultURL = 'https://www.youtube.com/watch?v=UqguztfQPho'' at line 1
2017-04-13 18:33:18 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.youtube.com/watch?v=UqguztfQPho>
{'newurl': [u'http://www.tagband.co.uk/'],
'start_url': u'https://www.youtube.com/watch?v=UqguztfQPho'}
The last SQL execute statement returns the same error as the middle even when using an INSERT in to a new database. Seems to add an extra single quote. The last one works when I INSERT only one of the items in to the database.
2017-04-13 18:36:40 [scrapy.log] INFO: Error 1064: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '), 'https://www.youtube.com/watch?v=UqguztfQPho')' at line 1
2017-04-13 18:36:40 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.youtube.com/watch?v=UqguztfQPho>
{'newurl': [u'http://www.tagband.co.uk/'],
'start_url': u'https://www.youtube.com/watch?v=UqguztfQPho'}
Sorry about the long post. Trying to be thorough.