This is my first time doing this, so I'd better apologize in advance for my rookie mistakes. I'm trying to scrape legacy.com for the first page of results from searching for a first and last name within a state. I'm new to programming, and was using scraperwiki to write the code. It worked, but I ran out of CPU time long before the 10,000-ish queries had time to process. Now I'm trying to save progress, catch when the CPU time is running low, and then resume from where it left off.
I can't get the save to work, and any help with the other parts would be appreciated as well. As of now I'm just grabbing links, but if there were a way to save the main content of the linked pages, that would be really helpful as well.
Here's my code:
# Python 2 scraper: pulls obituary search results from legacy.com.
import scraperwiki
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
# Global work file shared by the whole script: searchname() appends the
# scraped link markup here, and the tail of the script reads it back.
f = open('/tmp/workfile', 'w')
#read database, find last, start from there
def searchname(fname, lname, id, stateid):
url = 'http://www.legacy.com/ns/obitfinder/obituary-search.aspx?daterange=Last1Yrs&firstname= %s &lastname= %s &countryid=1&stateid=%s&affiliateid=all' % (fname, lname, stateid)
obits=urlopen(url)
soup=BeautifulSoup(obits)
obits_links=soup.findAll("div", {"class":"obitName"})
print obits_links
s = str(obits_links)
id2 = int(id)
f.write(s)
#save the database here
scraperwiki.sqlite.save(unique_keys=['id2'], data=['id2', 'fname', 'lname', 'state_id', 's'])
# Import Data from CSV
import scraperwiki
data = scraperwiki.scrape("https://dl.dropbox.com/u/14390755/legacy.csv")
import csv
reader = csv.DictReader(data.splitlines())
for row in reader:
#scraperwiki.sqlite.save(unique_keys=['id'], 'fname', 'lname', 'state_id', data=row)
FNAME = str(row['fname'])
LNAME = str(row['lname'])
ID = str(row['id'])
STATE = str(row['state_id'])
print "Person: %s %s" % (FNAME,LNAME)
searchname(FNAME, LNAME, ID, STATE)
f.close()
f = open('/tmp/workfile', 'r')
data = f.read()
print data