I have two spiders that scrape data from web pages, and I use CrawlerProcess() to run them at the same time.
Here is the spiders' code:
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector

from myproject.items import GenDis, EPGD  # adjust to your project's items module

class GDSpider(Spider):
    name = "GenDis"
    allowed_domains = ["gold.jgi.doe.gov"]
    base_url = "https://gold.jgi.doe.gov/projects"
    stmp = []
    term = "man"
    # Pre-build the paginated search URLs (pages 1-999)
    for i in range(1, 1000):
        url = "https://gold.jgi.doe.gov/projects?page=" + str(i) + "&Project.Project+Name=" + term + "&count=25"
        stmp.append(url)
    start_urls = stmp

    def parse(self, response):
        sel = Selector(response)
        # Each result row is a <tr> with class "odd" or "even"
        sites = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for site in sites:
            item = GenDis()
            item['Id'] = site.xpath('td/a/text()').extract()
            item['Link'] = site.xpath('td/a/@href').extract()
            item['Name'] = map(unicode.strip, site.xpath('td[2]/text()').extract())
            item['Status'] = map(unicode.strip, site.xpath('td[3]/text()').extract())
            item['Add_Date'] = map(unicode.strip, site.xpath('td[4]/text()').extract())
            yield item
class EPGD_spider(Spider):
    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    term = "man"
    start_urls = ["http://epgd.biosino.org/EPGD/search/textsearch.jsp?textquery=" + term + "&submit=Feeling+Lucky"]
    # Intended per-spider database/collection names, e.g. "EPGD_man"
    MONGODB_DB = name + "_" + term
    MONGODB_COLLECTION = name + "_" + term

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        url_list = []
        base_url = "http://epgd.biosino.org/EPGD"

        for site in sites:
            item = EPGD()
            item['genID'] = map(unicode.strip, site.xpath('td[1]/a/text()').extract())
            item['genID_url'] = base_url + map(unicode.strip, site.xpath('td[1]/a/@href').extract())[0][2:]
            item['taxID'] = map(unicode.strip, site.xpath('td[2]/a/text()').extract())
            item['taxID_url'] = map(unicode.strip, site.xpath('td[2]/a/@href').extract())
            item['familyID'] = map(unicode.strip, site.xpath('td[3]/a/text()').extract())
            item['familyID_url'] = base_url + map(unicode.strip, site.xpath('td[3]/a/@href').extract())[0][2:]
            item['chromosome'] = map(unicode.strip, site.xpath('td[4]/text()').extract())
            item['symbol'] = map(unicode.strip, site.xpath('td[5]/text()').extract())
            item['description'] = map(unicode.strip, site.xpath('td[6]/text()').extract())
            yield item

        # Follow the "next page" link from the quick-page navigation
        link = sel.xpath('//span[@id="quickPage"]')
        for site in link:
            url_list.append(site.xpath('a/@href').extract())

        for i in range(len(url_list[0])):
            if url_list[0][i] == "#":
                if i + 1 < len(url_list[0]):
                    print url_list[0][i + 1]
                    actual_url = "http://epgd.biosino.org/EPGD/search/" + url_list[0][i + 1]
                    yield Request(actual_url, callback=self.parse)
                    break
                else:
                    print "The index is out of range!"
process = CrawlerProcess()
process.crawl(EPGD_spider)
process.crawl(GDSpider)
process.start() # the script will block here until all crawling jobs are finished
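For comparison, the Scrapy docs also show CrawlerProcess being given the project settings explicitly; here is a minimal sketch of that variant (get_project_settings comes from scrapy.utils.project), in case it is relevant to how pipelines get activated:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Variant from the docs: pass the project settings (which include
# ITEM_PIPELINES from settings.py) to the process explicitly.
process = CrawlerProcess(get_project_settings())
process.crawl(EPGD_spider)
process.crawl(GDSpider)
process.start()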
I want to save the scraped data to a MongoDB database. Here is my pipeline code:
import pymongo
from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem

class EPGD_pipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Item written to MongoDB database {}, collection {}, at host {}, port {}".format(
                settings['MONGODB_DB'],
                settings['MONGODB_COLLECTION'],
                settings['MONGODB_SERVER'],
                settings['MONGODB_PORT']))
        return item
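The pipeline assumes settings.py defines these keys, roughly like the sketch below (the values and the dotted path in ITEM_PIPELINES are placeholders, not my exact project layout):

# settings.py (sketch -- values and module path are placeholders)
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "testdb"
MONGODB_COLLECTION = "testcol"

ITEM_PIPELINES = {
    'myproject.pipelines.EPGD_pipeline': 300,
}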
It works correctly when I run one spider at a time. But when I run the two spiders together, the pipeline no longer seems to work: neither the database nor the collections get created.
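(By "one at a time" I mean running a single spider on its own, for example with Scrapy's standard command line:)

scrapy crawl EPGD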
I've read the CrawlerProcess() section of the Scrapy documentation many times, but it says nothing about pipelines. So can anybody tell me what's wrong with my code?