I have multiple spiders with different items, and I want to export each item type into a separate CSV file. I used the code example from How can scrapy export items to separate csv files per item, but there is a problem.
Right now my spider only writes the "page" item. All items are filled when I check them in the shell, but the other CSV files stay empty. I debugged the pipeline, but I haven't found an error so far.
Here is my spider:
import csv
import scrapy
from BeautifulSoup import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule
from DataSpiders import CSV_PATH
from ScrapingItems import TrierDeItem
from SuperSpider import SuperSpider
HTML_PATH = 'pages/trier.de/'
class TrierDeSpider(scrapy.Spider, SuperSpider):
    name = 'trierDeSpider'
    allowed_domains = ['trier.de']
    denied_domains = []
    start_urls = [
        'https://www.trier.de/rathaus-buerger-in/trier-in-zahlen/',
        'https://trier.de/startseite/',
        'https://www.trier.de/leben-in-trier/',
        'https://www.trier.de/kultur-freizeit/',
        'https://www.trier.de/wirtschaft-arbeit/',
        'https://www.trier.de/bildung-wissenschaft/',
        'https://www.trier.de/bauen-wohnen/',
        'https://www.trier.de/umwelt-verkehr/',
    ]
    # Set the starting point for the spider; crawling starts from start_urls
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse', follow=True),)
    def parse(self, response):
        """
        Parse the page body for links. Follow links within the allowed domains by yielding a request for each one,
        with parse_page as the callback for the linked page.
        :param response:
        :return:
        """
        for link in LxmlLinkExtractor(allow=self.allowed_domains, deny=self.denied_domains).extract_links(response):
            yield scrapy.Request(response.urljoin(link.url), callback=self.parse_page)
    def parse_page(self, response):
        """
        Parse the current page for information.
        :param response:
        :return:
        """
        trier_de_item = TrierDeItem()
        yield self.parse_general_page_info(response, HTML_PATH)
        # extract the page url
        trier_de_item["url"] = response.url
        # extract the crawling datetime
        trier_de_item["crawling_date_time"] = response.headers['Date']
        # extract page title
        trier_de_item["title"] = response.css('title::text').extract()
        # extract description tags
        trier_de_item["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        trier_de_item["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        # extract all page headers
        trier_de_item["news_title"] = response.xpath('//div[@class="dachzeile"]/text()').extract()
        # extract topic
        trier_de_item["topic"] = response.xpath('//div[@class="topic"]/text()').extract()
        # extract headlines
        trier_de_item['headlines'] = response.xpath('//h1/text()').extract()
        # check if page contains a table
        table = response.xpath('//table[@class="datentabelle"]').extract()
        if len(table) > 0:
            self.parse_table(response.body, trier_de_item['headlines'][0])
        yield trier_de_item
    @staticmethod
    def parse_table(body_html, title):
        '''
        Parse an HTML page containing a table and save the table to a csv file
        :param body_html:
        :param title:
        :return:
        '''
        title = title.replace('/', '')
        try:
            # Create filename from title
            filename = title + '.csv'
            soup = BeautifulSoup(body_html)
            soup.prettify('utf-8')
            content = []
            # find all tables in the html
            tables = soup.findAll('table')
            for table in tables:
                # find each table row
                for row in table.findAll('tr'):
                    # extract the text of each header and data cell into a line
                    line = []
                    for header in row.findAll('th'):
                        if ' ' in header.text:
                            line.append('')
                        else:
                            line.append(header.text)
                    for cell in row.findAll('td'):
                        if ' ' in cell.text:
                            line.append('')
                        else:
                            line.append(cell.text)
                    content.append(line)
            # Open a new csv file and write each line to the file
            with open(CSV_PATH + filename, 'wb') as csv_file:
                wr = csv.writer(csv_file)
                for line in content:
                    wr.writerow(line)
        except Exception as e:
            print(e)
            pass
SuperSpider:
import urlparse
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from DataSpiders import write_html
from DataSpiders.ScrapingItems import PageItem, BaseItem
ALLOWED_FILE_TYPES = ('.pdf', '.csv', '.xls', '.xlsx')
class SuperSpider:
    def __init__(self):
        pass

    def url_join(self, urls, response):
        '''
        Join URL with response
        :param urls:
        :param response:
        :return:
        '''
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))
        return joined_urls
    def parse_general_page_info(self, response, HTML_PATH):
        page_item = PageItem()
        page_item["url"] = response.url
        # extract response body
        if 'jsp' in response.url:
            url = response.url.split('.jsp')
            write_html(url[0], response.body, HTML_PATH)
        elif '?' in response.url:
            url = response.url.split('?')
            write_html(url[0], response.body, HTML_PATH)
        else:
            write_html(response.url, response.body, HTML_PATH)
        # Search for files that have any allowed file type
        found_files = []
        domain = response.url.split('/')[2]
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith(ALLOWED_FILE_TYPES):
                link = urlparse.urljoin(domain, link)
                found_files.append(link)
        # extract all referring links
        extractor = LxmlLinkExtractor()
        linklist = []
        for link in extractor.extract_links(response):
            # extract links which contain a file in the url and add those to 'found_files' for downloading
            if '?imgUid' in link.url:
                fullpath = link.url
                path = fullpath.split('.de')[1]
                found_files.append(urlparse.urljoin(domain, path))
            else:
                linklist.append(link.url)
        page_item["links"] = linklist
        # add all files to the page_item
        page_item["file_urls"] = self.url_join(found_files, response)
        # extract page title
        page_item["title"] = response.css('title::text').extract()
        # extract all image urls
        relative_img_urls = response.css("img::attr(src)").extract()
        page_item["image_urls"] = self.url_join(relative_img_urls, response)
        return page_item
    def parse_base_page_information(self, response):
        baseItem = BaseItem()
        baseItem["url"] = response.url
        # extract page title
        baseItem["title"] = response.css('title::text').extract()
        baseItem["crawling_date_time"] = response.headers['Date']
        # extract description tags
        baseItem["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        baseItem["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        baseItem['headlines'] = response.xpath('//h1/text()').extract()
        return baseItem
ScrapingItems:
from scrapy import Item, Field
class PageItem(Item):
    url = Field()
    title = Field()
    image_urls = Field()
    file_urls = Field()
    links = Field()


class BaseItem(Item):
    url = Field()
    title = Field()
    crawling_date_time = Field()
    description = Field()
    og_description = Field()
    headlines = Field()


class TrierDeItem(BaseItem):
    news_title = Field()
    tag = Field()
    topic = Field()
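Since TrierDeItem subclasses BaseItem, it should carry the parent fields (url, title, crawling_date_time, ...) in addition to its own. A quick sanity check in a Python shell against the definitions above (just a sketch):

from DataSpiders.ScrapingItems import TrierDeItem

# Scrapy items inherit declared fields from parent Item classes
print(sorted(TrierDeItem.fields.keys()))
# should print something like:
# ['crawling_date_time', 'description', 'headlines', 'news_title',
#  'og_description', 'tag', 'title', 'topic', 'url']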
And the Multi CSV Pipeline:
from scrapy import signals
from scrapy.exporters import CsvItemExporter  # scrapy.contrib.exporter in older Scrapy versions
from scrapy.xlib.pydispatch import dispatcher  # or: from pydispatch import dispatcher


class MultiCSVItemPipeline(object):
    CSVPath = "csv_data/"
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVPath + name + '.csv', 'ab')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item

def item_type(item):
    '''
    Returns the scraping item name
    :param item:
    :return:
    '''
    return type(item).__name__.replace('Item', '').lower()
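For reference, item_type maps the item class names onto the SaveTypes keys like this (a quick shell check, based on the item definitions shown above):

from DataSpiders.ScrapingItems import PageItem, TrierDeItem

print(item_type(PageItem()))     # -> 'page'
print(item_type(TrierDeItem()))  # -> 'trierde'

Both names appear in SaveTypes, so both item types should end up routed to an exporter.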
I haven't found a solution yet, but I have tried several things that failed:
- yielding a list of items, which doesn't work with Scrapy (see the sketch after this list)
- yielding only one item per callback and creating two parse methods, one for page_item and one for trier_item
- deleting all SaveTypes except 'trierde'; the spider didn't write anything
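Regarding the first point: as I understand it, a callback has to yield each item on its own rather than a list, which is what parse_page already does. A minimal sketch of that structure (mirroring the code above, details omitted):

def parse_page(self, response):
    # yield the PageItem built by the shared helper ...
    yield self.parse_general_page_info(response, HTML_PATH)
    # ... and yield the TrierDeItem separately, instead of returning [page_item, trier_de_item]
    trier_de_item = TrierDeItem()
    trier_de_item["url"] = response.url
    # (remaining fields filled as shown above)
    yield trier_de_item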
Given the options I have tried, I believe there is some error in the pipeline itself... I appreciate any help anybody can offer.
Additional info: before changing my pipeline to MultiCSV, I was able to save each item to CSV.
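For completeness, the pipeline is enabled in settings.py along these lines (the 'DataSpiders.pipelines' path is just how I refer to it here, not necessarily the exact module name):

ITEM_PIPELINES = {
    # module path assumed; adjust to wherever MultiCSVItemPipeline lives
    'DataSpiders.pipelines.MultiCSVItemPipeline': 300,
}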