I have multiple spiders with different items, and I want to export each item into a different CSV file. I used the code example from How can scrapy export items to separate csv files per item, but there is a problem.

Right now my spider only writes the "page" item. All items are filled when I check them in the shell, but the other CSV files stay empty. I debugged the pipeline but haven't found an error so far.
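For debugging, this is roughly the logging I added to process_item of the MultiCSV pipeline (shown further down) to check which item types actually reach the pipeline; the log message itself is only illustrative:

    # inside MultiCSVItemPipeline (requires "import logging" at module level)
    def process_item(self, item, spider):
        what = item_type(item)
        # log every item type that arrives, to see whether anything besides 'page' gets here
        logging.info("pipeline received item type %r from spider %s", what, spider.name)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item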

Here is my spider:

import csv

import scrapy
from BeautifulSoup import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule

from DataSpiders import CSV_PATH
from ScrapingItems import TrierDeItem
from SuperSpider import SuperSpider

HTML_PATH = 'pages/trier.de/'


class TrierDeSpider(scrapy.Spider, SuperSpider):
    name = 'trierDeSpider'

    allowed_domains = ['trier.de']
    denied_domains = []
    start_urls = [
        'https://www.trier.de/rathaus-buerger-in/trier-in-zahlen/',
        'https://trier.de/startseite/',
        'https://www.trier.de/leben-in-trier/',
        'https://www.trier.de/kultur-freizeit/',
        'https://www.trier.de/wirtschaft-arbeit/',
        'https://www.trier.de/bildung-wissenschaft/',
        'https://www.trier.de/bauen-wohnen/',
        'https://www.trier.de/umwelt-verkehr/',
    ]
    # The spider starts crawling from start_urls and follows links according to the rule below
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse', follow=True),)

    def parse(self, response):
        """
        Parse the page body for links. Follow links within the allowed domains by yielding a new request
        for each link, with parse_page as the callback.
        :param response:
        :return:
        """
        for link in LxmlLinkExtractor(allow=self.allowed_domains, deny=self.denied_domains).extract_links(response):
            yield scrapy.Request(response.urljoin(link.url), callback=self.parse_page)

    def parse_page(self, response):
        """
        Parse the current page for information.
        :param response: 
        :return: 
        """
        trier_de_item = TrierDeItem()
        yield self.parse_general_page_info(response, HTML_PATH)
        # extract the page url
        trier_de_item["url"] = response.url
        # extract the crawling datetime
        trier_de_item["crawling_date_time"] = response.headers['Date']
        # extract page title
        trier_de_item["title"] = response.css('title::text').extract()
        # extract description tags
        trier_de_item["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        trier_de_item["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        # extract all page headers
        trier_de_item["news_title"] = response.xpath('//div[@class="dachzeile"]/text()').extract()
        # extract topic
        trier_de_item["topic"] = response.xpath('//div[@class="topic"]/text()').extract()
        # extract headlines
        trier_de_item['headlines'] = response.xpath('//h1/text()').extract()

        # check if page contains a table
        table = response.xpath('//table[@class="datentabelle"]').extract()
        if len(table) > 0:
            self.parse_table(response.body, trier_de_item['headlines'][0])
        yield trier_de_item

    @staticmethod
    def parse_table(body_html, title):
        '''
        Parse HTML Page with table and save to csv file
        :param body_html:
        :param title:
        :return:
        '''
        title = title.replace('/', '')
        try:
            # Create Filename from title
            filename = title + '.csv'
            soup = BeautifulSoup(body_html)
            soup.prettify('utf-8')
            content = []
            # find all tables in html
            tables = soup.findAll('table')
            for table in tables:
                # find each table row
                for row in table.findAll('tr'):
                    # collect the text of each header cell and data cell into one line
                    line = []
                    for header in row.findAll('th'):
                        if ' ' in header.text:
                            line.append('')
                        else:
                            line.append(header.text)
                    for cell in row.findAll('td'):
                        if ' ' in cell.text:
                            line.append('')
                        else:
                            line.append(cell.text)
                    content.append(line)
            # Open a new csv file and write each line to the file
            with open(CSV_PATH + filename, 'wb') as csv_file:
                wr = csv.writer(csv_file)
                for line in content:
                    wr.writerow(line)
        except Exception as e:
            print(e)
            pass

SuperSpider:

import urlparse

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

from DataSpiders import write_html
from DataSpiders.ScrapingItems import PageItem, BaseItem

ALLOWED_FILE_TYPES = ('.pdf', '.csv', '.xls', '.xlsx')


class SuperSpider:
    def __init__(self):
        pass

    def url_join(self, urls, response):
        '''
        Join URL with response
        :param urls:
        :param response:
        :return:
        '''
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))

        return joined_urls

    def parse_general_page_info(self, response, HTML_PATH):
        page_item = PageItem()
        page_item["url"] = response.url
        # save the response body to an html file
        if 'jsp' in response.url:
            url = response.url.split('.jsp')
            write_html(url[0], response.body, HTML_PATH)
        elif '?' in response.url:
            url = response.url.split('?')
            write_html(url[0], response.body, HTML_PATH)
        else:
            write_html(response.url, response.body, HTML_PATH)
        # Search for files that contain any allowed file type
        found_files = []
        domain = response.url.split('/')[2]
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith(ALLOWED_FILE_TYPES):
                link = urlparse.urljoin(domain, link)
                found_files.append(link)
        # extract all referring links
        extractor = LxmlLinkExtractor()
        linklist = []
        for link in extractor.extract_links(response):
            # extract links which contain a file in url and add those to 'found_files' for downloading
            if '?imgUid' in link.url:
                fullpath = link.url
                path = fullpath.split('.de')[1]
                found_files.append(urlparse.urljoin(domain, path))
            else:
                linklist.append(link.url)
        page_item["links"] = linklist
        # add all found files to the page item
        page_item["file_urls"] = self.url_join(found_files, response)
        # extract page title
        page_item["title"] = response.css('title::text').extract()

        # extract all image urls
        relative_img_urls = response.css("img::attr(src)").extract()
        page_item["image_urls"] = self.url_join(relative_img_urls, response)

        return page_item

    def parse_base_page_information(self, response):
        baseItem = BaseItem()
        baseItem["url"] = response.url
        # extract page title
        baseItem["title"] = response.css('title::text').extract()
        baseItem["crawling_date_time"] = response.headers['Date']
        # extract description tags
        baseItem["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        baseItem["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        baseItem['headlines'] = response.xpath('//h1/text()').extract()
        return baseItem

ScrapingItems:

from scrapy import Item, Field


class PageItem(Item):
    url = Field()
    title = Field()
    image_urls = Field()
    file_urls = Field()
    links = Field()


class BaseItem(Item):
    url = Field()
    title = Field()
    crawling_date_time = Field()
    description = Field()
    og_description = Field()
    headlines = Field()


class TrierDeItem(BaseItem):
    news_title = Field()
    tag = Field()
    topic = Field()

And the Multi CSV Pipeline:

class MultiCSVItemPipeline(object):
    CSVPath = "csv_data/"
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVPath + name + '.csv', 'ab')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item


def item_type(item):
    '''
    Returns the scraping item name
    :param item:
    :return:
    '''
    return type(item).__name__.replace('Item', '').lower()
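To rule out a naming mismatch between the item classes and the SaveTypes list, item_type can be checked directly in a shell (purely illustrative, using the item_type function above with the project on the Python path):

from DataSpiders.ScrapingItems import PageItem, TrierDeItem

# item_type() lower-cases the class name and strips the 'Item' suffix, so these
# results must match entries in SaveTypes for the exporter lookup to succeed
print(item_type(PageItem()))     # 'page'
print(item_type(TrierDeItem()))  # 'trierde'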

I haven't found a solution to this yet, but I tried several things that all failed:

  • yielding a list of items, which doesn't work with Scrapy (see the sketch after this list)
  • yielding only one item and creating two parse methods, one for page_item and one for trier_item
  • deleting all SaveTypes except 'trierde'; the spider still didn't write anything
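For clarity on the first point: a Scrapy callback can yield several items one after another, but it cannot yield a plain list of items in a single yield. A minimal sketch of the callback, using the item classes defined above:

from DataSpiders.ScrapingItems import PageItem, TrierDeItem

# method of the spider class
def parse_page(self, response):
    # this works: yield each item on its own
    yield PageItem(url=response.url)
    yield TrierDeItem(url=response.url)
    # this does not: Scrapy rejects a plain list returned from a callback
    # yield [PageItem(url=response.url), TrierDeItem(url=response.url)]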

Given the things I tried, I believe there is some error in the pipeline itself. I appreciate any help anybody can offer.

Additional info: before changing my pipeline to MultiCSV, I was able to save each item to CSV.

1 Answer


After I wasn't able to fix the problem with the Scrapy exporter, I decided to create my own exporter.

Here's the code for everyone who wants to export multiple different items to different CSV files from one or more spiders. It has worked for me so far, but I'm still checking the code for errors. Feel free to reply if you have ideas for improvement.

class MultiCSVItemPipeline(object):
    # Subfolder path, where the csv files are stored
    CSVPath = "csv_data/"
    # All allowed items
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']
    # List for already checked csv headers
    CheckedHeaders = []

    def __init__(self):
        import sys
        reload(sys)
        sys.setdefaultencoding('utf8')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # Check whether the csv files exist and create them if they don't
        for file in set(self.SaveTypes):
            f = open(self.CSVPath + file + '.csv', 'a+')
            f.close()

    def spider_closed(self, spider):
        #  not needed anymore
        # [e.finish_exporting() for e in self.exporters.values()]
        # [f.close() for f in self.files.values()]
        pass

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            try:
                # Check whether the csv file has a header, but only for item types that haven't been checked yet
                if what not in self.CheckedHeaders:
                    self.check_header(what, item)
                self.write_item_to_row(item, what)
            except Exception as e:
                logging.error("########################################################")
                logging.error("Error writing to " + what + ".csv file ")
                logging.error("Error Message: " + e.message)
                logging.error("Error Reason: " + e.reason)
                logging.error("Error Object: " + e.object)
                logging.error("########################################################")
        return item

    def write_item_to_row(self, item, what):
        """
        Write a single item to a row in csv file
        :param item:
        :param what:
        :return:
        """
        ofile = open(self.CSVPath + what + '.csv', "ab")
        writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        item_dict = item.__dict__['_values']
        row = []
        for k in item_dict:
            d = item_dict[k]
            # If the value is not a list, join it directly to a string; otherwise join the list with commas.
            # In both cases strip tabs and newlines and encode to utf-8
            if not isinstance(d, types.ListType):
                value = ''.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            else:
                value = ','.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            row.append(value)
        writer.writerow(row)
        ofile.close()

    def check_header(self, what, item):
        """
        Check if the file contains header elements and create if missing
        :param what:
        :param item:
        :return:
        """
        try:
            with open(self.CSVPath + what + '.csv', 'ab+') as csvfile:
                writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                item_dict = item.__dict__['_values']
                # If file is empty, create new csv header
                if os.stat(self.CSVPath + what + '.csv').st_size == 0:
                    self.write_csv_header(item_dict, writer)
                else:
                    # Read first row and check header elements
                    read_csv = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                    first_row = read_csv.next()
                    # if not all headers are present in the csv file, log a warning
                    if not self.check_key_in_csv_header(item_dict, first_row):
                        # TODO: Add missing header to the csv file
                        logging.warning("Wrong headers for file " + what + ".csv")
                self.CheckedHeaders.append(what)
                csvfile.close()
                return True
        except Exception as e:
            logging.error(e.message)
            return False

    @staticmethod
    def write_csv_header(item_dict, writer):
        """
        Write header of a csv file.
        The header is written from the keys of the scrapy item
        :param item_dict:
        :param writer:
        :return:
        """
        first_row = []
        for k in item_dict:
            # Join each Key to a string, delete delimiters and encode to utf-8
            value = ''.join(k).replace('\t', '').replace('\n', '').encode('utf8')
            first_row.append(value)
        writer.writerow(first_row)

    @staticmethod
    def check_key_in_csv_header(item_dict, row):
        """
        Check, for each item key, if it's contained in the first line of the csv
        k (key) stands for each dictionary key of the scrapy item.
        :param item_dict:
        :param row:
        :return:
        """
        for k in item_dict:
            if k not in row:
                return False
        return True
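One more note: like any item pipeline, this one still has to be enabled in settings.py. The dotted path below is an assumption about the project layout (a pipelines module inside DataSpiders), so adjust it as needed:

# settings.py -- the module path is an assumption, adjust it to your project
ITEM_PIPELINES = {
    'DataSpiders.pipelines.MultiCSVItemPipeline': 300,
}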