1

I'd like to parse pages and then export certain fields to one CSV file and others to another file. Using feed exports, I managed to do it for one file as follows:

settings

# Columns (and their order) for the single CSV feed export.
FEED_EXPORT_FIELDS = (
    'url',
    'group_url',
    'name',
    'streetAddress',
    'city',
    'addressCountry',
)
# Output format and path template; Scrapy substitutes %(name)s with the
# spider name and %(time)s with the crawl timestamp.
FEED_FORMAT = 'csv'
FEED_URI = 'output/%(name)s_%(time)s.csv'

But as I said, the above exports to only one CSV file. I'd like to be able to scrape other fields to another file:

# Desired second set of columns -- but FEED_EXPORT_FIELDS is a single
# global setting, so it cannot target a second output file by itself.
FEED_EXPORT_FIELDS = (
    'employee',
    'employee_group',

)

my scraper parse:

def parse(self, response):
    """Extract one company item from a detail page.

    Builds a ProductItemLoader item (despite the name, it is a
    scrapy.Item subclass) via an item loader and returns the loaded
    item. NOTE(review): every field ends up on a single item, which is
    why a single feed export file is produced.
    """
    l = BasicItemLoader(item=ProductItemLoader(), response=response)
    # Strip whitespace and escape characters from every extracted value.
    l.default_input_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    # l.default_output_processor = Compose(TakeFirst())
    # Request metadata, not page content.
    l.add_value('url', response.request.url)
    l.add_value('group_url', response.meta.get('section', ''))
    # Schema.org microdata fields.
    l.add_css('name', 'h1[itemprop="name"]::text')
    l.add_css('streetAddress', "div[itemprop=\"address\"] [itemprop=\"streetAddress\"]::text")
    l.add_css('city', "div[itemprop=\"address\"]>p::text")
    l.add_css('addressCountry', "div[itemprop=\"address\"] [itemprop=\"addressCountry\"]::text")
    # TakeFirst keeps only the first matching phone input.
    l.add_css('phone', ".phoneCompany>input[id*='freePhone']::attr(value)", TakeFirst())
    l.add_css('summary', 'span[itemprop="description"]::text')
    # Key/value table rows located by their label cell text.
    l.add_xpath('year', "//td[contains(text(),'Year established')]/following-sibling::td/text()")
    l.add_xpath('registry', "//td[contains(text(),'Registry of commerce')]/following-sibling::td/text()")
    l.add_xpath('legal_form', "//td[contains(text(),'Legal form')]/following-sibling::td/text()")
    l.add_xpath('vat', "//td[contains(text(),'VAT')]/following-sibling::td/text()")
    l.add_xpath('fax', "//td[contains(text(),'Fax')]/following-sibling::td/text()")
    l.add_css('website', "[id*='webSite_presentation_']::text")
    l.add_css('brands', "#company-tradenames .tradeName::text")
    l.add_xpath('banks', "//h3[contains(text(),'Banks')]/following-sibling::div//strong/text()")
    # Import/export blocks: the second span holds the value.
    l.add_css('export_area', "#exportZones>span:nth-of-type(2)::text")
    l.add_css('import_area', "#importZones>span:nth-of-type(2)::text")
    l.add_css('export_countries', "#exportCountries>span:nth-of-type(2)::text")
    l.add_css('import_countries', "#importCountries>span:nth-of-type(2)::text")
    l.add_css('employees', ".employees.bloc .effectif p::text")
    l.add_css('turn_over', ".turnover.bloc li:nth-of-type(1)>p:nth-of-type(2)::text")
    return l.load_item()

and items definition

class ProductItemLoader(scrapy.Item):
    # NOTE(review): despite the "Loader" suffix this is a scrapy.Item,
    # not an ItemLoader. One item holds every scraped field, which is
    # why a single feed export file is produced.

    # Source/identity fields
    url = scrapy.Field()
    group_url = scrapy.Field()
    name = scrapy.Field()
    # Address fields (schema.org microdata)
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()
    # Contact/profile fields
    phone = scrapy.Field()
    summary = scrapy.Field()
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()
    fax = scrapy.Field()
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()
    # Trade fields
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()
    employees = scrapy.Field()
    turn_over = scrapy.Field()
rafalf
  • 425
  • 7
  • 16

2 Answers

2

You will have to split your items definition into two classes to save different fields to their own CSV files.

items.py:
import scrapy

class ProductItemLoader(scrapy.Item):
    # Company/address fields -- exported to ProductItemLoader.csv by the
    # pipeline, which dispatches on the item's class name.
    url = scrapy.Field()
    group_url = scrapy.Field()
    name = scrapy.Field()
    streetAddress = scrapy.Field()
    addressCountry = scrapy.Field()
    city = scrapy.Field()
    phone = scrapy.Field()
    summary = scrapy.Field()
    year = scrapy.Field()
    registry = scrapy.Field()
    legal_form = scrapy.Field()
    vat = scrapy.Field()

class EmployeeLoader(scrapy.Item):
    # Trade/size fields -- exported to EmployeeLoader.csv by the
    # pipeline, which dispatches on the item's class name.
    fax = scrapy.Field()
    website = scrapy.Field()
    brands = scrapy.Field()
    banks = scrapy.Field()
    import_area = scrapy.Field()
    import_countries = scrapy.Field()
    export_area = scrapy.Field()
    export_countries = scrapy.Field()
    employees = scrapy.Field()
    turn_over = scrapy.Field()

pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

def item_type(item):
    """Return the item's class name as a string (e.g. 'ProductItemLoader').

    The pipeline keys its exporters by class-name strings
    (``fileNamesCsv``), so this must return the *name*, not the type
    object itself -- comparing ``type(item)`` against strings would
    never match and nothing would be exported.
    """
    return type(item).__name__

class YourSitePipelineHere(object):
    """Pipeline that writes each item class to its own CSV file.

    One CsvItemExporter is opened per class name in ``fileNamesCsv``
    when the spider opens; ``process_item`` dispatches each item to the
    exporter matching its class name.
    """

    # Must match the scrapy.Item subclass names defined in items.py.
    fileNamesCsv = ['ProductItemLoader', 'EmployeeLoader']

    def __init__(self):
        self.files = {}      # class name -> open binary file handle
        self.exporters = {}  # class name -> CsvItemExporter
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # NOTE(review): "/projec_name" writes to the filesystem root --
        # point this at a real output directory for your project.
        self.files = {name: open("/projec_name" + name + '.csv', 'wb')
                      for name in self.fileNamesCsv}
        for name in self.fileNamesCsv:
            # Bug fix: this assignment belongs inside the loop body (the
            # original had it dedented, which is a SyntaxError).
            self.exporters[name] = CsvItemExporter(self.files[name])

            if name == 'ProductItemLoader':
                self.exporters[name].fields_to_export = [
                    'url', 'group_url', 'name', 'streetAddress',
                    'addressCountry', 'city', 'phone', 'summary',
                    'year', 'registry', 'legal_form', 'vat']
                self.exporters[name].start_exporting()

            if name == 'EmployeeLoader':
                # Bug fix: 'bank' -> 'banks', matching the Field declared
                # on EmployeeLoader (a misspelled field is silently empty).
                self.exporters[name].fields_to_export = [
                    'fax', 'website', 'brands', 'banks', 'import_area',
                    'import_countries', 'export_area', 'export_countries',
                    'employees', 'turn_over']
                self.exporters[name].start_exporting()

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files.values():
            f.close()

    def process_item(self, item, spider):
        # Bug fix: dispatch on the item's class *name* -- the exporter
        # dict is keyed by strings, while the original compared the type
        # object itself against them, so no item was ever exported.
        name = type(item).__name__
        if name in self.exporters:
            self.exporters[name].export_item(item)
        return item

NeilR

NeilR
  • 46
  • 7
  • Can you state more clearly what is it that you are trying to achieve? You wrote that you want to save to two different files.. but I see that you only have one big items definition file.. just trying to understand what you are trying to accomplish here. – NeilR Jun 28 '18 at 22:07
  • I could write up another item definition class EmployeeLoader(scrapy.Item): but then how could I use it if parse returns one item definition load_item() and again how could i export it to csv fille ( EmployeeLoader ) – rafalf Jun 29 '18 at 07:20
  • Exactly, just split it into two item class definitions. Then, populate the different fields to use the two items classes and then using your pipeline, you can save the items to different csv files. If you can't visualize it, let me now and I can post a quick example for you – NeilR Jun 29 '18 at 16:00
  • I forgot to add, as long as I understand scrapy, do not use FEED_EXPORT_FIELDS as it won't allow you to achieve what you want. – NeilR Jun 29 '18 at 16:25
  • hmm... thanks for the example, the only question is now how do I register YourSitePipelineHere but I think I'll figure that out .. something like I guessITEM_PIPELINES = { 'xxxx.pipelines.YourSitePipelineHere ': 300 } – rafalf Jun 30 '18 at 17:52
  • I don't think you need to register it (double check first); if I remember correctly, your project would have a pipeline file defined, you just add the edits in it like in the example I posted. To use it, just import the items into your project from the main scrapy spider.. Look at the other site I suggested that you take a look.. All is there. – NeilR Jul 01 '18 at 20:34
1
#items.py
import scrapy
class JnuItem(scrapy.Item):
    # Seminar announcement fields exported by SeminarPipeline.
    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()

#pipelines.py
from itemadapter import ItemAdapter
from scrapy.exporters import CsvItemExporter


class SeminarPipeline:
    """Route items into webinar/workshop/conference CSV files.

    The destination is chosen by keyword lookup in the item's title;
    one CsvItemExporter per category is created lazily and reused.
    """

    # keyword found in the title -> output CSV path
    _ROUTES = {
        'webinar': 'webinar7.csv',
        'workshop': 'workshop7.csv',
        'conference': 'conference7.csv',
    }

    def __init__(self):
        self.file = None

    def open_spider(self, spider):
        self.files = {}     # keyword -> CsvItemExporter (created lazily)
        self._handles = {}  # keyword -> underlying open file object

    def close_spider(self, spider):
        for exporter in self.files.values():
            exporter.finish_exporting()
        # Bug fix: also close the file handles (the original leaked them).
        for handle in self._handles.values():
            handle.close()

    def file_name(self, item):
        """Return the exporter for this item's category, or None."""
        adopter = ItemAdapter(item)
        title = str(adopter['title']).lower()
        for keyword, path in self._ROUTES.items():
            if keyword not in title:
                continue
            # Bug fix: create each exporter once and reuse it; the
            # original opened a brand-new exporter (and re-ran
            # start_exporting) for every single item.
            if keyword not in self.files:
                handle = open(path, 'ab')
                exporter = CsvItemExporter(handle, include_headers_line=False)
                exporter.fields_to_export = ['date', 'title', 'link']
                exporter.start_exporting()
                self._handles[keyword] = handle
                self.files[keyword] = exporter
            return self.files[keyword]
        return None

    def process_item(self, item, spider):
        # Bug fix: in the original this method was indented inside
        # file_name, so Scrapy never found process_item on the pipeline
        # and nothing was written.
        exporter = self.file_name(item)
        if exporter is not None:
            # Titles matching no category are passed through unexported
            # (the original would have crashed on a None exporter).
            exporter.export_item(item)
        return item

#settings.py
ITEM_PIPELINES = {'seminar.pipelines.SeminarPipeline': 300,}
  • Please make sure to explain your answer, not just have code. This makes it easier for the OP and others who visit your post. – AlexH Oct 21 '20 at 02:44