0

I'm trying to scrape multiple tables, each of which has a table name stored under an h3 tag. There are columns of data I can scrape with no problem, and when I feed in the next URL I can append that data to the CSV file. The problem I can't solve is how to get the table heading and store it alongside each row of its table — the reason being that when the next table is processed, I need to know which table each row belongs to. Is it possible to use a len loop on, say, 'Round' to establish the table length and then write the table heading to each row? And is it possible to do this with item exports?

Here's my code spider.py

from bigcrawler.items import BigcrawlerItem
from scrapy import Spider, Request, Selector

from scrapy.selector import Selector

from bigcrawler.items import MatchStatItemLoader

class CrawlbotSpider(Spider):
    """Crawl matchstat.com tournament pages and emit one item per match row,
    tagging each row with the heading (h3) of the table it belongs to."""

    name = 'bigcrawler'
    allowed_domains = ['www.matchstat.com']
    start_urls = [
        'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
        'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
    ]

    def parse_header(self, response):
        """Yield one item per 'round' cell, carrying the page's table heading.

        NOTE(review): the original built ``BigcrawlerItem(selector=td)`` and
        called ``add_value`` on it, but those are ItemLoader APIs — a scrapy
        ``Item`` accepts neither. An item loader is used here instead.
        """
        # The heading appears once per page under the #AWS container; grab it
        # before iterating the rows so every emitted item can carry it.
        # extract_first() returns None (rather than raising IndexError as
        # extract()[0] would) when the heading is missing.
        heading = response.xpath('//*[@id="AWS"]/div/h3/text()').extract_first()

        # One text node per round cell; the selector string is kept on a
        # single logical literal (the original split it mid-string, which is
        # a syntax error).
        for td in response.xpath(
                "//tr[contains(@class, 'match')]"
                "/td[contains(@class, 'round')]/text()"):
            il = MatchStatItemLoader(selector=td)
            il.add_value('event_title', heading)
            yield il.load_item()

    def parse(self, response):
        """Default callback: scrape every match row into an item."""
        for row in response.css('tr.match'):
            il = MatchStatItemLoader(selector=row)
            il.add_css('round', '.round::text')
            il.add_css('event1', '.event-name a::text')
            il.add_css('player_1', '.player-name:nth-child(2) a::text')
            il.add_css('player_2', '.player-name:nth-child(3) a::text')
            # Odds cells are descendants carrying a 'payout' attribute;
            # select their text content.
            il.add_css('player_1_odds', '.odds-td.odds-0 [payout]::text')
            il.add_css('player_2_odds', '.odds-td.odds-1 [payout]::text')
            il.add_css('h_2_h', 'a.h2h::text')
            yield il.load_item()

items.py

   import scrapy

   from scrapy.loader import ItemLoader
   from scrapy.loader.processors import TakeFirst, MapCompose
   from operator import methodcaller
   from scrapy import Spider, Request, Selector

   class BigcrawlerItem(scrapy.Item):
       """Container for one scraped match row plus the heading of the table
       it came from. (Fields re-indented consistently — the original mixed
       5/6/7-space indents, which is an IndentationError.)"""

       # Heading (h3 text) of the table this row belongs to.
       event_title = scrapy.Field()
       # Tournament round of the match.
       round = scrapy.Field()
       event1 = scrapy.Field()
       player_1 = scrapy.Field()
       player_2 = scrapy.Field()
       player_1_odds = scrapy.Field()
       player_2_odds = scrapy.Field()
       # Head-to-head link text.
       h_2_h = scrapy.Field()


  class MatchStatItemLoader(ItemLoader):
      """ItemLoader producing BigcrawlerItem instances: strips whitespace
      from every extracted value and keeps only the first match per field.
      (Body re-indented — the original left it at the class line's level,
      which is an IndentationError.)"""

      default_item_class = BigcrawlerItem
      # Strip surrounding whitespace from every extracted string.
      default_input_processor = MapCompose(methodcaller('strip'))
      # Collapse each field's list of values to its first element.
      default_output_processor = TakeFirst()
tomoc4
  • 337
  • 2
  • 10
  • 29

2 Answers

1

If there's only one of those headings per page, the expression doesn't need to be relative to the current node; try this:

il.add_xpath('event_title', '//*[@id="AWS"]//h3/text()')

but if you need it to be relative to the current node you could also do this:

il.add_xpath('event_title', './ancestor::*[@id="AWS"]//h3/text()')
Wilfredo
  • 1,548
  • 1
  • 9
  • 9
0

I would suggest not using the Item classes at all, and using the start_requests method instead of start_urls, as they are really confusing. See the fully working code below; also notice the match_heading variable.

class CrawlbotSpider(Spider):
    """Spider that scrapes match tables as plain dicts (no Item classes),
    attaching the page's table heading to every row."""

    name = 'bigcrawler'
    allowed_domains = ['www.matchstat.com']
    # Unused: start_requests() below overrides the default behaviour.
    # Kept for reference only.
    start_urls = [
        'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
        'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
    ]

    def start_requests(self):
        """Schedule one request per tournament page."""
        match_urls = [
            'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
            'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
        ]
        for url in match_urls:
            yield Request(url=url, callback=self.parse_matches)

    def parse_matches(self, response):
        """Yield one dict per match row, tagged with the page heading."""
        # The heading appears once per page; fetch it before the loop so
        # every row records which table it came from.
        match_heading = response.xpath('//*[@id="AWS"]/div/h3/text()').extract_first()

        for row in response.css('tr.match'):
            match = {}
            match['heading'] = match_heading
            match['round'] = row.css(".round::text").extract_first()
            match['event1'] = row.css(".event-name a::text").extract_first()
            match['player_1'] = row.css(".player-name:nth-child(2) a::text").extract_first()
            match['player_2'] = row.css(".player-name:nth-child(3) a::text").extract_first()
            match['player_1_odds'] = row.css(".odds-td.odds-0 [payout]::text").extract_first()
            match['player_2_odds'] = row.css(".odds-td.odds-1 [payout]::text").extract_first()
            # fix: the original selector had a duplicated pseudo-element
            # ("a.h2h::text::text"), which is not a valid CSS selector.
            match['h_2_h'] = row.css("a.h2h::text").extract_first()
            yield match
Umair Ayub
  • 19,358
  • 14
  • 72
  • 146