I'm trying to scrape multiple tables which have a table name stored under a h3 tag. There is Columns of data I can scrape no problem and when I feed the next url I can append this data to the csv file. The problem I can't solve is to get the table header and store this relative to each row of table. The reason for this is when the next table is fed I need to know which table it belongs. Is it possible to use a len loop on we say 'Round' to establish the table length and then write the table header to each row? is it possible to do with item exports?
Here's my code spider.py
from bigcrawler.items import BigcrawlerItem
from scrapy import Spider, Request, Selector
from scrapy.selector import Selector
from bigcrawler.items import MatchStatItemLoader
class CrawlbotSpider(Spider):
name = 'bigcrawler'
allowed_domains = ['www.matchstat.com']
start_urls = [
'https://matchstat.com/tennis/tournaments/w/Taipei/2015',
'https://matchstat.com/tennis/tournaments/w/Hong%20Kong/2017',
]
def parse_header(self , response):
hxs = Selector(response)
for tb in hxs.css('tr.match'):
heading = tb.xpath('//*[@id="AWS"]/div/h3/text()').extract()[0]
for td in tb.xpath(".//tr[contains(@class,
'match')]/td[contains(@class, 'round')]/text()"):
il = BigcrawlerItem(selector=td)
il.add_value('event_title' , heading)
yield il.load_item()
def parse(self , response):
for row in response.css('tr.match'):
il = MatchStatItemLoader(selector=row)
il.add_css('round' , '.round::text')
il.add_css('event1' , '.event-name a::text')
il.add_css('player_1' , '.player-name:nth-child(2) a::text')
il.add_css('player_2' , '.player-name:nth-child(3) a::text')
il.add_css('player_1_odds' , '.odds-td.odds-0
[payout]::text')
il.add_css('player_2_odds' , '.odds-td.odds-1
[payout]::text')
il.add_css('h_2_h' , 'a.h2h::text')
yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller
from scrapy import Spider, Request, Selector
class BigcrawlerItem(scrapy.Item):
# define the fields for your item here like:
event_title = scrapy.Field()
round = scrapy.Field()
event1 = scrapy.Field()
player_1 = scrapy.Field()
player_2 = scrapy.Field()
player_1_odds = scrapy.Field()
player_2_odds = scrapy.Field()
h_2_h = scrapy.Field()
class MatchStatItemLoader(ItemLoader):
default_item_class = BigcrawlerItem
default_input_processor = MapCompose(methodcaller('strip'))
default_output_processor = TakeFirst()