Can somebody help me find the error in my code?
I run "scrapy crawl provincia -o table_data_results.csv" from the command line, but the resulting CSV file is empty when I open it in Excel. I don't think the spider is scraping anything.
from scrapy import Spider
from scrapy.http import FormRequest
class ProvinciaSpider(Spider):
    """Spider that submits the manifest query form and scrapes the result table.

    ``parse`` receives the page listed in ``start_urls`` and POSTs the query
    parameters; ``parse_form_page`` reads the rows of the returned table and
    yields one dict per data row.
    """

    name = 'provincia'
    allowed_domains = ['aduanet.gob.pe']
    start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']

    # Browsers insert <tbody> into tables when rendering, so an XPath copied
    # from the developer tools usually contains it, while the raw HTML that
    # Scrapy downloads often does not.  Try the path without <tbody> first and
    # fall back to the original path so both page shapes are handled.
    _TABLE_XPATH = '/html/body/form[1]/table[5]/tr/td/table/tr[1]/td/table'
    _TABLE_XPATH_TBODY = '/html/body/form[1]/table[5]/tbody/tr/td/table/tbody/tr[1]/td/table'

    # (item key, 1-based <td> position) in output order.
    # XPath positions start at 1: the original ``td[0]`` can never match, which
    # made ``extract_first()`` return None and ``.strip()`` raise.
    # NOTE(review): 'puerto_llegada' and 'pais' read the same column, exactly
    # as in the original code; the remaining positions are unchanged.  If they
    # were counted from zero, each needs +1 -- verify against the real page.
    _COLUMNS = (
        ('puerto_llegada', 1),
        ('pais', 1),
        ('bl', 2),
        ('peso', 7),
        ('bultos', 8),
        ('consignatario', 11),
        ('embarcador', 12),
    )

    def parse(self, response):
        """POST the query form; the reply is handled by ``parse_form_page``."""
        data = {
            'accion': 'consultaManifExpProvincia',
            'salidaPro': 'YES',
            'strMenu': '-',
            'strEmpTransTerrestre': '-',
            'CMc1_Anno': '2022',
            'CMc1_Numero': '96',
            'CG_cadu': '046',
            'viat': '1',
        }
        yield FormRequest(
            'http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias',
            formdata=data,
            callback=self.parse_form_page,
        )

    @staticmethod
    def _cell_text(row, position):
        """Return the stripped text of the ``position``-th <td> of ``row``.

        ``position`` is 1-based.  An absent or empty cell yields ``''`` instead
        of raising, so one incomplete row does not abort the whole crawl.
        """
        return row.xpath('.//td[%d]/text()' % position).get(default='').strip()

    def parse_form_page(self, response):
        """Yield one dict per data row of the result table.

        The first ``<tr>`` is skipped (treated as the header row, as in the
        original code).  Yields nothing when the table is not found.
        """
        # An empty SelectorList is falsy, so ``or`` falls through to the
        # <tbody> variant only when the first path matched nothing.
        table = response.xpath(self._TABLE_XPATH) or response.xpath(self._TABLE_XPATH_TBODY)
        for tr in table.xpath('.//tr')[1:]:
            yield {key: self._cell_text(tr, position) for key, position in self._COLUMNS}
EDIT: I would also like to add the following to my code:
# Fragment: presumably meant to run inside the row loop of parse_form_page
# (it uses that loop's `tr`, plus `response` and `self`).
# It selects the text of the row's 4th <td> and follows it as a URL, handing
# the resulting response to parse_categories.
# NOTE(review): text() selects the cell's text node; if the cell contains an
# <a> element, the URL is presumably in a/@href instead -- confirm on the page.
links=tr.xpath('.//td[4]/text()')
# NOTE(review): links.get() returns None when nothing matched -- confirm a
# value is always present before passing it to response.follow.
yield response.follow(links.get(), callback= self.parse_categories)
def parse_categories(self, response):
    """Read the text of the 7th <td> from each data row of the detail table.

    The table is located with
    ``/html/body/form//td[@class="beta"]/table``; its first ``<tr>`` is
    skipped.  The value is only computed, not yielded or returned -- exactly
    as in the original snippet.

    NOTE(review): to get ``descripcion`` into the exported item, it has to be
    yielded from this callback (with the row fields passed in from the
    requesting callback) -- confirm the intended design.
    """
    tabla_des = response.xpath('/html/body/form//td[@class="beta"]/table')
    trs3 = tabla_des.xpath('.//tr')[1:]
    for tr3 in trs3:
        # The original read from ``tr``, which is not defined in this method
        # (NameError); the loop variable here is ``tr3``.
        # default='' keeps a missing cell from raising on .strip().
        descripcion = tr3.xpath('.//td[7]/text()').get(default='').strip()
and I want the yielded item to look like this:
# Fragment: the desired output item, with the row fields plus `descripcion`.
# float("".join(x.split(','))) removes every ',' from the string and converts
# the remainder to float.
# NOTE(review): in the code shown, `descripcion` is assigned only inside
# parse_categories, while the other names are assigned in parse_form_page.
# Presumably the row values must be passed to parse_categories (e.g. via the
# request's cb_kwargs) and this item yielded there -- confirm.
yield {'puerto_llegada': puerto_llegada,
'pais': pais,
'bl': bl,
'peso': float("".join(peso.split(','))),
'bultos': float("".join(bultos.split(','))),
'consignatario': consignatario,
'embarcador': embarcador,
'descripcion': descripcion}
Where should I put it?