This is my code:
import scrapy
from scrapy import Spider
from scrapy.http import FormRequest
class ProvinciaSpider(Spider):
name = 'provincia'
allowed_domains = ['aduanet.gob.pe']
start_urls = ['http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias?accion=cargaConsultaManifiesto&tipoConsulta=salidaProvincia']
def parse(self, response):
data ={ 'accion': 'consultaManifExpProvincia',
'salidaPro': 'YES',
'strMenu': '-',
'strEmpTransTerrestre': '-',
'CMc1_Anno': '2022',
'CMc1_Numero': '96',
'CG_cadu': '046',
'viat': '1'}
yield FormRequest('http://www.aduanet.gob.pe/cl-ad-itconsmanifiesto/manifiestoITS01Alias', formdata=data, callback=self.parse_form_page)
def parse_form_page(self, response):
table = response.xpath('/html/body/form[1]//td[@class="beta"]/table')
trs = table.xpath('.//tr')[1:]
for tr in trs:
puerto_llegada= tr.xpath('.//td[1]/text()').extract_first().strip()
pais= tr.xpath('.//td[1]/text()').extract_first().strip()
bl= tr.xpath('.//td[3]/text()').extract_first().strip()
peso= tr.xpath('.//td[8]/text()').extract_first().strip()
bultos= tr.xpath('.//td[9]/text()').extract_first().strip()
consignatario= tr.xpath('.//td[12]/text()').extract_first().strip()
embarcador= tr.xpath('.//td[13]/text()').extract_first().strip()
links=tr.xpath('.//td[4]/a/@href')
yield response.follow(links.get(),
callback=self.parse_categories,
meta={'puerto_llegada': puerto_llegada,
'pais': pais,
'bl': bl,
'peso': float("".join(peso.split(','))),
'bultos': float("".join(bultos.split(','))),
'consignatario': consignatario,
'embarcador': embarcador})
def parse_categories(self, response):
puerto_llegada = response.meta['puerto_llegada']
pais = response.meta['pais']
bl = response.meta['bl']
peso = response.meta['peso']
bultos = response.meta['bultos']
consignatario = response.meta['consignatario']
embarcador = response.meta['embarcador']
tabla_des= response.xpath('/html/body/form//td[@class="beta"]/table')
trs3= tabla_des.xpath('.//tr')[1:]
for tr3 in trs3:
descripcion= tr.xpath('.//td[7]/text()').extract_first().strip()
yield {'puerto_llegada': puerto_llegada,
'pais': pais,
'bl': bl,
'peso': PROCESOS,
'bultos': bultos,
'consignatario': consignatario,
'embarcador': embarcador,
'descripcion': descripcion}
And I get this error:
ValueError: Missing scheme in request url: javascript:jsDetalle2('154');
Every link that I want to extract data from has that format, so my code for extracting the data inside each link doesn't work.
The link format is like javascript:jsDetalle2('154'), only the numbers change.
The problem is that it isn't http//........ or /manifiesto...... in the first case you only have to follow the link and that's all, in the second case you have to join the second part of the URL with the first response URL. But this case is none, so I don't know how to make it work.
How can I write it in order to work?