I'm crawling some directory listings built with ASP.NET WebForms via Scrapy.
The pagination links are encoded as:
javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')
where X is an integer between 1 and 180. The URL stays exactly the same whichever page I click, because __doPostBack doesn't navigate anywhere: it fills two hidden inputs and re-submits the page's form to the same address.
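As far as I can tell, the POST body the browser sends for page X looks roughly like this (the values are placeholders; the real ones sit in the current page's HTML):

# Rough shape of the pagination postback; values are illustrative.
payload = {
    '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',  # control that fired the postback
    '__EVENTARGUMENT': 'X',                      # target page number
    '__VIEWSTATE': '<long base64 blob from the current page>',
    '__CSRFTOKEN': '<anti-forgery token from the current page>',
}

Here is my spider so far; it only ever extracts the links on the first page: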
# -*- coding: utf-8 -*-
import re

import scrapy
from bs4 import BeautifulSoup
class nnggzySpider(scrapy.Spider):
    name = 'nnggzygov'
    start_urls = [
        'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'
    ]
    base_url = 'https://www.nnggzy.org.cn'
    custom_settings = {
        'LOG_LEVEL': 'ERROR'
    }
    def parse(self, response):
        _response = response.text
        soup = BeautifulSoup(response.body, 'html.parser')
        # Links to the individual notices on the current page
        tags = soup.find_all('a', href=re.compile(r"InfoDetail"))
        # Collect the hidden fields needed for the pagination postback
        __VIEWSTATE = re.findall(r'id="__VIEWSTATE" value="(.*?)" />', _response)
        A = __VIEWSTATE[0]
        __EVENTTARGET = 'MoreInfoListZbgs1$Pager'
        B = __EVENTTARGET
        __CSRFTOKEN = re.findall(r'id="__CSRFTOKEN" value="(.*?)" />', _response)
        C = __CSRFTOKEN[0]  # findall returns a list, so take the first match
        # The pager links are titled "转到第N页" ("go to page N"); the last one
        # tells us how many pages there are
        page_num = re.findall(r'title="转到第(.*?)页"', _response)
        max_page = page_num[-1]
        content = {
            '__VIEWSTATE': A,
            '__EVENTTARGET': B,
            '__CSRFTOKEN': C,
            'page_num': max_page
        }
        infoid = re.findall(r'InfoID=(.*?)&CategoryNum', _response)
        print(infoid)
        yield scrapy.Request(url=response.url, callback=self.parse_detail, meta={"data": content})
    def parse_detail(self, response):
        max_page = response.meta['data']['page_num']
        # range() excludes the stop value, so go up to max_page + 1
        for i in range(2, int(max_page) + 1):
            data = {
                '__CSRFTOKEN': response.meta['data']['__CSRFTOKEN'],
                '__VIEWSTATE': response.meta['data']['__VIEWSTATE'],
                '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                '__EVENTARGUMENT': str(i),
                # '__VIEWSTATEENCRYPTED': '',
                # 'txtKey': ''
            }
            yield scrapy.FormRequest(url=response.url, callback=self.parse, formdata=data, dont_filter=True)
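My current guess is that each postback needs the __VIEWSTATE and __CSRFTOKEN issued with the page it paginates from, not the ones from page 1, and that reusing page 1's tokens for every page is why nothing past the first page comes back. Below is a minimal, untested sketch of the flow I have in mind, using FormRequest.from_response (which copies the form's hidden fields from the current response automatically) and walking the pages one at a time; the spider name is a throwaway and the page-count regex is taken from my code above:

import re
import scrapy

class PagerSketchSpider(scrapy.Spider):
    name = 'pager_sketch'  # throwaway name for this sketch
    start_urls = [
        'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'
    ]

    def parse(self, response):
        # ... extract the InfoDetail links of the current page here ...
        page = response.meta.get('page', 1)
        # Pager links are titled "转到第N页" ("go to page N")
        nums = re.findall(r'title="转到第(\d+)页"', response.text)
        last_page = max(map(int, nums)) if nums else 1
        if page < last_page:
            # from_response re-submits the page's form, carrying over the
            # fresh __VIEWSTATE / __CSRFTOKEN hidden fields for us
            yield scrapy.FormRequest.from_response(
                response,
                formdata={
                    '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                    '__EVENTARGUMENT': str(page + 1),
                },
                dont_click=True,  # don't simulate a submit button; __EVENTTARGET picks the control
                callback=self.parse,
                meta={'page': page + 1},
                dont_filter=True,  # the URL repeats, so bypass the dupe filter
            )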
Can anyone help me with this?