
I'm crawling through some directory listings on an ASP.NET site via Scrapy.

The pagination links are encoded like this:

javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')

where X is an integer between 1 and 180. The problem is that the URL stays the same when I click the next page (or any other page). The code below only manages to extract the links on the first page.

# -*- coding: utf-8 -*-
import re

import scrapy
from bs4 import BeautifulSoup


class nnggzySpider(scrapy.Spider):

    name = 'nnggzygov'
    start_urls = [
        'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'
    ]

    base_url = 'https://www.nnggzy.org.cn'

    custom_settings = {
        'LOG_LEVEL': 'ERROR'
    }

    def parse(self, response):
        _response = response.text
        soup = BeautifulSoup(response.body, 'html.parser')
        # Detail links on the current page
        tags = soup.find_all('a', href=re.compile(r"InfoDetail"))

        # Extract the pagination parameters
        __VIEWSTATE = re.findall(r'id="__VIEWSTATE" value="(.*?)" />', _response)[0]
        __EVENTTARGET = 'MoreInfoListZbgs1$Pager'
        __CSRFTOKEN = re.findall(r'id="__CSRFTOKEN" value="(.*?)" />', _response)[0]
        # The pager links are titled 转到第N页 ("go to page N");
        # the last match is the highest page number
        page_num = re.findall(r'title="转到第(.*?)页"', _response)
        max_page = page_num[-1]

        content = {
            '__VIEWSTATE': __VIEWSTATE,
            '__EVENTTARGET': __EVENTTARGET,
            '__CSRFTOKEN': __CSRFTOKEN,
            'page_num': max_page
        }
        infoid = re.findall(r'InfoID=(.*?)&CategoryNum', _response)
        print(infoid)
        # Same URL as the page we are already on, so skip the duplicate filter
        yield scrapy.Request(url=response.url, callback=self.parse_detail,
                             meta={"data": content}, dont_filter=True)

    def parse_detail(self, response):
        max_page = response.meta['data']['page_num']
        for i in range(2, int(max_page) + 1):
            data = {
                '__CSRFTOKEN': response.meta['data']['__CSRFTOKEN'],
                '__VIEWSTATE': response.meta['data']['__VIEWSTATE'],
                '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                '__EVENTARGUMENT': str(i),
                # '__VIEWSTATEENCRYPTED': '',
                # 'txtKey': ''
            }
            yield scrapy.FormRequest(url=response.url, callback=self.parse,
                                     formdata=data, method="POST", dont_filter=True)

Can anyone help me with this?

– Lance Liao

2 Answers


It looks like pagination on the mentioned website is done by sending POST requests with form data like:

{
    "__CSRFTOKEN": ...,
    "__VIEWSTATE": ...,
    "__EVENTTARGET": "MoreInfoListZbgs1$Pager",
    "__EVENTARGUMENT": page_number,
    "__VIEWSTATEENCRYPTED": "",
    "txtKey": ""
}
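
A minimal sketch of how such a request could be sent from Scrapy, assuming the hidden fields are read off the current page (the page number "2" and the parse_page callback name are just placeholders):

import scrapy

# Inside your Spider subclass:
def parse(self, response):
    formdata = {
        '__CSRFTOKEN': response.css('input[name=__CSRFTOKEN]::attr(value)').get(''),
        '__VIEWSTATE': response.css('input[name=__VIEWSTATE]::attr(value)').get(''),
        '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
        '__EVENTARGUMENT': '2',  # target page number, as a string
        '__VIEWSTATEENCRYPTED': '',
        'txtKey': '',
    }
    # The URL never changes between pages, so bypass the dupe filter
    yield scrapy.FormRequest(url=response.url, formdata=formdata,
                             callback=self.parse_page, dont_filter=True)
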
– Michael Savchenko
  • Thanks for your findings. I've edited my code to send POST requests with that formdata, but it's still not working. – Lance Liao Mar 11 '21 at 23:55

I know this is a year-old thread, but I'm posting an answer for future visitors from Google search.

Your form submission didn't work because there are likely more hidden fields near the bottom of the web page, inside the form. In my case there were, and here's the working submission:

# This is the next page link
# <a id="nextId" href="javascript:__doPostBack('MoreInfoListZbgs1$Pager','')"> Next </a>

# This is how the website evaluates the next-page link
# <script type="text/javascript">
# //<![CDATA[
# var theForm = document.forms['Form1'];
# if (!theForm) {
#     theForm = document.Form1;
# }
# function __doPostBack(eventTarget, eventArgument) {
#     if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
#         theForm.__EVENTTARGET.value = eventTarget;
#         theForm.__EVENTARGUMENT.value = eventArgument;
#         theForm.submit();
#     }
# }
# //]]>
# </script>

# According to the JS code above, we need to pass in the following arguments:
data = {
    # First argument of __doPostBack(...) from the next-page link
    '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
    # Second argument of __doPostBack(...); in my case it is empty
    '__EVENTARGUMENT': '',
    '__VIEWSTATE': response.css('input[name=__VIEWSTATE]::attr(value)').get(),

    # These are the extra hidden input fields you need to pass in
    '__VIEWSTATEGENERATOR': response.css('input[name=__VIEWSTATEGENERATOR]::attr(value)').get(),
    '__EVENTVALIDATION': response.css('input[name=__EVENTVALIDATION]::attr(value)').get(),
}

yield scrapy.FormRequest(url=form_action_url_here, formdata=data, callback=self.parse)
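
Note that form_action_url_here is a placeholder. Assuming the form is named Form1, as in the script above, one way to resolve it is to read the form's action attribute and join it against the current page URL:

# Resolve the form's action attribute to an absolute URL,
# falling back to the current page URL if it's missing
form_action = response.css('form[name=Form1]::attr(action)').get()
form_action_url_here = response.urljoin(form_action) if form_action else response.url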
– Myo Win