-2

I have debug this code, but still have some wrong, and i dont't know how to deal with this problem.

I already searched similar problem, but still have some question.

import requests
from bs4 import BeautifulSoup
import time
import json
import os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


url = 'http://www.zhihu.com'
loginURL = 'http://www.zhihu.com/login/email'

headers = {
"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0)    Gecko/20100101 Firefox/41.0',
"Referer": "http://www.zhihu.com/",
'Host': 'www.zhihu.com',
}

data = {
'email': 'xxxxxxx1@gmail.com',
'password': 'xxxxxxx',
'rememberme': "true",
}

s = requests.session()

if os.path.exists('cookiefile'):
with open('cookiefile') as f:
    cookie = json.load(f)
s.cookies.update(cookie)
req1 = s.get(url, headers=headers)
with open('zhihu.html', 'w') as f:
    f.write(req1.content)

else:
req = s.get(url, headers=headers)
print req

soup = BeautifulSoup(req.text, "html.parser")
xsrf = soup.find('input', {'name': '_xsrf', 'type': 'hidden'}).get('value')

data['_xsrf'] = xsrf

timestamp = int(time.time() * 1000)
captchaURL = 'http://www.zhihu.com/captcha.gif?=' + str(timestamp)
print captchaURL

with open('zhihucaptcha.gif', 'wb') as f:
    captchaREQ = s.get(captchaURL)
    f.write(captchaREQ.content)
loginCaptcha = raw_input('input captcha:\n').strip()
data['captcha'] = loginCaptcha
print data
loginREQ = s.post(loginURL,  headers=headers, data=data)
print loginREQ.url
print s.cookies.get_dict()
with open('cookiefile', 'wb') as f:
    json.dump(s.cookies.get_dict(), f)

# http://www.zhihu.com/question/27621722/answer/48820436.
zanBaseURL = 'http://www.zhihu.com/answer/14926794/voters_profile?&offset={0}'
page = 0
count = 0
while 1:
zanURL = zanBaseURL.format(str(page))
page += 10
zanREQ = s.get(zanURL, headers=headers)
zanData = zanREQ.json()['payload']
if not zanData:
    break
for item in zanData:
    zansoup = BeautifulSoup(item, "html.parser").find(
        'a', {'target': "_blank", 'class': 'zg-link'})
    print 'nickname:', zansoup.get('title'),  '    ',
    print 'person_url:', zansoup.get('href')
    count += 1
print count

And error as follow:

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "D:\anzhuang\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 682, in runfile
    execfile(filename, namespace)
  File "D:\anzhuang\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 71, in execfile
    exec(compile(scripttext, filename, 'exec'), glob, loc)
  File "D:/python/crawl_zhihu.py", line 78, in <module>
    print 'nickname:', zansoup.get('title'),  '    ',
AttributeError: 'NoneType' object has no attribute 'get'
Andy
  • 49,085
  • 60
  • 166
  • 233
Joe.Herylee
  • 115
  • 1
  • 3
  • 13
  • 2
    Well, the error is pretty unambiguous: a `None` type doesn't have any attributes. So, it would appear that your object `zansoup` is the source of the error. Confirm that it has been correctly assigned? – David Zemens Oct 16 '15 at 14:55

1 Answers1

0

From the dox:

If find() can’t find anything, it returns None

This would raise the error expectedly, because a None type doesn't have any attributes.

So, you can handle the possible None type:

if zansoup:
    print 'nickname:', zansoup.get('title'),  '    ',
    print 'person_url:', zansoup.get('href')
else
    print 'there was an error ...'
David Zemens
  • 53,033
  • 11
  • 81
  • 130