I want to scrape the link titles from Google search results for just 20 pages or so. I tried this code just one day ago and it was working! But today, it gives me a 503 error.
I searched for the ways to solve this problem. Following is what I have tried.
- delaying between requests (by inserting 'time.sleep(60)' right after line 25).
- the 'fake-useragent' library.
But I am still getting the 503 error. This is the file.
import requests
from bs4 import BeautifulSoup
from collections import Counter
#google, '소프트웨어 교육'
base_google1_url = "https://www.google.co.kr/search?q=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&safe=active&ei=rv_RWYyaKcmW0gTqsa_IDg&start="
extra_google1_url="&sa=N&biw=958&bih=954"
#google, 'sw교육'
base_google2_url="https://www.google.co.kr/search?q=sw%EA%B5%90%EC%9C%A1&safe=active&ei=kLzUWYONLYa30QS4r5KACA&start="
extra_google2_url="&sa=N&biw=887&bih=950"
#book.naver, '소프트웨어 교육'
base_naver_url = "http://book.naver.com/search/search_in.nhn?query=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&&pattern=0&orderType=rel.desc&viewType=list&searchType=bookSearch&serviceSm=service.basic&title=&author=&publisher=&isbn=&toc=&subject=&publishStartDay=&publishEndDay=&categoryId=&qdt=1&filterType=0&filterValue=&serviceIc=service.author&buyAllow=0&ebook=0&page="
#from: https://docs.python.org/2/library/collections.html
cnt = Counter()
#bring search info
def get_html (site_name, content_num):
_html = ""
if site_name == 'google1':
google1_url = base_google1_url + str(content_num) + extra_google1_url
resp = requests.get(google1_url)
elif site_name == 'google2':
google2_url = base_google2_url + str(content_num) + extra_google2_url
resp = requests.get(google2_url)
elif site_name == 'naver':
naver_url = base_naver_url + str(content_num)
resp = requests.get(naver_url)
if resp.status_code == 200:
_html = resp.text
return _html
def word_count (name):
for content in name.contents:
words = content.split()
for word in words:
cnt[word] += 1
counting = cnt
return counting
def main():
cnt.clear()
counting = cnt
page_num = 0
#bring google '소프트웨어 교육' search info~~
while page_num < 20:
content_num = page_num*10
html = get_html("google1", content_num)
soup = BeautifulSoup(html, 'html.parser')
texts = soup.find_all('h3')
invalid_tag = ['b']
for text in texts:
for match in text.find_all(invalid_tag):
match.replaceWithChildren()
names = text.find_all('a')
for name in names:
counting = word_count(name)
page_num += 1
page_num = 0
#bring google 'sw교육' search info~~
while page_num < 20:
content_num = page_num*10
html = get_html("google2", content_num)
soup = BeautifulSoup(html, 'html.parser')
texts = soup.find_all('h3')
invalid_tag = ['b', 'a']
for text in texts:
for match in text.find_all(invalid_tag):
match.replaceWithChildren()
counting = word_count(text)
print(text)
page_num += 1
#bring naver book search info~~
page_num = 1
while page_num < 40:
html = get_html("naver", page_num)
soup = BeautifulSoup(html, 'html.parser')
texts = soup.find_all("dt")
invalid_tag = ['a','strong', 'span', 'img']
for text in texts:
for match in text.find_all(invalid_tag):
match.replaceWithChildren()
counting = word_count(text)
page_num += 1
#deleting useless keywords: if need to include len(k) == 1, instead of 'len(k) == 1 and ~ ' use following code --'or (len(k) == 1 and ord(k) >=33 and ord(k)<65)'
#https://stackoverflow.com/questions/8448202/remove-more-than-one-key-from-python-dict
del counting['소프트웨어'], counting['교육']
for key in [k for k in counting if len(k) == 1 or type(k) == int]: del counting[key]
count_20 = counting.most_common(20)
print(count_20)
if __name__ == '__main__':
main()
Please help me! Thank you in advance.