I am trying to scrape coronavirus-related articles from a news website, but I get an HTTPError. The same error also shows up for other news portals, although the code works fine for a different website. I have asked a different question with similar code in this post. Some previous answers to similar problems suggest changing the user-agent, but it still does not work after inserting headers = {'User-Agent': 'Mozilla/5.0'} along with the URL, probably because I am not applying the header correctly. Any help would be much appreciated.
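For reference, this is roughly how I tried to apply the header, following those answers; the exact usage (wrapping the URL in a Request object before calling urlopen) is my own guess, so it may well be wrong:

import urllib.request
from bs4 import BeautifulSoup

# Sketch of my attempt (usage may be wrong): attach a User-Agent header
# to the request before opening the URL.
page_url = 'https://thehimalayantimes.com/nepal?page=1'
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib.request.Request(page_url, headers=headers)
page_soup = BeautifulSoup(urllib.request.urlopen(req).read(), 'html.parser')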
Here is the code I have used:
import csv
import os
import urllib.request

from bs4 import BeautifulSoup
from newspaper import Article

req_keywords = ['coronavirus', 'covid-19']
# No trailing slash, so URL pieces can be joined with an explicit '/'.
newspaper_base_url = 'https://thehimalayantimes.com'
category = 'nepal'

def checkif_kw_exist(list_one, list_two):
    # Return whether the two lists share any keywords, plus the overlap itself.
    common_kw = set(list_one) & set(list_two)
    if len(common_kw) == 0:
        return False, common_kw
    else:
        return True, common_kw

def get_article_info(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else:
            return False
    except Exception:
        return False

output_file = "J:/B/output_nepal.csv"
if not os.path.exists(output_file):
    open(output_file, 'w').close()

for index in range(1, 3700):
    page_url = newspaper_base_url + '/' + category + '?page=' + str(index)
    page_soup = BeautifulSoup(urllib.request.urlopen(page_url).read(), 'html.parser')
    primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})
    for tag in primary_tag:
        url = tag.find("a")
        url = newspaper_base_url + url.get('href')
        result = get_article_info(url)
        if result is not False:
            # The with-statement closes the file; no explicit close is needed.
            with open(output_file, 'a', encoding='utf-8') as f:
                writeFile = csv.writer(f)
                writeFile.writerow(result)
Here is the error I get:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-34-c9c043bb59fb> in <module>
69 page_url = newspaper_base_url + '/' + category + '?page='+str(index)
70
---> 71 page_soup = BeautifulSoup( urllib.request.urlopen(page_url).read())
72
73 primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
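For what it is worth, the error can be reproduced with just the urlopen call on the listing page, without the rest of the script:

import urllib.request

# Minimal reproduction: a plain urlopen on the listing page is enough
# to trigger the 403, even outside the scraping loop.
page_url = 'https://thehimalayantimes.com/nepal?page=1'
urllib.request.urlopen(page_url)  # raises HTTPError: HTTP Error 403: Forbidden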