
I am trying to scrape coronavirus-related articles from a news website, but I get an HTTPError. The same error also shows up for other news portals, although the code works for a different website. I have asked a different question with similar code in this post. Some previous answers to similar problems suggest changing the user-agent, but it still does not work after inserting `headers = {'User-Agent': 'Mozilla/5.0'}` along with the URL, probably because I did not use it properly. Any help would be much appreciated.
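
For reference, this is how I understand the header is normally attached with urllib, by wrapping the URL in a Request object (my own attempt may have placed it differently):

import urllib.request

# Build a Request carrying the User-Agent header, then open the
# Request object instead of the bare URL string.
req = urllib.request.Request(
    'https://thehimalayantimes.com/nepal?page=1',
    headers={'User-Agent': 'Mozilla/5.0'}
)
html = urllib.request.urlopen(req).read()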

Here is the code I have used:

import urllib.request
import csv, os
from newspaper import Article
from bs4 import BeautifulSoup

req_keywords = ['coronavirus', 'covid-19']

newspaper_base_url = 'https://thehimalayantimes.com/'
category = 'nepal'

def checkif_kw_exist(list_one, list_two):
    # Return whether the two word lists share any keywords, plus the shared set.
    common_kw = set(list_one) & set(list_two)
    if len(common_kw) == 0: return False, common_kw
    else: return True, common_kw

def get_article_info(url):
    # Download and parse one article; return its details if it mentions
    # any of the required keywords, otherwise False.
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        success, checked_kws = checkif_kw_exist(req_keywords, a.text.split())
        if success:
            return [url, a.publish_date, a.title, a.text]
        else: return False
    except Exception:
        return False

output_file = "J:/B/output_nepal.csv"
if not os.path.exists(output_file):
    open(output_file, 'w').close() 

for index in range(1,3700,1):
    page_url = newspaper_base_url + '/' + category + '?page='+str(index)

    page_soup = BeautifulSoup( urllib.request.urlopen(page_url).read())

    primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})

    for tag in primary_tag:

        url = tag.find("a")
        url = newspaper_base_url + url.get('href')
        result = get_article_info(url)
        if result is not False:
            # the 'with' block closes the file automatically
            with open(output_file, 'a', encoding='utf-8') as f:
                writeFile = csv.writer(f)
                writeFile.writerow(result)

Here is the error I get:

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-34-c9c043bb59fb> in <module>
     69     page_url = newspaper_base_url + '/' + category + '?page='+str(index)
     70 
---> 71     page_soup = BeautifulSoup( urllib.request.urlopen(page_url).read())
     72 
     73     primary_tag = page_soup.find_all("h4", attrs={"class": "pad-bottom-small"})

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    530         for processor in self.process_response.get(protocol, []):
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 
    534         return response

~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
    640         if not (200 <= code < 300):
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 
    644         return response

~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
    568         if http_err:
    569             args = (dict, 'default', 'http_error_default') + orig_args
--> 570             return self._call_chain(*args)
    571 
    572 # XXX probably also want an abstract factory that knows when it makes

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    648 class HTTPDefaultErrorHandler(BaseHandler):
    649     def http_error_default(self, req, fp, code, msg, hdrs):
--> 650         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 
    652 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 403: Forbidden
  • Try adding a delay between the requests, and using a `requests.Session` object. _Some previous answers to similar problems ask to change `user-agent`, but it does not work still after inserting_ IIUC you’re supposed to be constantly changing the User-Agent. – AMC May 02 '20 at 01:38
  • @AMC Thank you for responding. Do you mean I should add `urllib.request.Session`? I am not familiar with this method. Could you clarify a bit, please? – crackers May 02 '20 at 01:54
  • @crackers Sorry, I was referring to the class in the [Requests](https://requests.readthedocs.io/en/master/) library. I almost never use urllib directly, so I’m not sure what the equivalent would be. – AMC May 02 '20 at 01:57
  • Using the `requests` library I am able to fetch https://thehimalayantimes.com/nepal?page=1 successfully, using `Mozilla/5.0` as the user-agent header. – larsks May 02 '20 at 01:58
  • Yes, now it seems like the code is running after using `Mozilla/5.0` but the output or news articles are not being downloaded to the csv file. – crackers May 02 '20 at 02:18
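
Following the comments, here is a minimal sketch of how I understand the suggested approach, using a requests.Session, the Mozilla/5.0 user agent, and a delay between requests (I have not yet confirmed that it also fixes the empty CSV):

import time
import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # sent with every request

for index in range(1, 4):  # small range just for testing
    page_url = 'https://thehimalayantimes.com/nepal?page=' + str(index)
    response = session.get(page_url)
    response.raise_for_status()  # fail loudly on a 403 instead of continuing
    page_soup = BeautifulSoup(response.text, 'html.parser')
    time.sleep(1)  # delay between requests, as suggested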

0 Answers