How can I use the Newspaper library for websites that need authentication? I'm using the newspaper3k library to download the HTML of several articles from different news sites, which is working fine so far. However, because I need the full content of each article, I have to authenticate (username and password) before requesting the HTML. I would appreciate any pointers in the right direction!
I assume the authentication has to happen before I call newspaper.build()?
(I should mention that this is the first time I am coding in Python, or coding anything at all, so any help would be great.)
import newspaper #import newspaper library
from newspaper import news_pool
# Build one newspaper source per front page, in a fixed order.
# Each entry is (front-page URL, language code passed to newspaper.build).
allpapers = [
    newspaper.build(url, language=language, memoize_articles=True)
    for url, language in (
        ('https://www.theguardian.com/uk-news/all', 'en'),
        ('https://www.telegraph.co.uk/news/uk/', 'en'),
        ('https://www.svd.se/sverige', 'sv'),
        ('https://www.dn.se/nyheter/sverige/', 'sv'),
    )
]
# Keep the individual sources reachable under their own names as well.
guardian, telegraph, dagbladet, dagensnyheter = allpapers
# Download every article of every source and save the raw HTML of those
# published today.
#
# Names used here but defined outside this snippet: `today`, `naming`,
# `count_writes`, and the `os` / `time` modules.
# NOTE(review): `today` is compared against the first 10 characters of
# str(publish_date), so it is presumably a 'YYYY-MM-DD' string -- confirm.
# NOTE(review): `naming` is not updated inside this loop, so every source
# is written under the same name unless it is set elsewhere -- confirm.
for papers in allpapers:
    # Output directory for this source: /Users/articles/<today>/<naming>
    newpathpaper = r'/Users/articles/' + today + "/" + naming
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(newpathpaper, exist_ok=True)

    # parsing, downloading and creating files for articles
    for pointer, papers_article in enumerate(papers.articles):
        papers_article.download()

        # download_state 2 is newspaper3k's "success" state; anything else
        # means the HTML is not available, so report it and move on.
        if papers_article.download_state != 2:
            print("article %s" % pointer)
            print(papers_article.url)
            print("Has not downloaded!")
            continue

        # Pause after each successful download (kept from the original).
        time.sleep(2)
        papers_article.parse()
        print(papers_article.url)

        # Reduce the extracted publish date to its first 10 characters so it
        # is comparable with `today`; a missing date becomes 'None' and
        # therefore never matches.
        published = str(papers_article.publish_date)[0:10]
        if published != today:
            print("not from today")
            continue

        # Write into the directory created above. The original opened the
        # relative path 'articles/<today>/<naming>/...', which only pointed
        # at the same place when the working directory was /Users.
        html_path = os.path.join(
            newpathpaper, '%s_article_%s.html' % (naming, pointer)
        )
        # `with` guarantees the file is closed; an explicit encoding keeps
        # non-ASCII (e.g. Swedish) pages from failing on platforms whose
        # default encoding is not UTF-8.
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(papers_article.html)
        print("written successfully")
        count_writes += 1