I need to scrape all headlines on the topic of autism from Le Monde newspaper's archive (going back to 1980). I'm not a programmer but a humanitarian who is trying to become "digital"...
I managed to get a list of all (daily) issues and, separately, parsing one URL at a time with BeautifulSoup and extracting the headlines works as well. But the two together don't. I feel my problem is in the parsing+iteration step, but I am not able to solve it.
from bs4 import BeautifulSoup
import requests
import re
from datetime import date, timedelta
start = date(2018, 1, 1)
end = date.today()
all_url = []
# Build one archive-page URL per day in [start, end).
day = timedelta(days=1)
one_url = "https://www.lemonde.fr/archives-du-monde/"
mydate = start
while mydate < end:
    # Le Monde's daily archive pages use dash-separated dd-mm-yyyy path
    # segments (e.g. .../archives-du-monde/01-01-2018/); the original
    # slash-separated dd/mm/yyyy never reaches an archive page.
    url = one_url + "{d.day:02}-{d.month:02}-{d.year}/".format(d=mydate)
    # Dedupe on the actual candidate URL. The original tested the bare
    # base URL (`one_url`), which is never appended, so that check was
    # always true and deduped nothing.
    if url not in all_url:
        all_url.append(url)
    # Advance AFTER appending so the start date itself is included
    # (the original incremented first and silently skipped day one).
    mydate += day
#this function is working as well when applied with one single url
def titles(all_url):
    """Fetch each archive page and print every <h3> headline mentioning autism.

    all_url: iterable of archive-page URLs to scan.
    Prints each matching headline's text; returns None.
    """
    # Compile the pattern once, outside the loop — the original
    # recompiled it for every URL fetched.
    regexp = re.compile(r'^.*\b(autisme|Autisme)\b.*$')
    for url in all_url:
        # Without a timeout, one unresponsive server hangs the whole
        # run forever — a likely reason the script appeared "stuck".
        page = BeautifulSoup(requests.get(url, timeout=30).text, "lxml")
        for headline in page.find_all("h3"):
            h = headline.text
            for m in regexp.finditer(h):
                print(m.group())
if __name__ == "__main__":
    # NOTE(review): this scans thousands of daily pages sequentially, so
    # a long silent stretch is expected — output appears only when a
    # headline actually matches.
    titles(all_url)
This script just seems to hang with no output...