Instead of collecting 10 links from every page, the script only returns the ten links from the last page. In other words, since the loop requests pages 1 through 19, a working run should collect 190 links in total.
from goose3 import Goose
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests  # needed for the except clause below
import re

for x in range(1, 20):  # pages 1 through 19
    numb = str(x)
    req = Request("https://search.crossref.org/?q=north+korea&page=" + numb)
    html_page = urlopen(req)
    soup = BeautifulSoup(html_page, 'lxml')
    print(soup)  # debug: dump the whole page
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^https://doi")}):
        links.append(link.get('href'))

print(links)

# extract the article text behind each DOI link
for ft in links:
    try:
        url = ft
        g = Goose()
        article = g.extract(url=url)
        m = article.cleaned_text
        print(m)
        print("⸻⸻⸻⸻⸻⸻⸻⸻")
    except requests.exceptions.ConnectionError:
        pass
Output:

['https://doi.org/10.1057/9781137347633.0021', 'https://doi.org/10.4135/9781412939607.n388', 'https://doi.org/10.4135/9781412953924.n601', 'https://doi.org/10.4324/9780203164365', 'https://doi.org/10.1787/eco_surveys-kor-2018-4-en', 'https://doi.org/10.21236/ada523754', 'https://doi.org/10.21236/ada441640', 'https://doi.org/10.21236/ada441540', 'https://doi.org/10.21236/ada560116', 'https://doi.org/10.1787/888932592489']
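For reference, here is a minimal sketch of the accumulation I am expecting, using the same URL and selector as above. It only gathers the links (no Goose extraction), and the count in the final comment assumes every results page really does carry 10 DOI links:

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re

links = []  # created once, before the page loop, so every page appends here
for x in range(1, 20):  # pages 1 through 19
    req = Request("https://search.crossref.org/?q=north+korea&page=" + str(x))
    soup = BeautifulSoup(urlopen(req), 'lxml')
    for link in soup.findAll('a', attrs={'href': re.compile("^https://doi")}):
        links.append(link.get('href'))

print(len(links))  # expecting about 190 (19 pages x 10 links per page)

This is what I want the script above to produce, but with my current code the list printed at the end only ever holds the last page's links.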