
Instead of the output containing 10 links from each page, it is only returning the ten links from the last page. In other words, if this were working, the total number of links would be 200.

from goose3 import Goose
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import re

for x in range(1,20):
    numb = str(x)
    req = Request("https://search.crossref.org/?q=north+korea&page=" + numb)
    html_page = urlopen(req)    


soup = BeautifulSoup(html_page, 'lxml')
print(soup)
links = []
 
for link in soup.findAll('a', attrs={'href': re.compile("^https://doi")}):
    links.append(link.get('href'))
 
print(links)

for ft in links:
    try:
        url = ft
        g = Goose()
        article = g.extract(url=url)
        m = article.cleaned_text
        print(m)
        print("⸻⸻⸻⸻⸻⸻⸻⸻")
    except requests.exceptions.ConnectionError as e:
        pass

Output: ['https://doi.org/10.1057/9781137347633.0021', 'https://doi.org/10.4135/9781412939607.n388', 'https://doi.org/10.4135/9781412953924.n601', 'https://doi.org/10.4324/9780203164365', 'https://doi.org/10.1787/eco_surveys-kor-2018-4-en', 'https://doi.org/10.21236/ada523754', 'https://doi.org/10.21236/ada441640', 'https://doi.org/10.21236/ada441540', 'https://doi.org/10.21236/ada560116', 'https://doi.org/10.1787/888932592489']


1 Answer

You are overwriting html_page on every pass through the loop, so only the last page's response ever reaches BeautifulSoup; the parsing and link collection need to happen inside the page loop. Using a requests.Session with a params dict keeps the pagination tidy:

import requests
from bs4 import BeautifulSoup

params = {
    'q': 'north korea'
}


def main(url):
    with requests.Session() as req:
        allin = []
        for page in range(1, 21):  # pages 1-20 of the search results
            print(f"Extracting Page# {page}")
            params['page'] = page
            r = req.get(url, params=params)
            soup = BeautifulSoup(r.content, 'html.parser')
            # pull the DOI link out of every result item on this page
            target = [x.a['href'] for x in soup.select("div.item-links")]
            allin.extend(target)  # collect links across all pages, not just the last one
        print(allin)


main("https://search.crossref.org/")