I am trying to download all the PDF files of scanned school books from a website. I tried using wget, but it doesn't work; I suspect this is because the website is an ASP page with selection options for choosing the course/year. I also tried selecting a particular year/course and saving the HTML page locally, but that doesn't work either.
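For reference, a plain recursive wget along these lines (roughly what I tried; the exact flags may have differed) comes back without any PDFs, presumably because the links are only rendered after a course/year is selected:

wget -r -np -A pdf https://www.svpo.nl/curriculum.asp

Here is my Python attempt: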
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
from urllib.parse import urlparse
import wget

def get_pdfs(my_url):
    links = []
    html = urlopen(my_url).read()
    html_page = bs(html, features="lxml")
    og_url = html_page.find("meta", property="og:url")
    base = urlparse(my_url)
    print("base", base)
    for link in html_page.find_all('a'):
        current_link = link.get('href')
        # some <a> tags have no href at all, so guard against None
        if current_link and current_link.endswith('pdf'):
            if og_url:
                print("currentLink", current_link)
                links.append(og_url["content"] + current_link)
            else:
                # relative link: prepend the scheme and host of the page itself
                links.append(base.scheme + "://" + base.netloc + current_link)
    for link in links:
        try:
            wget.download(link)
        except Exception:
            print(" \n \n Unable to Download A File \n")
my_url = 'https://www.svpo.nl/curriculum.asp'
get_pdfs(my_url)

my_url_local_html = r'C:\test\en_2.html'  # year 2 English books page, saved locally to extract the pdf links
get_pdfs(my_url_local_html)
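I suspect the local call fails before any parsing happens, because urlopen expects a URL rather than a bare Windows path. A minimal sketch of two ways to read the saved page instead (assuming the file really is at C:\test\en_2.html):

from pathlib import Path
from urllib.request import urlopen

# Option 1: read the saved page straight from disk
html = Path(r'C:\test\en_2.html').read_text(encoding='utf-8')

# Option 2: convert the path to a file:// URL, which urlopen does accept
file_url = Path(r'C:\test\en_2.html').as_uri()  # 'file:///C:/test/en_2.html'
html = urlopen(file_url).read()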
Snippet of my_url_local_html with the links to the PDFs:
<li><a target="_blank" href="https://www.ib3.nl/curriculum/engels\010 TB 2 Ch 5.pdf">Chapter 5 - Going extreme</a></li>
<li><a target="_blank" href="https://www.ib3.nl/curriculum/engels\020 TB 2 Ch 6.pdf">Chapter 6 - A matter of taste</a></li>