I'm trying to to scrape the public contact info of all the persons in each page of the website so I build 3 functions, one to modify the URL, one to extract the source code from it using BeautifulSoup and one to transform it and finally get the name, title, email, personal website and bio, but for some reason, I'm only getting back the first element of each page, it does covers the total amount of pages but it only scrapes the first person.
Here's how I wrote the code in that part
def paginas(pages):
pag = (f'https://www.hsph.harvard.edu/profiles/page/{pages}/')
return pag
def extract(pag):
url = requests.get(pag).text
soup = BeautifulSoup(url, 'lxml')
return soup
def transform(soup):
#principal = soup.find('div', class_ = 'hsph-bootstrap')
items = soup.find_all('div', class_ = 'grid-card grid-card-hover position-relative border rounded px-4 py-5')
for item in items:
try:
#name = item.find('a').text.strip() this is another way of getting it in this website
name = item.find('h2', class_ = 'h3 mb-0').text.strip() #siempre tendrá
except:
name = 'not given'
#Contact data.
website = item.find('h2', class_ = 'h3 mb-0').a['href']
main = item.find('div', class_ = 'grid-card-content')
bio = main.find('div', class_ = 'faculty-bio small').text.strip()
university = 'Harvard School of Public Health'
#INSIDE THE LINK
wd = webdriver.Chrome(options=options)
wd.get(website)
insideurl = requests.get(website).text
insidesoup = BeautifulSoup(insideurl, 'lxml')
#BIO DATA
insitem = insidesoup.find('div', class_ ='row rounded bg-white p-5')
try:
email = insitem.find('p', class_ = 'faculty-contact mb-2').text.strip()
except:
email = ''
try:
ti = insitem.find('div', class_ = 'faculty-bio')
title = ti.find('p').text
except:
ti = ''
title = ''
#EXTRA DATA ON BIO.
try:
bio2 = insidesoup.find('div', class_ = 'faculty-profile-container container mb-5')
complete = bio2.find('div', class_ = 'faculty-profile-overview-section').text.strip()
except:
bio2 = ''
complete = ''
contact = {
'name' : name,
'title' : title,
'university' : university,
'email' : email,
'website' : website,
'bio' : complete,
'area' : bio,
}
leadlist.append(lead)
return
leadlist =[]
for pages in range(1,127,1):
c = paginas(pages)
b = extract(c)
d = transform(b)
print(len(leadlist))