
I'm trying to scrape the public contact info of every person listed on each page of the website. I built three functions: one to build the URL for a given page, one to fetch that page and parse its source with BeautifulSoup, and one to transform the parsed HTML to get the name, title, email, personal website and bio. For some reason I'm only getting back the first element of each page: the loop does cover the total number of pages, but it only scrapes the first person on each one.

Here's how I wrote that part of the code:

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()  # Chrome options passed to webdriver.Chrome() below

def paginas(pages):
    # Build the URL for a given results page
    pag = f'https://www.hsph.harvard.edu/profiles/page/{pages}/'
    return pag

def extract(pag):
    # Download the page and parse it with BeautifulSoup
    url = requests.get(pag).text
    soup = BeautifulSoup(url, 'lxml')
    return soup

def transform(soup):
    #principal = soup.find('div', class_='hsph-bootstrap')
    items = soup.find_all('div', class_='grid-card grid-card-hover position-relative border rounded px-4 py-5')
    for item in items:
        try:
            #name = item.find('a').text.strip()  # another way of getting the name on this website
            name = item.find('h2', class_='h3 mb-0').text.strip()  # always present
        except:
            name = 'not given'
        # Contact data
        website = item.find('h2', class_='h3 mb-0').a['href']
        main = item.find('div', class_='grid-card-content')
        bio = main.find('div', class_='faculty-bio small').text.strip()
        university = 'Harvard School of Public Health'
        # INSIDE THE LINK
        wd = webdriver.Chrome(options=options)  # open the profile page in Chrome via Selenium
        wd.get(website)
        insideurl = requests.get(website).text  # the HTML below is fetched with requests
        insidesoup = BeautifulSoup(insideurl, 'lxml')
        # BIO DATA
        insitem = insidesoup.find('div', class_='row rounded bg-white p-5')
        try:
            email = insitem.find('p', class_='faculty-contact mb-2').text.strip()
        except:
            email = ''
        try:
            ti = insitem.find('div', class_='faculty-bio')
            title = ti.find('p').text
        except:
            ti = ''
            title = ''
        # EXTRA DATA ON BIO
        try:
            bio2 = insidesoup.find('div', class_='faculty-profile-container container mb-5')
            complete = bio2.find('div', class_='faculty-profile-overview-section').text.strip()
        except:
            bio2 = ''
            complete = ''

        contact = {
            'name': name,
            'title': title,
            'university': university,
            'email': email,
            'website': website,
            'bio': complete,
            'area': bio,
        }
        leadlist.append(contact)
        return

leadlist = []


for pages in range(1, 127):
    c = paginas(pages)
    b = extract(c)
    d = transform(b) 

print(len(leadlist))
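
To double-check that the card selector itself matches every person on a page (and not just the first one), I can count the matches on a single page with a small helper like this. This is just a throwaway sketch: count_cards is a name I made up, and it reuses the same URL pattern, 'lxml' parser and grid-card class as the code above.

import requests
from bs4 import BeautifulSoup

def count_cards(page):
    # Fetch one results page and count the profile cards BeautifulSoup finds on it
    html = requests.get(f'https://www.hsph.harvard.edu/profiles/page/{page}/').text
    soup = BeautifulSoup(html, 'lxml')
    cards = soup.find_all('div', class_='grid-card grid-card-hover position-relative border rounded px-4 py-5')
    return len(cards)

print(count_cards(1))  # if this prints more than 1, the selector is matching every card on the page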