I am looking to scrape a list of URLs -- I want to visit each one & then return all IMG links contained within each HREF on the page (in essence, visit each link and return the image address of the player headshot on each player profile).
I have a successful script for one set of URLs below - this is what I'm trying to achieve:
import requests
from bs4 import BeautifulSoup
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)
# AddValue = ["Test", 25, "Test2"]
# worksheet.insert_row(AddValue, 3)
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th': 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(
f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find('div', {'class': 'profile-box'})
img_tag = div_profile_box.find('img')
image_url = img_tag['src']
row = {"Name": name, "URL": player_url, "Image URL": image_url}
data.append(row)
return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
'https://basketball.realgm.com/dleague/players/2019',
'https://basketball.realgm.com/dleague/players/2018',
]
res = []
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
res = [*res, *data]
if res != []:
header = list(res[0].keys())
values = [
header, *[[e[k] if e.get(k) else "" for k in header] for e in res]]
worksheet.append_rows(values, value_input_option="USER_ENTERED")
This returns an output of: Player Name, Player URL, Player Headshot:
I tweaked the code to tweak for a different set of URLs, but it's not returning any information. No errors are showing, but nothing seems to be happening:
import requests
from bs4 import BeautifulSoup
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th': 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(
f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find('div', {'class': 'profile-box'})
img_tags = div_profile_box.find_all('img')
for i, img_tag in enumerate(img_tags):
image_url = img_tag['src']
row = {"Name": name, "URL": player_url,
f"Image URL {i}": image_url}
data.append(row)
return data
urls = [
"https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",
"https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2",
"https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3"
]
for url in urls:
data = get_links(url)
for row in data:
worksheet.insert_row(list(row.values()))
I also checked a version debugging "soup_player", but I'm still not receiving any results:
import requests
from bs4 import BeautifulSoup
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th': 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(
f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
print(f"soup_player for {name}: {soup_player}")
div_profile_box = soup_player.find('div', {'class': 'profile-box'})
img_tags = div_profile_box.find_all('img')
for i, img_tag in enumerate(img_tags):
image_url = img_tag['src']
row = {"Name": name, "URL": player_url, f"Image URL {i}": image_url}
data.append(row)
return data
urls = [ "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc", "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2", "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3"]
for url in urls:
data = get_links(url)
for row in data:
worksheet.insert_row(list(row.values()))
Any advice as to what I may be doing wrong here? Thank you in advance!