
I am looking to scrape a list of URLs -- I want to visit each one and then return the IMG link contained within each HREF on the page (in essence, follow each player link and return the image address of the headshot on that player's profile page).

I have a successful script for one set of URLs below - this is what I'm trying to achieve:

import requests
from bs4 import BeautifulSoup

import gspread
# Authenticate with a service account and open the target spreadsheet
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)
# AddValue = ["Test", 25, "Test2"]
# worksheet.insert_row(AddValue, 3)


def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")

        div_profile_box = soup_player.find('div', {'class': 'profile-box'})

        img_tag = div_profile_box.find('img')
        image_url = img_tag['src']

        row = {"Name": name, "URL": player_url, "Image URL": image_url}

        data.append(row)

    return data


urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
    'https://basketball.realgm.com/dleague/players/2019',
    'https://basketball.realgm.com/dleague/players/2018',
]


res = []
for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    res = [*res, *data]

if res != []:
    header = list(res[0].keys())
    values = [
        header, *[[e[k] if e.get(k) else "" for k in header] for e in res]]
    worksheet.append_rows(values, value_input_option="USER_ENTERED")

This returns an output of: Player Name, Player URL, Player Headshot:

(screenshot of the correct output)

I tweaked the code for a different set of URLs, but it's not returning any information. No errors are raised; nothing seems to happen:

import requests
from bs4 import BeautifulSoup

import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)


def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")

        div_profile_box = soup_player.find('div', {'class': 'profile-box'})

        img_tags = div_profile_box.find_all('img')
        for i, img_tag in enumerate(img_tags):
            image_url = img_tag['src']
            row = {"Name": name, "URL": player_url,
                   f"Image URL {i}": image_url}
            data.append(row)

    return data


urls = [
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2",
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3"
]

for url in urls:
    data = get_links(url)
    for row in data:
        worksheet.insert_row(list(row.values()))

I also tried a version that prints `soup_player` for debugging, but I'm still not getting any results:

import requests
from bs4 import BeautifulSoup

import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1TD4YmhfAsnSL_Fwo1lckEbnUVBQB6VyKC05ieJ7PKCw')
worksheet = sh.get_worksheet(0)


def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        print(f"soup_player for {name}: {soup_player}")

        div_profile_box = soup_player.find('div', {'class': 'profile-box'})

        img_tags = div_profile_box.find_all('img')
        for i, img_tag in enumerate(img_tags):
            image_url = img_tag['src']
            row = {"Name": name, "URL": player_url, f"Image URL {i}": image_url}
            data.append(row)

    return data


urls = [
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc",
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2",
    "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3"
]

for url in urls:
    data = get_links(url)
    for row in data:
        worksheet.insert_row(list(row.values()))

Any advice as to what I may be doing wrong here? Thank you in advance!

Anthony Madle
  • Have you done debug prints of `soup_player` to see whether you're actually reading anything? – Tim Roberts Feb 17 '23 at 04:38
  • @TimRoberts thanks Tim, I added what I had tried, but I wasn't getting any results for that - I'm not reading anything, but I don't understand why, based on the site – Anthony Madle Feb 17 '23 at 04:47
  • "I'm not reading anything" -- Where? Have you printed `req.url.content` to make sure you can read those URLs at all? – Tim Roberts Feb 17 '23 at 05:10

1 Answer


First of all, always take a look at your soup to check whether all the expected ingredients are in place.


The main issue is that your selection of the initial table cells does not find any elements on these pages, so the ResultSet is empty and the loop body never runs -- which is why you see no output and no errors.

Change:

for td in soup.find_all('td', {'data-th': 'Player'}):

to:

for td in soup.select('td:has(>a[href^="/player"])'):
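
For context, a minimal sketch of get_links with that one change applied (everything else carried over from the question; the :has() selector relies on the soupsieve package, which recent BeautifulSoup installs use for select()):

import requests
from bs4 import BeautifulSoup


def get_links(url):
    data = []
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    # Match the player cell by its child profile link instead of the
    # data-th attribute, which the international stats tables don't have.
    for td in soup.select('td:has(>a[href^="/player"])'):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")

        div_profile_box = soup_player.find('div', {'class': 'profile-box'})
        img_tag = div_profile_box.find('img') if div_profile_box else None
        image_url = img_tag['src'] if img_tag else ""

        data.append({"Name": name, "URL": player_url, "Image URL": image_url})

    return data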
HedgeHog
  • I ran into a separate issue, but going to try to diagnose first - seems to be looping through the same URL – Anthony Madle Feb 17 '23 at 15:15
  • Definitely - this is due to the loop that is supposed to extract the different images, which is why the URL of the profile appears several times (see the sketch after these comments). – HedgeHog Feb 17 '23 at 15:22
  • thank you very much - it takes me a little bit to figure these things out, so going to spend some time on it. hoping i can solve this one myself! – Anthony Madle Feb 17 '23 at 15:25
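
To illustrate HedgeHog's point about the repeating profile URL: the inner for loop appends one row per image, so a player with several images produces several rows. A sketch of one way to keep a single row per player, using a hypothetical helper extract_player_row (not from the original code), with each image in its own column:

def extract_player_row(name, player_url, div_profile_box):
    # Build one row per player; each image found in the profile box gets
    # its own "Image URL N" column instead of its own row.
    row = {"Name": name, "URL": player_url}
    for i, img_tag in enumerate(div_profile_box.find_all('img')):
        row[f"Image URL {i}"] = img_tag.get('src', "")
    return row

Inside the td loop you would then call data.append(extract_player_row(name, player_url, div_profile_box)) exactly once per player, so the profile URL no longer repeats across rows.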