0

I want to scrape the data from the table that is dynamically loaded inside this website.

I have tried the requests and selenium packages in Python, but it seems I have not implemented them the right way.


driver = webdriver.Chrome()
# The URL must be a string literal; the original passed a bare identifier,
# which is a SyntaxError.
driver.get("http://live.xacte.com/lamarathon/")
# The table is populated by JavaScript after page load, so wait for it
# (WebDriverWait + expected_conditions) instead of querying immediately.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'xact_results_tabs_results'))
)

I also can't find the source of the data in the HTML file when I download it.

Please help me to look into the website and advise me on what to do to scrape this data. Thank you!

3 Answers

0

This should be enough to get you started. This just dumps all the text contents of each row as comma-separated values.

driver = webdriver.Chrome()
driver.get("http://live.xacte.com/lamarathon")
# Wait for the dynamically-loaded rows. Note: no `new` keyword (that is Java),
# and the locator is passed as a single (By, selector) tuple.
rows = WebDriverWait(driver, 10).until(
    EC.visibility_of_all_elements_located(
        (By.CSS_SELECTOR, "#xact_results_search_results tr")
    )
)
for row in rows:
    # find_elements takes the strategy and the selector as two arguments;
    # By.CSS_SELECTOR is a constant, not a callable.
    row_text = [cell.text for cell in row.find_elements(By.CSS_SELECTOR, "td")]
    print(','.join(row_text))
JeffC
  • 22,180
  • 5
  • 32
  • 55
0

To collect all the data, you will need to use the BeautifulSoup library. To begin, reduce the number of iterations by displaying 100 elements per page: click on the drop-down list, then on the option 100. Then, in a loop, iterate through all the found elements and print them, or save them — it's up to you. At each iteration, find the NEXT button and click it to load new elements. I left comments in the code to explain the process.

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

import time

url = 'http://live.xacte.com/lamarathon/'

# Build ONE Options object. The original created two ("chrome_options" and
# "options"), added the flags to the second, but passed the first (empty) one
# to the driver -- so headless/no-sandbox were silently ignored.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=chrome_options,
                          executable_path=".../chromedriver_linux64/chromedriver")  # the path to your chromedriver
driver.get(url)
wait = WebDriverWait(driver, 10)

# Switch the results table to 100 rows per page to cut down the iterations.
wait.until(EC.visibility_of_element_located((By.NAME, "xact_results_search_results_length")))
driver.find_element(By.NAME, "xact_results_search_results_length").click()  # open the page-size drop-down
for option in driver.find_elements(By.TAG_NAME, "option"):
    if option.text == '100':
        option.click()  # select the "100" option
        break
time.sleep(2)  # waiting for page loading

# Labels matching the column order of the results table.
COLUMNS = ('Bib', 'Name', 'Sex', 'Age', 'City', 'Country',
           'Net', 'Clock', 'Pace', 'Event')

for i in range(272):  # number of iterations by number of data pages
    time.sleep(1)  # waiting for page loading
    html_content = driver.page_source

    soup = BeautifulSoup(html_content, 'html.parser')

    tbody = soup.find('tbody', role='alert')  # getting the data
    trs = tbody.find_all('tr')
    for tr in trs:
        tds = tr.find_all('td')
        # zip pairs each cell with its column label; a short row (e.g. on the
        # last page) no longer risks an IndexError from hard-coded positions.
        for label, td in zip(COLUMNS, tds):
            print(f'{label} - {td.text}')
        print(f'Next------------------------------------')  # for clarity
    print(f'Iter - {i}, Number of elements per page - {len(trs)}')  # we look at the iteration number and the amount of data collected

    wait.until(EC.element_to_be_clickable((By.ID, "xact_results_search_results_next")))
    driver.find_element(By.ID, "xact_results_search_results_next").click()  # click the Next button

Output:

......
Bib - 5806
Name - Zulma Castaneda
Sex - F
Age - 32
City - Foothill Ranch, CA
Country - USA
Net - 5:36:38
Clock - 5:50:27
Pace - 12:50/mi
Event - Marathon
Next------------------------------------
Iter - 270, Number of elements per page - 100
Bib - 4226
Name - Zvi Donat
Sex - M
Age - 44
City - Natanya
Country - ISR
Net - 
Clock - 
Pace - 
Event - Marathon
Next------------------------------------
Bib - 55448
Name - Zvi Donat
Sex - M
Age - 44
City - Natanya
Country - 
Net - 
Clock - 
Pace - 
Event - 5K
Next------------------------------------
Iter - 271, Number of elements per page - 2
user510170
  • 286
  • 2
  • 5
0

Try the following to fetch all the tabular content spread across multiple pages:

from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import selenium.webdriver as webdriver


with webdriver.Chrome(service=Service(ChromeDriverManager().install())) as driver:
    driver.get("http://live.xacte.com/lamarathon")
    wait = WebDriverWait(driver, 20)
    while True:
        # Collect every populated row of the results table on the current page.
        rows = wait.until(EC.visibility_of_all_elements_located(
            (By.XPATH, "//table[@id='xact_results_search_results']//tr[.//td]")
        ))
        for row in rows:
            cells = row.find_elements(By.CSS_SELECTOR, "td")
            print([cell.text for cell in cells])

        try:
            # Advance to the next page; when the "Next" link is gone or the
            # wait times out, pagination is finished.
            next_link = wait.until(EC.visibility_of_element_located(
                (By.XPATH, "//a[@id='xact_results_search_results_next'][.='Next']")
            ))
            next_link.click()
            # The last row of the old page going stale signals the new page load.
            wait.until(EC.staleness_of(row))
        except Exception:
            break
SIM
  • 21,997
  • 5
  • 37
  • 109