-1

An error randomly occurs in my Python Selenium project, where I scrape data from websites with my Raspberry Pi. It fetches date, temperature, wind and rainfall. The script sometimes runs normally, but other times this error pops up:

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document (Session info: chrome=84.0.4147.141)

Are there any wrappers to implement to avoid such an error message? I would be happy if you could share a solution.

Full code:

from selenium import webdriver
import pandas as pd
from datetime import datetime
import time
import schedule

def job():
    """Scrape the forecast page at pent.no and write the values to an Excel file.

    Opens Chrome, clicks every date header, reads the hour labels and the
    temperature, wind and precipitation widgets, prints each value, and saves
    one column per series to ``weather<day-of-month>.xlsx``.

    NOTE(review): ``driver.quit()`` is only reached when nothing above it
    raises.
    """
    driver = webdriver.Chrome()
    driver.get("https://pent.no/60.19401,11.09936")

    # These element references are taken BEFORE the clicks below and are read
    # again further down (see the "Date" column loop).
    date = driver.find_elements_by_class_name("forecast-day-view-date-bar__date")
    i = 0

    # Click every date element. The loop variable `klikk` is unused; the
    # manual index `i` selects the element and also ends the loop.
    for klikk in date:
            date[i].click()
            i = i+1
            if i==len(date):
                break
    # NOTE: inside job() this local name shadows the imported `time` module.
    time = driver.find_elements_by_class_name("forecast-hour-view-hour-label")
        
    # Number of blank rows placed after the first date in the "Date" column.
    # The constant 193 is not explained in the code -- TODO confirm its origin.
    count = len(time)-193

    # Each widget list is split by position: even indexes feed the "... Yr"
    # columns, odd indexes feed the "... Storm" columns.
    temp = driver.find_elements_by_class_name("forecast-hour-view-weather-widget__temperature")
    temp2 = temp[::2]
    temp3 = temp[1::2]

    wind = driver.find_elements_by_class_name("forecast-hour-view-weather-widget__wind-speed")
    wind2 = wind[::2]
    wind3 = wind[1::2]

    rainfall = driver.find_elements_by_class_name("forecast-hour-view-weather-widget__precipitation")
    rainfall2 = rainfall[::2]
    rainfall3 = rainfall[1::2]

    # One accumulator list per output column (a=Date ... h=Rainfall Storm).
    a = []
    b = []
    c = []
    d = []
    e = []
    f = []
    g = []
    h = []
    k = 0

    # Date column: each date string is followed by blank cells (`count` after
    # the first date, 23 after every later one).
    # The traceback in the question points at `a.append(datoer.text)`: the
    # elements in `date` were located before the clicks, and reading `.text`
    # here is where StaleElementReferenceException is raised.
    for datoer in date:
        print("Dato:"+datoer.text)
        a.append(datoer.text)
        if k==0:
            a.extend([""]*count) 
        else:
            a.extend([""]*23)
        k = k+1
        

    df1 = pd.DataFrame(a, columns= ["Date"])
        
    # Time column: text of every hour label.
    for tider in time:
        print("Tid:"+tider.text)
        b.append(tider.text)
        
    df2 = pd.DataFrame(b, columns= ["Time"])
    # Temperature columns.
    for tempyr in temp2:
        print("Temp yr:"+tempyr.text)
        c.append(tempyr.text)
        
    df3 = pd.DataFrame(c, columns= ["Temp Yr"])

    for tempstorm in temp3:
        print("Temp storm:"+tempstorm.text)
        d.append(tempstorm.text)
        
    df4 = pd.DataFrame(d, columns= ["Temp Storm"])
    # Wind columns.
    for windyr in wind2:
        print("Vind yr:"+windyr.text)
        e.append(windyr.text)
        
    df5 = pd.DataFrame(e, columns= ["Wind Yr"])

    for windstorm in wind3:
        print("Vind storm:"+windstorm.text)
        f.append(windstorm.text)
        
    df6 = pd.DataFrame(f, columns= ["Wind Storm"])
    # Rainfall columns: the raw text is stored, an empty value is printed as
    # "0.0 mm", and the stored blanks are replaced in the DataFrame afterwards.
    for rainfallyr in rainfall2:
        g.append(rainfallyr.text)
        if rainfallyr.text == "":
            print("Rein yr:"+"0.0 mm")
        else:
            print("Rein yr:"+rainfallyr.text)
        
    df7 = pd.DataFrame(g, columns= ["Rainfall Yr"])
    # Empty or whitespace-only cells become "0.0 mm".
    df7 = df7.replace(r'^\s*$', "0.0 mm", regex=True)
      
    for rainfallstorm in rainfall3:
        h.append(rainfallstorm.text)
        if rainfallstorm.text == "":
            print("Rein storm:"+"0.0 mm")
        else:
            print("Rein storm:"+rainfallstorm.text)
        
    df8 = pd.DataFrame(h, columns= ["Rainfall Storm"])
    # Empty or whitespace-only cells become "0.0 mm".
    df8 = df8.replace(r'^\s*$', "0.0 mm", regex=True)
    # Join the eight single-column frames side by side.
    tabell = [df1, df2, df3, df4, df5, df6, df7, df8]
    result = pd.concat(tabell, axis=1)

    # The file name carries the current day of the month.
    result.to_excel("weather" + str(int(datetime.now().day)) + ".xlsx")

            
    driver.quit()
    
# Register job() to run every day at 00:00, then poll the scheduler forever,
# sleeping 60 seconds between checks.
schedule.every().day.at("00:00").do(job)
while 1:
    schedule.run_pending()
    time.sleep(60)

EDIT:

Traceback (most recent call last):
  File "/home/pi/Desktop/Data Scraper/test.py", line 108, in <module>
    schedule.run_pending()
  File "/home/pi/.local/lib/python3.7/site-packages/schedule/__init__.py", line 563, in run_pending
    default_scheduler.run_pending()
  File "/home/pi/.local/lib/python3.7/site-packages/schedule/__init__.py", line 94, in run_pending
    self._run_job(job)
  File "/home/pi/.local/lib/python3.7/site-packages/schedule/__init__.py", line 147, in _run_job
    ret = job.run()
  File "/home/pi/.local/lib/python3.7/site-packages/schedule/__init__.py", line 466, in run
    ret = self.job_func()
  File "/home/pi/Desktop/Data Scraper/test.py", line 47, in job
    a.append(datoer.text)
  File "/usr/local/lib/python3.7/dist-packages/selenium/webdriver/remote/webelement.py", line 76, in text
    return self._execute(Command.GET_ELEMENT_TEXT)['value']
  File "/usr/local/lib/python3.7/dist-packages/selenium/webdriver/remote/webelement.py", line 633, in _execute
    return self._parent.execute(command, params)
  File "/usr/local/lib/python3.7/dist-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/usr/local/lib/python3.7/dist-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=84.0.4147.141)
furas
  • 134,197
  • 12
  • 106
  • 148
William
  • 1
  • 2
  • If you get a stale element you have to reinitialize values when leaving the current url. If it's an a tag just collect all the hrefs and driver.get() to them. – Arundeep Chohan Feb 06 '21 at 18:08
  • today I saw the same problem in two questions. Maybe you should use `search` on Stackoverflow or Google to find answer for your problem. – furas Feb 06 '21 at 19:58
  • BTW: instead of `i = 0`, `date[i].click()`, `i = i+1` and `if i==len(date): break` you should learn to use simple `for klikk in date: klikk.click()` – furas Feb 06 '21 at 20:07
  • always put full error message (starting at word "Traceback") in question (not comment) as text (not screenshot). There are other useful infrmation. – furas Feb 06 '21 at 20:09
  • you didn't show full error message so we don't know which element makes problem. And don't expect that we will run code to see full error message. And maybe you need only `sleep()` so browser will have time to run all JavaScript code - before you use `click()`. – furas Feb 06 '21 at 20:11
  • @furas here is the full error message: https://justpaste.it/8f5rq – William Feb 07 '21 at 11:50
  • read my previous comment: `... in question (not comment) as text (not screenshot)`. It means also to put it as full text, not link to external portal. – furas Feb 07 '21 at 14:53
  • I added full error to you question - it shows problem with `a.append(datoer.text)` and it can means problem with `date`. As for me you should get all dates as text before you use `date[i].click()` because `click()` may move objects in browser's memory and then `date` is referring to non-existing objects in browser's memory. – furas Feb 07 '21 at 15:02
  • @furas changed date[i].click() to klikk.click() as you said, and it worked fine. But when i changed date to date.text before the loop click, Attributeerror 'list' object has no attribute 'text' pops up. – William Feb 07 '21 at 16:15
  • @arundeepchohan I am a programming newbie, and i am not sure what you mean. Can you provide your suggestion as a code instead, and where i should implement it? – William Feb 07 '21 at 16:21
  • `date` is list and you have to use `for`-loop to work with list - see `date_text = [item.text for item in date]` in my answer below – furas Feb 07 '21 at 17:01

1 Answers1

0

Selenium gives you references to objects that live in the browser's memory for the current page. When you click() - or when the page runs JavaScript code that adds elements - the objects in the browser's memory can change, so the old references no longer lead to elements attached to the page. This gives the error stale element reference: element is not attached to the page document.

You would have to get date again after click().

Or you should get date as text before click()

# locate the date elements once, before anything on the page is clicked
date = driver.find_elements_by_class_name("forecast-day-view-date-bar__date")

# get all dates as text (before `click()`)
date_text = [item.text for item in date]

# the elements are only clicked from here on; their text is already saved
for item in date:
    item.click()

and later you should use this list

# build the "Date" column from the saved strings instead of the elements
for k, text in enumerate(date_text):
    print("Dato:", text)
    a.append(text)
    # pad with blank cells after each date: `count` after the first one,
    # 23 after every later one
    if k == 0:
        a.extend([""]*count) 
    else:
        a.extend([""]*23)

EDIT:

My version with other changes - ie. I use less DataFrame.

I try to make some elements very similar to move it to function and make it even shorter.

On Linux I would use the cron service instead of the Python module schedule.

When the code runs from a scheduler or cron, I don't need to display the text, so I would use a variable to switch the printing off: if display: print(...). Without printing it should also run faster.

from selenium import webdriver
import pandas as pd
from datetime import datetime
import time
import schedule

def job():
    """Scrape the forecast page at pent.no and save it as an Excel sheet.

    Steps:
      1. open the page in Chrome and locate the date elements,
      2. copy their text BEFORE clicking them, because the references may be
         stale once the page has reacted to the clicks,
      3. click every date, then read the hour labels and the temperature,
         wind and precipitation widgets,
      4. write one column per series to ``weather<day-of-month>.xlsx``.

    Every value is also printed while it is collected.

    Raises:
        Whatever Selenium or pandas raise is propagated unchanged (pandas
        rejects columns of different lengths, for example). The browser is
        closed in every case.
    """
    # Local import keeps this snippet self-contained. `find_elements(By...)`
    # works on Selenium 3 and 4, while `find_elements_by_class_name` was
    # removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()

    # try/finally guarantees that Chrome is shut down even when scraping
    # fails; otherwise every failed run leaves a browser process behind.
    try:
        driver.get("https://pent.no/60.19401,11.09936")

        date = driver.find_elements(By.CLASS_NAME, "forecast-day-view-date-bar__date")

        # get all dates as text (before `click()`)
        date_text = [item.text for item in date]

        for item in date:
            item.click()

        # Named `hour_labels` rather than `time` so the imported `time`
        # module is not shadowed inside this function.
        hour_labels = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-hour-label")

        # Number of blank cells after the first date in the "Date" column.
        # The constant 193 is taken over from the question as-is
        # (presumably 8 full days * 24 hours + 1 -- TODO confirm).
        count = len(hour_labels) - 193

        # Even positions feed the "... Yr" columns, odd positions the
        # "... Storm" columns.
        temp = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-weather-widget__temperature")
        temp2 = temp[::2]
        temp3 = temp[1::2]

        wind = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-weather-widget__wind-speed")
        wind2 = wind[::2]
        wind3 = wind[1::2]

        rainfall = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-weather-widget__precipitation")
        rainfall2 = rainfall[::2]
        rainfall3 = rainfall[1::2]

        # --- dictionary for all columns ---

        all_columns = dict()

        # --- Date ---

        rows = []

        # Each date is followed by blank cells: `count` after the first
        # date, 23 after every later one.
        for k, text in enumerate(date_text):
            print("Dato:", text)
            rows.append(text)
            if k == 0:
                rows.extend([""] * count)
            else:
                rows.extend([""] * 23)

        all_columns["Date"] = rows

        # --- Time ---

        rows = []

        for item in hour_labels:
            text = item.text.strip()
            print("Tid:", text)
            rows.append(text)

        all_columns["Time"] = rows

        # --- Temp Yr ---

        rows = []

        for item in temp2:
            text = item.text.strip()
            print("Temp yr:", text)
            rows.append(text)

        all_columns["Temp Yr"] = rows

        # --- Temp Storm ---

        rows = []

        for item in temp3:
            text = item.text.strip()
            print("Temp storm:", text)
            rows.append(text)

        all_columns["Temp Storm"] = rows

        # --- Wind Yr ---

        rows = []

        for item in wind2:
            text = item.text.strip()
            print("Vind yr:", text)
            rows.append(text)

        all_columns["Wind Yr"] = rows

        # --- Wind Storm ---

        rows = []

        for item in wind3:
            text = item.text.strip()
            print("Vind storm:", text)
            rows.append(text)

        all_columns["Wind Storm"] = rows

        # --- Rainfall Yr ---

        rows = []

        # An empty widget is stored as "0.0 mm" right away, so no
        # `replace()` on the DataFrame is needed afterwards.
        for item in rainfall2:
            text = item.text.strip()
            if text == "":
                text = "0.0 mm"
            print("Rein yr:", text)
            rows.append(text)

        all_columns["Rainfall Yr"] = rows

        # --- Rainfall Storm ---

        rows = []

        for item in rainfall3:
            text = item.text.strip()
            if text == "":
                text = "0.0 mm"
            print("Rein storm:", text)
            rows.append(text)

        all_columns["Rainfall Storm"] = rows

        # --- save ---

        result = pd.DataFrame(all_columns)

        # The file name carries the current day of the month.
        result.to_excel("weather{}.xlsx".format(datetime.now().day))
    finally:
        driver.quit()
    
# The scheduler loop is disabled here; job() is run once directly instead.
#schedule.every().day.at("00:00").do(job)
#while True:   # `True` instead of `1` is more readable, besides Python will run `while bool(1):`
#    schedule.run_pending()
#    time.sleep(60)

job()

EDIT:

Version with function

def get_rows(items, description=None, replace=None):
    """Return the stripped ``.text`` of every element in *items* as a list.

    items       -- iterable of objects that expose a ``.text`` string
    description -- label printed in front of each value; nothing is printed
                   when it is empty/None or when ``DISPLAY`` is false
    replace     -- value stored instead of an empty text (skipped when it is
                   empty/None)
    """
    collected = []

    for element in items:
        value = element.text.strip()

        # substitute the placeholder only for blank cells
        if not value and replace:
            value = replace

        collected.append(value)

        # the printed value is the one that was stored
        if DISPLAY and description:
            print(description, value)

    return collected

and now code is much shorter

from selenium import webdriver
import pandas as pd
from datetime import datetime
import time
import schedule

# --- constants --- (PEP8: UPPER_CASE_NAMES)

DISPLAY = True

# --- classes --- (PEP8: CamelCaseNames)

# empty

# --- functions --- (PEP8: lower_case_names)

def get_rows(items, description=None, replace=None):
    """Return the stripped ``.text`` of every element in *items* as a list.

    items       -- iterable of objects that expose a ``.text`` string
    description -- label printed in front of each value; nothing is printed
                   when it is empty/None or when ``DISPLAY`` is false
    replace     -- value stored instead of an empty text (skipped when it is
                   empty/None)
    """
    collected = []

    for element in items:
        value = element.text.strip()

        # substitute the placeholder only for blank cells
        if not value and replace:
            value = replace

        collected.append(value)

        # the printed value is the one that was stored
        if DISPLAY and description:
            print(description, value)

    return collected


def job():
    """Scrape the forecast page at pent.no and save it as an Excel sheet.

    The date texts are copied before the date elements are clicked, because
    the references may be stale once the page has reacted to the clicks. All
    other columns are read after the clicks through ``get_rows()`` and the
    table is written to ``weather--<day-of-month>.xlsx``.

    Raises:
        Whatever Selenium or pandas raise is propagated unchanged (pandas
        rejects columns of different lengths, for example). The browser is
        closed in every case.
    """
    # Local import keeps this snippet self-contained. `find_elements(By...)`
    # works on Selenium 3 and 4, while `find_elements_by_class_name` was
    # removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()

    # try/finally guarantees that Chrome is shut down even when scraping
    # fails; otherwise every failed run leaves a browser process behind.
    try:
        driver.get("https://pent.no/60.19401,11.09936")

        date = driver.find_elements(By.CLASS_NAME, "forecast-day-view-date-bar__date")

        # get all dates as text (before `click()`)
        date_text = [item.text for item in date]

        for item in date:
            item.click()

        # Named `hour_labels` rather than `time` so the imported `time`
        # module is not shadowed inside this function.
        hour_labels = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-hour-label")

        # Number of blank cells after the first date in the "Date" column.
        # The constant 193 is taken over from the question as-is
        # (presumably 8 full days * 24 hours + 1 -- TODO confirm).
        count = len(hour_labels) - 193

        # Even positions feed the "... Yr" columns, odd positions the
        # "... Storm" columns.
        temp = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-weather-widget__temperature")
        temp2 = temp[::2]
        temp3 = temp[1::2]

        wind = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-weather-widget__wind-speed")
        wind2 = wind[::2]
        wind3 = wind[1::2]

        rainfall = driver.find_elements(By.CLASS_NAME, "forecast-hour-view-weather-widget__precipitation")
        rainfall2 = rainfall[::2]
        rainfall3 = rainfall[1::2]

        # - Date -

        rows_date = []

        # Each date is followed by blank cells: `count` after the first
        # date, 23 after every later one.
        for k, text in enumerate(date_text):
            if DISPLAY:
                print("Dato:", text)
            rows_date.append(text)
            if k == 0:
                rows_date.extend([""] * count)
            else:
                rows_date.extend([""] * 23)

        # - other -

        # The two rainfall columns store "0.0 mm" for blank widgets.
        result = pd.DataFrame({
            "Date": rows_date,
            "Time": get_rows(hour_labels, "Tid:"),
            "Temp Yr": get_rows(temp2, "Temp yr:"),
            "Temp Storm": get_rows(temp3, "Temp storm:"),
            "Wind Yr": get_rows(wind2, "Vind yr:"),
            "Wind Storm": get_rows(wind3, "Vind storm:"),
            "Rainfall Yr": get_rows(rainfall2, "Rein yr:", "0.0 mm"),
            "Rainfall Storm": get_rows(rainfall3, "Rein storm:", "0.0 mm"),
        })

        # - save -

        # The file name carries the current day of the month.
        result.to_excel("weather--{}.xlsx".format(datetime.now().day))
    finally:
        driver.quit()
    
# --- main --- (PEP8: lower_case_names)
    
# The scheduler loop is disabled here; job() is run once directly instead.
#schedule.every().day.at("00:00").do(job)
#while True:   # `True` instead of `1` is more readable, besides Python will run `while bool(1):`
#    schedule.run_pending()
#    time.sleep(60)

job()

PEP 8 -- Style Guide for Python Code

furas
  • 134,197
  • 12
  • 106
  • 148