I am doing web scraping using selenium
in python with the following code:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
def get_all_search_details(URL):
SEARCH_RESULTS = {}
options = Options()
options.headless = True
options.add_argument("--remote-debugging-port=9222") #
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-extensions")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(URL)
print(f"Scraping {driver.current_url}")
try:
medias = WebDriverWait(driver,timeout=5,).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-row')))
except exceptions.StaleElementReferenceException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.NoSuchElementException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.TimeoutException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.WebDriverException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except exceptions.SessionNotCreatedException as e:
print(f">> {type(e).__name__}: {e.args}")
return
except Exception as e:
print(f">> {type(e).__name__} line {e.__traceback__.tb_lineno} of {__file__}: {e.args}")
return
except:
print(f">> General Exception: {URL}")
return
for media_idx, media_elem in enumerate(medias):
outer_html = media_elem.get_attribute('outerHTML')
result = scrap_newspaper(outer_html) # some external functions
SEARCH_RESULTS[f"result_{media_idx}"] = result
return SEARCH_RESULTS
if __name__ == '__main__':
in_url = "https://digi.kansalliskirjasto.fi/clippings?query=isokyr%C3%B6&categoryId=12&orderBy=RELEVANCE&page=3&resultMode=THUMB"
my_res = get_all_search_details(in_url)
I applied several try except
mentioned in documentation to ensure I would not get trapped in selenium exceptions, however, here is the error I obtained:
Traceback (most recent call last):
File "nationalbiblioteket_logs.py", line 277, in <module>
run()
File "nationalbiblioteket_logs.py", line 264, in run
all_queries(file_=get_query_log(QUERY=args.query),
File "nationalbiblioteket_logs.py", line 219, in all_queries
df = pd.DataFrame( df.apply( check_urls, axis=1, ) )
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/frame.py", line 8740, in apply
return op.apply()
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/apply.py", line 688, in apply
return self.apply_standard()
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/apply.py", line 812, in apply_standard
results, res_index = self.apply_series_generator()
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/pandas/core/apply.py", line 828, in apply_series_generator
results[i] = self.f(v)
File "nationalbiblioteket_logs.py", line 218, in <lambda>
check_urls = lambda INPUT_DF: analyze_(INPUT_DF)
File "nationalbiblioteket_logs.py", line 201, in analyze_
df["search_results"] = get_all_search_details(in_url)
File "/home/xenial/WS_Farid/DARIAH-FI/url_scraping.py", line 68, in get_all_search_details
outer_html = media_elem.get_attribute('outerHTML')
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/webelement.py", line 174, in get_attribute
self, name)
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 494, in execute_script
'args': converted_args})['value']
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 429, in execute
self.error_handler.check_response(response)
File "/home/xenial/anaconda3/envs/py37/lib/python3.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 243, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: headless chrome=110.0.5481.30)
What am I doing wrong in my python script which causes such exception? I want to return None
and get out of function in case such exception occurs.
Here are some more details regarding libraries I use:
>>> selenium.__version__
'4.5.0'
>>> webdriver_manager.__version__
'3.8.4'