Hi, how are you doing? I am new to Selenium. I am working in Colab (Python) with Google Chrome version 111.0.5563.64 and a matching ChromeDriver version. The code works fine when scraping a few tweets, but I have to collect 50,000 per run, and then two errors appear (sometimes one, sometimes the other). On the line `data = get_tweet_data(tweet)` I get:

> StaleElementReferenceException: Message: stale element reference: element is not attached to the page document

and, most commonly, on the line `curr_position = driver.execute_script('return window.pageYOffset;')` I get:

> WebDriverException: Message: unknown error: session deleted because of page crash
Code:
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from time import sleep
import pandas as pd
import csv
import datetime
import openpyxl
# Initialize the Chrome driver
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)  # 'chrome_options' is deprecated; chromedriver is on PATH in Colab
hashtag = '#checopérez'
limit = 70000
tweets = []
tweet_ids = set()
last_position = driver.execute_script('return window.pageYOffset;')
scrolling = True
def get_tweet_data(tweet):
    """Extract the date and text from a tweet element."""
    try:
        tweet_date = tweet.find_element('xpath', './/time').get_attribute('datetime')
        # The leading dot keeps the XPath relative to this tweet; the absolute
        # '//' form always matches the first tweet on the page
        tweet_text = tweet.find_element('xpath', './/*[@data-testid="tweetText"]/span[1]').text
    except NoSuchElementException:
        return None
    return [tweet_text, tweet_date]
# Navigate to the Twitter trending page
driver.get("https://twitter.com/explore/tabs/trending")
topic = driver.find_element('xpath', '//input[@data-testid="SearchBox_Search_Input"]')
topic.send_keys(hashtag)  # type the search term before submitting
topic.send_keys(Keys.RETURN)
# Click on "Latest"
driver.find_element('xpath', '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[1]/div[1]/div[2]/nav/div/div[2]/div/div[2]/a').click()
while scrolling:
    page_tweets = driver.find_elements('xpath', '//article[@data-testid="tweet"]')
    sleep(2)
    for tweet in page_tweets:
        data = get_tweet_data(tweet)
        if data:
            tweet_id = ''.join(data)
            if tweet_id not in tweet_ids:
                tweet_ids.add(tweet_id)
                tweets.append({'topic': hashtag, 'text': data[0], 'date': data[1]})
        # Exit the loop once the limit has been collected
        if len(tweets) >= limit:
            break
    if len(tweets) >= limit:
        break
    scroll_attempt = 0
    while True:
        # Scroll to the bottom and check whether the position changed
        sleep(1)
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)
        curr_position = driver.execute_script('return window.pageYOffset;')
        if last_position == curr_position:
            scroll_attempt += 1
            # End of scroll region
            if scroll_attempt >= 3:
                scrolling = False
                break
            else:
                sleep(2)  # Wait and attempt to scroll again
        else:
            last_position = curr_position
            break
```
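One thing I already tried for the stale element error is wrapping the extraction so that a tweet whose DOM node was re-rendered is skipped instead of crashing the run. This is just a sketch of my idea (`get_tweet_data_safe` is my own helper name, not tested at 50,000 scale):

```python
from selenium.common.exceptions import StaleElementReferenceException

def get_tweet_data_safe(tweet):
    """Like get_tweet_data, but return None instead of raising when the
    tweet's DOM node detaches mid-read. A stale reference cannot be
    reused, so the tweet is skipped; the next find_elements pass should
    pick it up again."""
    try:
        return get_tweet_data(tweet)
    except StaleElementReferenceException:
        return None
```

and then calling `data = get_tweet_data_safe(tweet)` inside the loop, but I am not sure this is the right approach.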
I want to extract a large amount in a single run: about 70,000 tweets, since after removing duplicates I need 50,000 unique ones.
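Since the page crash can happen tens of thousands of tweets into a run, I am also thinking about saving partial results periodically so a crash does not lose everything. A minimal sketch, assuming I call it right after `tweets.append(...)` (the interval and file name are my own choices):

```python
import pandas as pd

CHECKPOINT_EVERY = 1000  # my own choice of interval
CHECKPOINT_PATH = 'tweets_checkpoint.csv'  # my own file name

def save_checkpoint(tweets):
    """Write everything collected so far to disk so a page crash does
    not lose the whole run."""
    pd.DataFrame(tweets).to_csv(CHECKPOINT_PATH, index=False)

# inside the collection loop, right after tweets.append(...):
#     if len(tweets) % CHECKPOINT_EVERY == 0:
#         save_checkpoint(tweets)
```

Is there a reliable way to collect this many tweets in a single run, or a better way to handle these two errors?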