Hi, how are you doing? I am new to Selenium. I am working in Colab (Python) with Google Chrome version 111.0.5563.64 and a matching ChromeDriver version. The code works fine when scraping a few tweets, but I have to collect 50,000 per run, and then two errors appear (sometimes one, sometimes the other). On the line `data = get_tweet_data(tweet)` I get:

> StaleElementReferenceException: Message: stale element reference: element is not attached to the page document

and, most commonly, on the line `curr_position = driver.execute_script('return window.pageYOffset;')` I get:

> WebDriverException: Message: unknown error: session deleted because of page crash
Code:
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from time import sleep
import pandas as pd
import csv
import datetime
import openpyxl
# Initialize the Chrome driver
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)  # 'chrome_options' is deprecated; chromedriver is on PATH in Colab
hashtag = '#checopérez'
limit = 70000
tweets = []
tweet_ids = set()
last_position = driver.execute_script('return window.pageYOffset;')
scrolling = True
def get_tweet_data(tweet):
    """Extract the date and text from a tweet element."""
    try:
        tweet_date = tweet.find_element('xpath', './/time').get_attribute('datetime')
        # The leading dot keeps the XPath relative to this tweet; the absolute
        # '//' form always matches the first tweet on the page
        tweet_text = tweet.find_element('xpath', './/*[@data-testid="tweetText"]/span[1]').text
    except NoSuchElementException:
        return None
    return [tweet_text, tweet_date]
# Navigate to the Twitter trending page
driver.get("https://twitter.com/explore/tabs/trending")
topic = driver.find_element('xpath', '//input[@data-testid="SearchBox_Search_Input"]')
topic.send_keys(hashtag)  # type the search term before submitting
topic.send_keys(Keys.RETURN)
# Click on "Latest"
driver.find_element('xpath', '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[1]/div[1]/div[2]/nav/div/div[2]/div/div[2]/a').click()
while scrolling:
    page_tweets = driver.find_elements('xpath', '//article[@data-testid="tweet"]')
    sleep(2)
    for tweet in page_tweets:
        data = get_tweet_data(tweet)
        if data:
            tweet_id = ''.join(data)
            if tweet_id not in tweet_ids:
                tweet_ids.add(tweet_id)
                tweets.append({'topic': hashtag, 'text': data[0], 'date': data[1]})
        # Exit the loop once the limit has been collected
        if len(tweets) >= limit:
            break
    if len(tweets) >= limit:
        break
    scroll_attempt = 0
    while True:
        # Scroll to the bottom and check whether the position changed
        sleep(1)
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)
        curr_position = driver.execute_script('return window.pageYOffset;')
        if last_position == curr_position:
            scroll_attempt += 1
            # End of scroll region
            if scroll_attempt >= 3:
                scrolling = False
                break
            else:
                sleep(2)  # Wait and attempt to scroll again
        else:
            last_position = curr_position
            break
```
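One thing I already tried for the stale element error is wrapping the extraction so that a tweet whose DOM node was re-rendered is skipped instead of crashing the run. This is just a sketch of my idea (`get_tweet_data_safe` is my own helper name, not tested at 50,000 scale):

```python
from selenium.common.exceptions import StaleElementReferenceException

def get_tweet_data_safe(tweet):
    """Like get_tweet_data, but return None instead of raising when the
    tweet's DOM node detaches mid-read. A stale reference cannot be
    reused, so the tweet is skipped; the next find_elements pass should
    pick it up again."""
    try:
        return get_tweet_data(tweet)
    except StaleElementReferenceException:
        return None
```

and then calling `data = get_tweet_data_safe(tweet)` inside the loop, but I am not sure this is the right approach.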
I want to extract a large amount in a single run: about 70,000 tweets, since after removing duplicates I need 50,000 unique ones.
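Since the page crash can happen tens of thousands of tweets into a run, I am also thinking about saving partial results periodically so a crash does not lose everything. A minimal sketch, assuming I call it right after `tweets.append(...)` (the interval and file name are my own choices):

```python
import pandas as pd

CHECKPOINT_EVERY = 1000  # my own choice of interval
CHECKPOINT_PATH = 'tweets_checkpoint.csv'  # my own file name

def save_checkpoint(tweets):
    """Write everything collected so far to disk so a page crash does
    not lose the whole run."""
    pd.DataFrame(tweets).to_csv(CHECKPOINT_PATH, index=False)

# inside the collection loop, right after tweets.append(...):
#     if len(tweets) % CHECKPOINT_EVERY == 0:
#         save_checkpoint(tweets)
```

Is there a reliable way to collect this many tweets in a single run, or a better way to handle these two errors?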