I'm trying to scrape a page with Selenium, selenium-wire (for authenticated proxies), and a random user agent. The script starts fine, but after searching a few lines from the input file (around 2 or 3) I get the error in the title. I searched a lot and saw that a possible solution would be to put a sleep right before driver.get(); however, that didn't help. I added all of these try/except blocks to try to understand what is happening, but the error isn't caught by any of them.
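This is what that workaround looked like in my code (30 seconds was an arbitrary guess):

    driver = webdriver.Chrome(
        seleniumwire_options=wire_options,
        service=WEBDRIVER_SVC,
        options=chrome_options,
    )
    sleep(30)  # wait before the first request, as suggested
    driver.get("https://www.google.com/")

It made no difference. The full script: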
from colorama import Fore, Back, Style, init
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# selenium-wire's webdriver, not selenium's, so the proxy options are honored
from seleniumwire import webdriver
from multiprocessing import freeze_support
from threading import Thread
from time import sleep
from fake_useragent import UserAgent
import numpy as np
import webbrowser
import json
import os
import sys
# disable webdriver-manager debug logging
os.environ["WDM_LOG_LEVEL"] = "0"
# colorama startup
init(autoreset=True)
# chromedriver startup
WEBDRIVER_SVC = ChromeService(ChromeDriverManager().install())
# driver
def spawn_driver(line: str):
    while True:
        try:
            try:
                # new random Chrome user agent for every attempt
                ua = UserAgent()
                user_agent = ua.chrome
                chrome_options = Options()
                chrome_options.add_argument(f"user-agent={user_agent}")
                chrome_options.add_argument("--headless")
                chrome_options.add_argument("--disable-gpu")
                chrome_options.add_argument("--window-size=1920,1080")
                chrome_options.add_argument("--disable-extensions")
                chrome_options.add_argument("--start-maximized")
                chrome_options.add_experimental_option(
                    "excludeSwitches", ["enable-logging"]
                )
                # authenticated proxy (credentials are placeholders)
                wire_options = {
                    "proxy": {
                        "http": "http://myuser:myhost@domainproxy.com",
                        "https": "https://myuser:myhost@domainproxy.com",
                    }
                }
                driver = webdriver.Chrome(
                    seleniumwire_options=wire_options,
                    service=WEBDRIVER_SVC,
                    options=chrome_options,
                )
                sleep(30)  # suggested workaround: wait before the first get()
                driver.get("https://www.google.com/")
                sleep(3)
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#search"))
                ).click()
                # search for the current line from filename.txt
                sleep(10)
            except Exception:
                driver.close()
                continue
            try:
                # "not found" message
                not_found = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="alertError"]/div/div[1]')
                    )
                )
                print(not_found.text)
                driver.close()
                break
            except Exception:
                try:
                    # validation point: the account-recovery link appeared
                    found_msg = WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "#lnk-accountRecovery > span")
                        )
                    )
                    print(found_msg.text)
                    break
                except Exception:
                    try:
                        # "blocked" message
                        blocked_msg = WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located(
                                (
                                    By.CSS_SELECTOR,
                                    "#__next > div > form > div.sc-fFTYTi.hNKDnZ > div.sc-gQNndl.bcmdmD > div.sc-jOBXIr.ivZFJS > div.sc-kIWQTW.gVwKbm > h5",
                                )
                            )
                        )
                        print(blocked_msg.text)
                        continue
                    except Exception:
                        continue
                    finally:
                        driver.close()
        except Exception:
            driver.close()
            continue
def parse_list(unique_list):
    for line in unique_list:
        spawn_driver(line)


def parse_file(filename: str, threads: int):
    with open(filename, "r") as f:
        lines = f.readlines()
    # split the input lines into one chunk per thread
    splited_file = np.array_split(lines, threads)
    for unique_list in splited_file:
        try:
            t = Thread(target=parse_list, args=(unique_list,))
            t.start()
        except Exception:
            sleep(1)
def start():
    filename = "myfile.txt"
    threads = 5
    output = "results.txt"  # not used anywhere yet
    # execute script
    parse_file(filename, threads)


def main():
    start()


if __name__ == "__main__":
    freeze_support()
    main()
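For reference, here is the minimal per-line driver lifecycle I think the loop reduces to. This is only a sketch under my own assumptions: visit_once is an illustrative stand-in for spawn_driver's body, the proxy values are the same placeholders as above, the actual search step is elided, and I've used driver.quit() in a finally block because I'm not sure whether driver.close() (which closes the window but not the session) is part of my problem:

    from seleniumwire import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service as ChromeService
    from webdriver_manager.chrome import ChromeDriverManager

    def visit_once(line: str) -> None:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        wire_options = {
            "proxy": {
                "http": "http://myuser:myhost@domainproxy.com",  # placeholder
                "https": "https://myuser:myhost@domainproxy.com",  # placeholder
            }
        }
        # fresh Service per driver (the full script shares one module-level
        # Service across threads; I don't know if that matters)
        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(
            seleniumwire_options=wire_options,
            service=service,
            options=chrome_options,
        )
        try:
            driver.get("https://www.google.com/")
            # ... search for `line` here ...
        finally:
            driver.quit()  # quit() ends the whole session; close() does not

The only deliberate differences from the full script are quit() instead of close() and the per-driver Service; I haven't confirmed whether either matters.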