I just started using Selenium for my own scraping. I'm setting up my proxy with Oxylabs, but some kind of cache seems to be preventing the browser from getting a new IP for every tab or connection it opens. My proxy URL uses rotating-per-connection, so it should be fine. The thing is, when I open many tabs very quickly, let's say 15, the first 10 have different IPs (I'm using ipinfo.io to test) and the last 5 share the same IP; when I open them slowly, all 15 tabs have the same IP. That's why I think it must be some kind of cache, although I'm not sure about this and it could be something else. Here's my code:
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import asyncio
from concurrent.futures.thread import ThreadPoolExecutor
from seleniumwire import webdriver
from threading import Thread
from threading import Lock
import time
# Shared concurrency state used by the tab-opening workers.
max_pages = 15  # upper bound for the tab loop and size of the thread pool
executor = ThreadPoolExecutor(max_pages)
# asyncio.get_event_loop() is deprecated when no loop is running and raises
# RuntimeError on recent Python versions, so the loop is created explicitly
# and registered as the current loop for this thread.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Serialises the driver calls made by scraper() across worker threads.
lock = Lock()
def scraper(driver, url, index):
    """Open ``url`` in a new browser tab, switch to it and delete all cookies.

    Args:
        driver: WebDriver instance on which the tab is opened.
        url: Address to load in the new tab.
        index: Number used only to label the progress message.

    The whole open/switch/clear sequence runs while holding the
    module-level ``lock``, so only one thread at a time talks to the driver.
    """
    # ``with`` releases the lock even when a driver call raises; a bare
    # acquire()/release() pair would leave every other worker blocked forever.
    with lock:
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script("window.open(arguments[0], '_blank');", url)
        # The last handle is treated as the tab that was just opened.
        driver.switch_to.window(driver.window_handles[-1])
        driver.delete_all_cookies()
        print(f"Started page {index}")
# Upstream proxy settings handed to selenium-wire when the driver is created.
proxy_profile = {
    "proxy": {
        "http": "http://customer-myuser-cc-es:mypass@pr.oxylabs.io:7777",
        "https": "https://customer-myuser-cc-es:mypass@pr.oxylabs.io:7777",
    }
}

# Firefox preferences intended to switch off HTTP and DNS caching.
options = Options()
# options.add_argument("--headless")
options.set_preference("browser.cache.disk.enable", False)
options.set_preference("browser.cache.memory.enable", False)
options.set_preference("browser.cache.offline.enable", False)
options.set_preference("network.http.use-cache", False)
options.set_preference("network.dnsCacheExpiration", 0)
options.set_preference("network.dnsCacheEntries", 0)

# Create a new instance of the Firefox driver.
driver = webdriver.Firefox(options=options, seleniumwire_options=proxy_profile)
try:
    # Indices run from 1 to max_pages - 1, one queued tab per index.
    for i in range(1, max_pages):
        loop.run_in_executor(executor, scraper, driver, "https://ipinfo.io", i)
    # Keep the browser open until the user has inspected the tabs.
    input("Press Enter to continue...")
finally:
    # Always shut the browser down, even if queuing a tab or reading input
    # fails, so no orphaned Firefox process is left behind.
    driver.quit()
As you can see, I have set some options that were supposed to prevent this, but they don't seem to work. My goal is to have a different IP for each tab or connection I establish with the browser (including refreshes of the same page/tab). Any insight is appreciated. Thanks.