I have deployed a Selenium scraper to Scrapinghub cloud using a custom Docker container. When the script runs there, Selenium raises the error below, and I am not sure what causes it in the deployed container but not in my local environment.
File "link_spider.py", line 128, in main
url_crawler.setUp()
File "link_spider.py", line 49, in setUp
self.driver = webdriver.Chrome(desired_capabilities=capabilities, options=chrome_options)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 81, in __init__
desired_capabilities=desired_capabilities)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 157, in __init__
self.start_session(capabilities, browser_profile)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 252, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
When I run it in a virtualenv on my local environment, which is Ubuntu 18.04, I have no issues at all. Below is the setUp method from my class.
def setUp(self):
    headless_proxy = "127.0.0.1:3128"
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': headless_proxy,
        'ftpProxy': headless_proxy,
        'sslProxy': headless_proxy,
        'noProxy': ''
    })
    # directory for downloaded files
    path = '/tmp'
    # send downloads to /tmp and disable image loading
    prefs = {
        "download.default_directory": path,
        "profile.managed_default_content_settings.images": 2,
    }
    chrome_options = webdriver.ChromeOptions()
    # change the download location
    chrome_options.add_experimental_option("prefs", prefs)
    # turn off webdriver detection
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    # disable the Chrome automation notification
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--headless")
    capabilities = dict(DesiredCapabilities.CHROME)
    proxy.add_to_capabilities(capabilities)
    self.driver = webdriver.Chrome(desired_capabilities=capabilities, options=chrome_options)
    self.driver.set_page_load_timeout(120)
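For what it's worth, I know headless Chrome inside a container often needs extra flags that a desktop Ubuntu install does not. My script does not currently set these, so the lines below are only a sketch of something I could try, not part of the failing setup:

# Assumption, not in my current setUp(): flags commonly required in containers
chrome_options.add_argument("--no-sandbox")             # Chrome refuses to start as root without this
chrome_options.add_argument("--disable-dev-shm-usage")  # containers often mount a very small /dev/shm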
Dockerfile:
FROM scrapinghub/scrapinghub-stack-scrapy:1.6-py3
RUN apt-get -y --no-install-recommends install zip unzip jq libxml2 libxml2-dev
RUN printf "deb http://archive.debian.org/debian/ jessie main\ndeb-src http://archive.debian.org/debian/ jessie main\ndeb http://security.debian.org jessie/updates main\ndeb-src http://security.debian.org jessie/updates main" > /etc/apt/sources.list
#============================================
# Google Chrome
#============================================
# can specify versions by CHROME_VERSION;
# e.g. google-chrome-stable=53.0.2785.101-1
# google-chrome-beta=53.0.2785.92-1
# google-chrome-unstable=54.0.2840.14-1
# latest (equivalent to google-chrome-stable)
# google-chrome-beta (pull latest beta)
#============================================
ARG CHROME_VERSION="google-chrome-stable"
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
  && echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
  && apt-get update -qqy \
  && apt-get -qqy install ${CHROME_VERSION:-google-chrome-stable} \
  && rm /etc/apt/sources.list.d/google-chrome.list \
  && rm -rf /var/lib/apt/lists/* /var/cache/apt/*
#============================================
# Chrome Webdriver
#============================================
# can specify versions by CHROME_DRIVER_VERSION
# Latest released version will be used by default
#============================================
ARG CHROME_DRIVER_VERSION
RUN CHROME_STRING=$(google-chrome --version) \
  && CHROME_VERSION_STRING=$(echo "${CHROME_STRING}" | grep -oP "\d+\.\d+\.\d+\.\d+") \
  && CHROME_MAJOR_VERSION="${CHROME_VERSION_STRING%%.*}" \
  && wget --no-verbose -O /tmp/LATEST_RELEASE "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_MAJOR_VERSION}" \
  && CD_VERSION=$(cat "/tmp/LATEST_RELEASE") \
  && rm /tmp/LATEST_RELEASE \
  && if [ -z "$CHROME_DRIVER_VERSION" ]; \
       then CHROME_DRIVER_VERSION="${CD_VERSION}"; \
     fi \
  && CD_VERSION="$CHROME_DRIVER_VERSION" \
  && echo "Using chromedriver version: $CD_VERSION" \
  && wget --no-verbose -O /tmp/chromedriver_linux64.zip https://chromedriver.storage.googleapis.com/$CD_VERSION/chromedriver_linux64.zip \
  && rm -rf /opt/selenium/chromedriver \
  && unzip /tmp/chromedriver_linux64.zip -d /opt/selenium \
  && rm /tmp/chromedriver_linux64.zip \
  && mv /opt/selenium/chromedriver /opt/selenium/chromedriver-$CD_VERSION \
  && chmod 755 /opt/selenium/chromedriver-$CD_VERSION \
  && ln -fs /opt/selenium/chromedriver-$CD_VERSION /usr/bin/chromedriver
#============================================
# crawlera-headless-proxy
#============================================
RUN curl -L https://github.com/scrapinghub/crawlera-headless-proxy/releases/download/1.1.1/crawlera-headless-proxy-linux-amd64 -o /usr/local/bin/crawlera-headless-proxy \
  && chmod +x /usr/local/bin/crawlera-headless-proxy
COPY ./start-crawl /usr/local/bin/start-crawl
ENV TERM xterm
ENV SCRAPY_SETTINGS_MODULE cars.settings
RUN pip install --upgrade pip
RUN pip install --upgrade setuptools
RUN mkdir -p /app
WORKDIR /app
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN python setup.py install
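To help debug this, I am also thinking of adding a sanity check at the end of the image build to confirm the browser and driver versions actually line up, since a mismatch that only exists in the container is my best guess so far. This RUN line is a sketch and is not in the Dockerfile above:

# Assumption: a Chrome/chromedriver version mismatch would only show up in the container
RUN google-chrome --version && chromedriver --version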
Any help would be greatly appreciated!