I am trying to make a scraper for LinkedIn with Selenium in Python.

My goal is to automate the process in Google Cloud so that every day it connects to my profile and scrapes my contacts.

The problem I have is that when I connect from a public IP address different from my home public IP address, I have to pass LinkedIn's security check. This happens because I am connecting from Google Cloud Run, so the public IP address assigned to me is different from my local public IP address.

I have tried everything from this post https://copyprogramming.com/howto/how-to-bypass-linkedin-s-security-check-for-web-scraping-with-python-and-selenium?utm_content=cmp-true (in the Chrome options I also include --headless, otherwise it does not work in the cloud). The 'li_at' cookie seems to work, but its value changes every time I start a session: I can get the cookie value on my local computer and pass it to the code in the cloud, but if I log out on my local computer the value changes. Also, passing the cookie value by hand every time I need a session is not what I want, because then it would not be as automated as I want.
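
For reference, this is more or less how I get the li_at value on my local machine after a normal login (a simplified sketch; the email and password are placeholders, and the sign-in button XPath is the same one used in the code further down):

from selenium import webdriver
from selenium.webdriver.common.by import By

import time

# Log in once locally with a visible browser and print the li_at cookie,
# whose value I then copy into the li_at environment variable used in the cloud.
driver = webdriver.Chrome()
driver.get("https://www.linkedin.com/login")
driver.find_element(By.ID, "username").send_keys("my_email")      # placeholder
driver.find_element(By.ID, "password").send_keys("my_password")   # placeholder
driver.find_element(By.XPATH, "//button[text()='Sign in']").click()

time.sleep(5)  # wait for the redirect so the session cookie is set
li_at = driver.get_cookie("li_at")
print(li_at["value"] if li_at else "li_at cookie not found")
driver.quit()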

I don't think it is a code issue, because I can run it in a Docker container on my local computer, but I will leave you the Python code anyway.

I am looking for a way to pin li_at so it does not change with every session, a way to link my local public IP address to Google Cloud, or any other way to make this work.
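
Ideally I would end up with something like this, where the cookies from one successful login are saved and reloaded on every run instead of logging in again (just a sketch of what I am after; cookies.pkl is a made-up local path):

import pickle


def save_cookies(driver, path="cookies.pkl"):
    # Persist the whole cookie jar (including li_at) after a successful login.
    with open(path, "wb") as f:
        pickle.dump(driver.get_cookies(), f)


def load_cookies(driver, path="cookies.pkl"):
    # Reattach the saved cookies so the old session is reused without logging in.
    driver.get("https://www.linkedin.com")
    with open(path, "rb") as f:
        for cookie in pickle.load(f):
            driver.add_cookie(cookie)
    driver.refresh()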

Python code


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# from selenium_stealth import stealth

from google.cloud import storage


import itertools
import pandas as pd
import re
import time
import os


class ContactosYaCreados:
    def __init__(self):
        self.url = "https://www.linkedin.com/login"
        self.email = os.environ["EMAIL"]
        self.contraseña = os.environ["PASSWORD"]

    def scrap_contacts(self):
        self.driver_initilizer()
        self.inicio_sesion()
        self.get_contacts()
        self.driver.quit()

    def driver_initilizer(self):

        # Chrome options needed to run headless inside the Cloud Run container
        # and to drop the most obvious automation switches.
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("start-maximized")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)

        self.driver = webdriver.Chrome(options=options)

        # stealth(
        #     self.driver,
        #     languages=["en-US", "en"],
        #     vendor="Google Inc.",
        #     platform="Win32",
        #     webgl_vendor="Intel Inc.",
        #     renderer="Intel Iris OpenGL Engine",
        # )

        # Open the login page first so the cookie can be attached to the
        # linkedin.com domain, then inject the li_at session cookie that is
        # passed in through the li_at environment variable.
        self.driver.get(self.url)

        cookie = {"name": "li_at", "value": os.environ["li_at"]}

        self.driver.add_cookie(cookie)

        self.actions = ActionChains(self.driver)
        self.driver.maximize_window()
        self.driver.implicitly_wait(5)

    def inicio_sesion(self):

        time.sleep(2)

        # Fill in the login form with the credentials from the environment.
        search = self.driver.find_element(By.XPATH, "//input[@id='username']")
        search.send_keys(self.email)

        time.sleep(2)

        search = self.driver.find_element(By.XPATH, "//input[@id='password']")
        search.send_keys(self.contraseña)

        time.sleep(2)
        # search.send_keys(Keys.RETURN)

        # Wait for the 'Sign in' button and click it.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    "//button[text()='Sign in']",
                )
            )
        ).click()

    def get_contacts(self):
        time.sleep(10)
        # Dump the home page body to the logs to check whether the feed or a
        # security checkpoint was reached.
        print(
            "body pagina home:", self.driver.find_element(By.XPATH, "/html/body").text
        )

        self.driver.get(
            "https://www.linkedin.com/mynetwork/invite-connect/connections/"
        )

        # Contacts scraped on previous runs, stored as a CSV in a GCS bucket
        # (pandas reads the gs:// path through gcsfs).
        self.contactos_linkedin = pd.read_csv(
            "gs://linkedin_contactos/contactos_linkedin.csv", sep=",", encoding="utf-8"
        )

        # Most recently saved contact, used later as the stop marker when
        # collecting new names.
        self.nombre = self.contactos_linkedin["Nombre completo"].iloc[0]

        print(self.nombre)
        time.sleep(10)
        print(
            "body pagina conexion:",
            self.driver.find_element(By.XPATH, "/html/body").text,
        )
        self.scroll_end()
        time.sleep(3)
        self.get_elementos()

    def scroll_end(self):
        time.sleep(2)
        # Scroll to the bottom of the page so more connection cards get loaded.
        self.driver.execute_script("window.scrollBy(0,document.body.scrollHeight)")

    def get_elementos(self):
        self.get_nombres()
        self.get_descripcion()

    def get_nombres(self):
        # Names shown on the connection cards currently rendered on the page.
        path = "//span[@class='mn-connection-card__name t-16 t-black t-bold']"
        self.nombres = self.get_text_element(path)
        print(self.nombres)
        # Keep only the names that appear before the last saved contact, i.e.
        # the connections added since the previous run.
        self.nombres = list(itertools.takewhile(self.match_name, self.nombres))

    def match_name(self, nombre):
        return nombre != self.nombre

    def get_descripcion(self):
        # Occupation/headline of each connection card, trimmed to the same
        # length as the list of new names.
        path = "//span[@class='mn-connection-card__occupation t-14 t-black--light t-normal']"
        self.descripcion = self.get_text_element(path)
        self.descripcion = self.descripcion[: len(self.nombres)]

    def get_text_element(self, path):
        # Return the visible text of every element matching the given XPath.
        get_element = self.driver.find_elements(By.XPATH, path)
        elementos = [elemento.text for elemento in get_element]

        return elementos
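
For completeness, this is roughly how I kick everything off (simplified; the Cloud Run wrapper around it is omitted):

if __name__ == "__main__":
    scraper = ContactosYaCreados()
    scraper.scrap_contacts()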