I'm attempting to write a Python script that logs in to a website that runs JavaScript and scrape an element from the dashboard page. I'm using mechanize to login to the website and Requests-HTML to scape the data.
I can successfully login to the accounts page using mechanize. But I cannot pass the cookie data to Requests-HTML and continue the session to the dashboard page so I can scrape the data. I can't seem to format the data right to get the website (through Requests-HTML) to accept it.
I did get a version of this script running entirely with Selenium (the code is at the bottom), but I'd prefer to run a script that doesn't require a browser driver that opens a window.
from requests_html import HTMLSession
import mechanize
username = "me@example.com"
password = "12345678"
accts_url = "https://accounts.website.com"
dash_url = "https://dashboard.website.com"
browser = mechanize.Browser()
browser.open(accts_url)
browser.select_form(nr=0)
browser.form['email'] = username
browser.form['password'] = password
browser.submit()
response = browser.open(dash_url)
cookiejar_token = browser.cookiejar
print("mechanize, response:\n", response.read())
print("mechanize, browser.cookiejar:\n", cookiejar_token)
if str(cookiejar_token).startswith('<CookieJar['):
cookiejar_token_str_list = str(cookiejar_token).split(' ')
LBSERVERID_accts = cookiejar_token_str_list[1].lstrip('LBSERVERID=')
accounts_domain = cookiejar_token_str_list[3].rstrip('/>,')
session = cookiejar_token_str_list[5].lstrip('session=')
session_domain = cookiejar_token_str_list[7].rstrip('/>,')
LBSERVERID_dash = cookiejar_token_str_list[9].lstrip('LBSERVERID=')
dashboard_domain = cookiejar_token_str_list[11].rstrip('/>]>')
print("cookiejar_token_str_list:\n", cookiejar_token_str_list)
print("accounts 'LBSERVERID': %s for %s" % (LBSERVERID_accts, accounts_domain))
print("accounts 'session': %s for %s" % (session, session_domain))
print("dashboard 'LBSERVERID': %s for %s" % (LBSERVERID_dash, dashboard_domain))
else:
print("Incompatible token!\n")
# *****Requests_HTML does not communicate with mechanize!
session = HTMLSession()
print ("session.cookies:\n", session.cookies)
# I also made accounts_cookie_dict and session_cookie_dict
dash_cookie_dict = {
'name': 'LBSERVERID',
'value': LBSERVERID_dash,
'domain': dashboard_domain,
'path': '/'
}
# I attempt to manually create the correct cookie and assign it to dash_token, below
dash_token = browser.set_simple_cookie(dash_cookie_dict['name'], dash_cookie_dict['value'], dash_cookie_dict['domain'], dash_cookie_dict['path'])
print("dash_token:", dash_token)
print("cookiejar_token:", cookiejar_token)
print("dash_cookie_dict:\n", dash_cookie_dict)
# *****Attempting to pass the cookie to Requests-HTML below FAILS! :'(
response_obj = session.post(dash_url, cookies=dash_token)
print("response_obj:\n", response_obj)
print("response_obj.cookies from session.post:\n", response_obj.cookies)
response_obj.html.render(sleep=0.5)
print("requests_html, r.html.find('input'):\n", response_obj.html.find('input'))
Terminal Output:
mechanize, response:
b'<!doctype html><html lang="en"><head><script>!function(e***shortened by OP***</html>' ### Output in this field tells me the login by mechanize was successful
mechanize, browser.cookiejar:
<CookieJar[<Cookie LBSERVERID=3**************8 for accounts.example.com/>, <Cookie session=.e***shortened by OP***Y for accounts.example.com/>, <Cookie LBSERVERID=0**************a for dashboard.example.com/>]>
cookiejar_token_str_list:
['<CookieJar[<Cookie', 'LBSERVERID=3************8', 'for', 'accounts.example.com/>,', '<Cookie', 'session=.e***shortened by OP***Y', 'for', 'accounts.example.com/>,', '<Cookie', 'LBSERVERID=0**************a', 'for', 'dashboard.example.com/>]>']
accounts 'LBSERVERID': 3************8 for accounts.example.com
accounts 'session': .e***shortened by OP***Y for accounts.example.com
dashboard 'LBSERVERID': 0**************a for dashboard.example.com
session.cookies:
<RequestsCookieJar[]>
dash_token: None
cookiejar_token: <CookieJar[<Cookie LBSERVERID=3************8 for accounts.example.com/>, <Cookie session=.e***shortened by OP***Y for accounts.example.com/>, <Cookie LBSERVERID=0**************a for dashboard.example.com/>]>
dash_cookie_dict:
{'name': 'LBSERVERID', 'value': '0**************a', 'domain': 'dashboard.example.com', 'path': '/'}
response_obj:
<Response [403]> ### Access denied and it issues a new cookie below
response_obj.cookies from session.post:
<RequestsCookieJar[<Cookie LBSERVERID=a**************3 for dashboard.example.com/>]>
requests_html, r.html.find('input'): ### The output below tells me I'm back on the login page
[<Element 'input' class=('form-control',) id='email' name='email' required='' type='text' value=''>, <Element 'input' class=('form-control',) id='password' name='password' required='' type='password' value=''>, <Element 'input' id='csrf_token' name='csrf_token' type='hidden' value='I***shortened by OP***Y'>]
My Selenium code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
login_post_url = "https://accounts.example.com"
internal_url = "https://dashboard.example.com"
username = "user@email.com"
password = "12345678"
driver = webdriver.Safari(executable_path='/usr/bin/safaridriver') # initialize the Safari driver for Mac
driver.get(login_post_url) # head to login page
driver.find_element("id", "email").send_keys(username)
driver.find_element("id", "password").send_keys(password)
driver.find_element("id", "submit_form").click()
WebDriverWait(driver=driver, timeout=10).until( # wait the ready state to be complete
lambda x: x.execute_script("return document.readyState === 'complete'"))
error_message = "Incorrect username or password."
errors = driver.find_elements(By.CLASS_NAME, "flash-error") # get the errors (if there are)
# print the errors optionally
# for e in errors:
# print(e.text)
if any(error_message in e.text for e in errors): # if we find that error message within errors, then login is failed
print("[!] Login failed")
else:
print("[+] Login successful")
time.sleep(5)
driver.get(internal_url)
time.sleep(5)
element = driver.find_element(By.XPATH, '/html/........./div/p')
scraped_variable = element.get_attribute('innerHTML')
print("scraped_variable:", scraped_variable)