0

I'm attempting to write a Python script that logs in to a website that runs JavaScript and scrape an element from the dashboard page. I'm using mechanize to login to the website and Requests-HTML to scape the data.

I can successfully login to the accounts page using mechanize. But I cannot pass the cookie data to Requests-HTML and continue the session to the dashboard page so I can scrape the data. I can't seem to format the data right to get the website (through Requests-HTML) to accept it.

I did get a version of this script running entirely with Selenium (the code is at the bottom), but I'd prefer to run a script that doesn't require a browser driver that opens a window.

from requests_html import HTMLSession
import mechanize

username = "me@example.com"
password = "12345678"
accts_url = "https://accounts.website.com"
dash_url = "https://dashboard.website.com"

browser = mechanize.Browser()
browser.open(accts_url)
browser.select_form(nr=0)
browser.form['email'] = username
browser.form['password'] = password
browser.submit()
response = browser.open(dash_url)
cookiejar_token = browser.cookiejar
print("mechanize, response:\n", response.read())
print("mechanize, browser.cookiejar:\n", cookiejar_token)

if str(cookiejar_token).startswith('<CookieJar['):
    cookiejar_token_str_list = str(cookiejar_token).split(' ') 
    LBSERVERID_accts = cookiejar_token_str_list[1].lstrip('LBSERVERID=')
    accounts_domain = cookiejar_token_str_list[3].rstrip('/>,')
    session = cookiejar_token_str_list[5].lstrip('session=')
    session_domain = cookiejar_token_str_list[7].rstrip('/>,')
    LBSERVERID_dash = cookiejar_token_str_list[9].lstrip('LBSERVERID=')
    dashboard_domain = cookiejar_token_str_list[11].rstrip('/>]>')
    print("cookiejar_token_str_list:\n", cookiejar_token_str_list)
    print("accounts 'LBSERVERID': %s for %s" % (LBSERVERID_accts, accounts_domain))
    print("accounts 'session': %s for %s" % (session, session_domain))
    print("dashboard 'LBSERVERID': %s for %s" % (LBSERVERID_dash, dashboard_domain))
else:
    print("Incompatible token!\n")

# *****Requests_HTML does not communicate with mechanize!
session = HTMLSession()
print ("session.cookies:\n", session.cookies)

# I also made accounts_cookie_dict and session_cookie_dict    
dash_cookie_dict = {
    'name': 'LBSERVERID', 
    'value': LBSERVERID_dash, 
    'domain': dashboard_domain,
    'path': '/'
} 
# I attempt to manually create the correct cookie and assign it to dash_token, below
dash_token = browser.set_simple_cookie(dash_cookie_dict['name'], dash_cookie_dict['value'], dash_cookie_dict['domain'], dash_cookie_dict['path'])
print("dash_token:", dash_token)
print("cookiejar_token:", cookiejar_token)
print("dash_cookie_dict:\n", dash_cookie_dict)
# *****Attempting to pass the cookie to Requests-HTML below FAILS! :'(
response_obj = session.post(dash_url, cookies=dash_token)
print("response_obj:\n", response_obj)
print("response_obj.cookies from session.post:\n", response_obj.cookies)
response_obj.html.render(sleep=0.5)
print("requests_html, r.html.find('input'):\n", response_obj.html.find('input'))

Terminal Output:

mechanize, response:
 b'<!doctype html><html lang="en"><head><script>!function(e***shortened by OP***</html>'  ### Output in this field tells me the login by mechanize was successful
mechanize, browser.cookiejar:
 <CookieJar[<Cookie LBSERVERID=3**************8 for accounts.example.com/>, <Cookie session=.e***shortened by OP***Y for accounts.example.com/>, <Cookie LBSERVERID=0**************a for dashboard.example.com/>]>
cookiejar_token_str_list:
 ['<CookieJar[<Cookie', 'LBSERVERID=3************8', 'for', 'accounts.example.com/>,', '<Cookie', 'session=.e***shortened by OP***Y', 'for', 'accounts.example.com/>,', '<Cookie', 'LBSERVERID=0**************a', 'for', 'dashboard.example.com/>]>']
accounts 'LBSERVERID': 3************8 for accounts.example.com
accounts 'session': .e***shortened by OP***Y for accounts.example.com
dashboard 'LBSERVERID': 0**************a for dashboard.example.com
session.cookies:
 <RequestsCookieJar[]>
dash_token: None
cookiejar_token: <CookieJar[<Cookie LBSERVERID=3************8 for accounts.example.com/>, <Cookie session=.e***shortened by OP***Y for accounts.example.com/>, <Cookie LBSERVERID=0**************a for dashboard.example.com/>]>
dash_cookie_dict:
 {'name': 'LBSERVERID', 'value': '0**************a', 'domain': 'dashboard.example.com', 'path': '/'}
response_obj:
 <Response [403]>  ### Access denied and it issues a new cookie below
response_obj.cookies from session.post:
 <RequestsCookieJar[<Cookie LBSERVERID=a**************3 for dashboard.example.com/>]>
requests_html, r.html.find('input'):  ### The output below tells me I'm back on the login page
 [<Element 'input' class=('form-control',) id='email' name='email' required='' type='text' value=''>, <Element 'input' class=('form-control',) id='password' name='password' required='' type='password' value=''>, <Element 'input' id='csrf_token' name='csrf_token' type='hidden' value='I***shortened by OP***Y'>]

My Selenium code:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time

login_post_url = "https://accounts.example.com"
internal_url = "https://dashboard.example.com"
username = "user@email.com"
password = "12345678"

driver = webdriver.Safari(executable_path='/usr/bin/safaridriver') # initialize the Safari driver for Mac
driver.get(login_post_url) # head to login page
driver.find_element("id", "email").send_keys(username)
driver.find_element("id", "password").send_keys(password)
driver.find_element("id", "submit_form").click()

WebDriverWait(driver=driver, timeout=10).until( # wait the ready state to be complete
    lambda x: x.execute_script("return document.readyState === 'complete'"))
error_message = "Incorrect username or password."
errors = driver.find_elements(By.CLASS_NAME, "flash-error") # get the errors (if there are)
# print the errors optionally
# for e in errors:
#     print(e.text)
if any(error_message in e.text for e in errors): # if we find that error message within errors, then login is failed
    print("[!] Login failed")
else:
    print("[+] Login successful")

time.sleep(5)
driver.get(internal_url)
time.sleep(5)

element = driver.find_element(By.XPATH, '/html/........./div/p')
scraped_variable = element.get_attribute('innerHTML')
print("scraped_variable:", scraped_variable)
Jeremy
  • 1
  • 2

0 Answers0