How can I navigate through multiple pages of a website with Selenium and Python when the URL does not change between pages and there is no NEXT button?
I already tried Select from Selenium, and it didn't work. The URL is dynamic, and reaching the results requires many clicks:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import Select
import pandas as pd
import re
import math
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
#SET UP DRIVER
driver = webdriver.Firefox()
extension_path = "/home/work04/.mozilla/firefox/vyrx65nr.default-release/extensions/{e58d3966-3d76-4cd9-8552-1582fbc800c1}.xpi"
driver.install_addon(extension_path, temporary=True)
driver.maximize_window()
driver.get("URL WEBSITE")
sleep(4)
#CLICKS
driver.find_element(By.XPATH, '//*[@id="page"]/div[4]/div[2]/button').click() #Cookie
sleep(2)
driver.find_element(By.NAME, 'tipoSituacao').click()
sleep(1)
driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/section[2]/div/div/div/article/div[2]/div/div/form/div/div[2]/div[3]/div/select/option[2]').click()
sleep(1)
driver.find_element(By.NAME, 'situacao').click()
sleep(1)
driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/section[2]/div/div/div/article/div[2]/div/div/form/div/div[2]/div[4]/div/select/option[1]').click()
sleep(1)
driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/section[2]/div/div/div/article/div[2]/div/div/form/div/div[4]/div[2]/button').click()
sleep(2)
#WAIT
element = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/section[2]/div/div/div/div[2]/div[1]/div/div/div[2]/div')))
assert element.is_displayed()
#OBJECT HTML
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
sleep(10)
data_list = []
#Data in DR
dr = soup.find_all(class_=re.compile('CLASS NAME I WANT'))
#DATA
for item in dr:
    data = {}
    data['xxx'] = item.find('h4').get_text()
    col_md_4_divs = item.find_all(class_='col-md-4')
    data['xxx'] = col_md_4_divs[0].b.next_sibling.strip()
    # data['xx'] = col_md_4_divs[0].find(string=re.compile('([A-Z]{2})')).get_text().strip()
    data['xxxxxx'] = col_md_4_divs[3].b.next_sibling.strip()
    data['xxxxxx'] = col_md_4_divs[3].find_next(class_='col-md').b.next_sibling.strip()
    especialidades_div = str(item.find(class_='col-md-12', style='display: flex;'))
    if especialidades_div.find('</span>') != -1:
        especialidades = especialidades_div
        print(f"esp {especialidades}")
        data['especialidades'] = especialidades
    endereco_div = item.find_all(class_='col-md-7')
    data['endereco'] = endereco_div[0].b.next_sibling.strip()
    telefone_div = item.find_all(class_='row telefone')
    data['telefone'] = telefone_div[0].b.next_sibling.strip()
    data_list.append(data)
#CSV
df = pd.DataFrame(data_list)
csv_file_path = '------'
df.to_csv(csv_file_path)
print(df)
driver.quit()
With this code I can get the data, but only from the first page.
Here is the pagination HTML. I need to extract 63,846 pages, but the URL never changes:
<div class="paginationjs-pages">
<ul>
<li class="paginationjs-page J-paginationjs-page active" data-num="1">
<a>
1
</a>
</li>
<li class="paginationjs-page J-paginationjs-page" data-num="2">
<a href="">
2
</a>
</li>
<li class="paginationjs-page J-paginationjs-page" data-num="3">
<a href="">
3
</a>
</li>
<li class="paginationjs-page J-paginationjs-page" data-num="4">
<a href="">
4
</a>
</li>
<li class="paginationjs-page J-paginationjs-page" data-num="5">
<a href="">
5
</a>
</li>
<li class="paginationjs-ellipsis disabled">
<a>
...
</a>
</li>
<li class="paginationjs-page paginationjs-last J-paginationjs-page" data-num="63846">
<a href="">
63846
</a>
</li>
</ul>
</div>
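What I'm thinking of trying, based on the data-num attributes in that HTML, is to click the <li> for the next page number in a loop and re-scrape after each click. This is only a sketch: scrape_current_page() is a hypothetical helper that would wrap the BeautifulSoup extraction from my code above, and I'm assuming paginationjs always renders the <li> for the page right after the active one. Would something like this be the right approach?

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

TOTAL_PAGES = 63846  # taken from the last <li data-num="63846">

def scrape_current_page(driver):
    # hypothetical helper: same BeautifulSoup extraction as in my code above
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    rows = []
    # ... build the dicts exactly like in the loop over `dr` above ...
    return rows

all_rows = scrape_current_page(driver)  # page 1 is already loaded

for page in range(2, TOTAL_PAGES + 1):
    # the <li> for the next page should be in the DOM, since paginationjs
    # renders the page numbers adjacent to the active one
    next_link = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, f'li.paginationjs-page[data-num="{page}"] a')
        )
    )
    driver.execute_script("arguments[0].click();", next_link)  # JS click to avoid overlay issues

    # wait until that page number becomes the active one
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, f'li.paginationjs-page.active[data-num="{page}"]')
        )
    )
    all_rows.extend(scrape_current_page(driver))

I suspect I would also need to wait for the results container itself to refresh (for example with EC.staleness_of on an element captured before the click), since the pagination bar might update before the new results finish rendering.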