What I have to do for my job is to do a webscraping of some news articles from the newspaper "Libero" on the website Pressreader Link by using some keywords in a precise period of time. After that, I should store them in a .csv file.
I have talked with a colleague of mine and he suggested I use the following R packages: rvest, Rselenium and httr. Additionally, he also suggested I have a look at this page for having an idea of what to do.
However, the structure of the website (the way the articles are shown) is pretty unusual and I am having a lot of problems in even conceiving the code that I could use... I have looked everywhere on the internet, asked on ChatGPT, and so on, but I haven't found a source that can actually be helpful.
So, after some research, I have found a code, but written for Phyton (for which my knowledge is even less). The code is developed is the following one: Link
I have edited the code according to my needs:
from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
import pyautogui
import os.path
def starttoend(start, end, year, month, day):
s_year = start[0:4]
s_mon = start[4:6]
s_day = start[6:8]
e_year = end[0:4]
e_mon = end[4:6]
e_day = end[6:8]
ret = []
for i in range(year.index(s_year), year.index(e_year)+1):
for j in range(month.index(s_mon), month.index(e_mon)+1):
if i == year.index(s_year) and j == month.index(s_mon):
for k in range(day.index(s_day), 31):
ret.append(year[i]+month[j]+day[k])
elif i == year.index(e_year) and j == month.index(e_mon):
for k in range(0, day.index(e_day)+1):
ret.append(year[i]+month[j]+day[k])
else:
for k in range(31):
ret.append(year[i]+month[j]+day[k])
return ret
#name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail = 0
dates = []
year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021","2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]
date_tul = starttoend(start, end, year, months, days)
dates.append(date_tul)
index = list(range(25))
#set up to save print as PDF file
settings = {
"appState": {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local"
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}
#change chrome printing option to minimize work.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
#traverse through all papers
for i in range(len(papernames)):
#traverse through dates
for j in dates[i]:
count = 1
dobreak = False
for k in index:
if(dobreak):
break
try:
#run driver.
driver = webdriver.Chrome(r'C:\Users\cmosca\Desktop\python\packages\chromedriver-win32\chromedriver.exe', chrome_options=chrome_options)
driver.get("https://www.pressreader.com/" + papernames[i] +"/"+j+"/page/1/textview")
actions1 = webdriver.common.action_chains.ActionChains(driver)
actions2 = webdriver.common.action_chains.ActionChains(driver)
WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))
bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')
bottom_button.click()
time.sleep(2)
all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')
news = all_news[k]
first = True
article_id = news.get_attribute("article-id")
print(article_id)
actions1.move_to_element(news).perform()
news.click()
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//article[@aid="'+str(article_id)+'"]')))
time.sleep(2)
arti = driver.find_element_by_xpath('//article[@aid="'+str(article_id)+'"]')
head = arti.find_element_by_tag_name("hgroup")
time.sleep(1)
actions2.move_to_element(head).perform()
time.sleep(1)
actions2.context_click(head).perform()
time.sleep(2)
printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
printbutton.click()
time.sleep(1)
printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
printtext.click()
time.sleep(4)
name = ""
if(count < 10):
name = papernames[i]+"_"+j +"_"+"0"+ str(count)
pyautogui.typewrite(papernames[i]+"_"+j +"_"+"0"+ str(count))
else:
name = papernames[i]+ "_"+j +"_" + str(count)
pyautogui.typewrite(papernames[i]+ "_"+j +"_" + str(count))
time.sleep(1)
pyautogui.press('enter')
print("saved" + name)
time.sleep(10)
count+=1
cont_fail = 0
if k == len(all_news)-1:
driver.quit()
dobreak = True
break
driver.quit()
time.sleep(1)
except:
cont_fail += 1
print("failed on" + papernames[i]+j+str(k))
driver.quit()
if cont_fail > 5:
break
continue
But I keep getting the following error:
C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py"
Traceback (most recent call last):
File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 85, in <module>
failed onlibero200801010
driver = webdriver.Chrome(r'C:\Users\cmosca\Desktop\python\packages\chromedriver-win32\chromedriver.exe', chrome_options=chrome_options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: WebDriver.__init__() got an unexpected keyword argument 'chrome_options'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 156, in <module>
driver.quit()
^^^^^^
NameError: name 'driver' is not defined. Did you mean: 'webdriver'?
Process finished with exit code 1
I hope that one of you will be able to help me. Thanks a lot to everyone in advance.
Have a nice day!
EDIT:
I have tried to follow the suggestion by @Shawn:
except:
cont_fail += 1
print("failed on" + papernames[i]+j+str(k))
driver.quit()
driver = webdriver.Chrome(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe',
chrome_options=chrome_options)
if cont_fail > 5:
break
continue
but now what I get is the following error:
C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py"
File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 161
except:
^
IndentationError: unindent does not match any outer indentation level
Process finished with exit code 1
Does anyone has an idea on how to fix it? Thanks in advance.
EDIT:
I have tried to do as said and, in addition, I have download the last version of chromedriver and added the path of the folder on OS. This is my current code:
from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome import service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.chrome.service import Service
import pyautogui
import os.path
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def starttoend(start, end, year, month, day):
s_year = start[0:4]
s_mon = start[4:6]
s_day = start[6:8]
e_year = end[0:4]
e_mon = end[4:6]
e_day = end[6:8]
ret = []
for i in range(year.index(s_year), year.index(e_year) + 1):
for j in range(month.index(s_mon), month.index(e_mon) + 1):
if i == year.index(s_year) and j == month.index(s_mon):
for k in range(day.index(s_day), 31):
ret.append(year[i] + month[j] + day[k])
elif i == year.index(e_year) and j == month.index(e_mon):
for k in range(0, day.index(e_day) + 1):
ret.append(year[i] + month[j] + day[k])
else:
for k in range(31):
ret.append(year[i] + month[j] + day[k])
return ret
# name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail: int = 0
dates = []
year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021",
"2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]
date_tul = starttoend(start, end, year, months, days)
dates.append(date_tul)
index = list(range(25))
# set up to save print as PDF file
settings = {
"appState": {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local"
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}
#service = Service(executable_path=r'""C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe""')
#service_obj = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')
#service = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')
#driver = webdriver.Chrome(service = service)
# change chrome printing option to minimize work.
chrome_options: Options = webdriver.ChromeOptions()
#chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
Service(executable_path=r'""C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe""')
driver: WebDriver = webdriver.Chrome(service = service,
options=chrome_options)
# traverse through all papers
for i in range(len(papernames)):
# traverse through dates
for j in dates[i]:
count = 1
dobreak = False
for k in index:
if (dobreak):
break
try:
driver.get("https://www.pressreader.com/ita/" + papernames[i] + "/" + j + "/page/1/textview")
actions1 = webdriver.common.action_chains.ActionChains(driver)
actions2 = webdriver.common.action_chains.ActionChains(driver)
WebDriverWait(driver, 60).until(
EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))
bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')
bottom_button.click()
time.sleep(2)
all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')
news = all_news[k]
first = True
article_id = news.get_attribute("article-id")
print(article_id)
actions1.move_to_element(news).perform()
news.click()
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.XPATH, '//article[@aid="' + str(article_id) + '"]')))
time.sleep(2)
arti = driver.find_element_by_xpath('//article[@aid="' + str(article_id) + '"]')
head = arti.find_element_by_tag_name("hgroup")
time.sleep(1)
actions2.move_to_element(head).perform()
time.sleep(1)
actions2.context_click(head).perform()
time.sleep(2)
printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
printbutton.click()
time.sleep(1)
printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
printtext.click()
time.sleep(4)
name = ""
if (count < 10):
name = papernames[i] + "_" + j + "_" + "0" + str(count)
pyautogui.typewrite(papernames[i] + "_" + j + "_" + "0" + str(count))
else:
name = papernames[i] + "_" + j + "_" + str(count)
pyautogui.typewrite(papernames[i] + "_" + j + "_" + str(count))
time.sleep(1)
pyautogui.press('enter')
print("saved" + name)
time.sleep(10)
count += 1
cont_fail = 0
if k == len(all_news) - 1:
driver.quit()
dobreak = True
break
driver.quit()
time.sleep(1)
except:
cont_fail += 1
print("failed on" + papernames[i] + j + str(k))
driver.quit()
if cont_fail > 5:
break
continue
And I keep receiving this error:
C:\Users\cmosca\AppData\Local\Programs\Python\Python311\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py"
Traceback (most recent call last):
File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 99, in <module>
driver: WebDriver = webdriver.Chrome(service = service,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
super().__init__(
File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
self.service.path = DriverFinder.get_path(self.service, options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 36, in get_path
path = service.path
^^^^^^^^^^^^
AttributeError: module 'selenium.webdriver.chrome.service' has no attribute 'path'
Process finished with exit code 1
I have the last version of the Selenium package installed. Can anyone help me? Thank you in advance.