Webscraping of news articles on press reader with Python

Question

What I have to do for my job is to do a webscraping of some news articles from the newspaper "Libero" on the website Pressreader Link by using some keywords in a precise period of time. After that, I should store them in a .csv file.

I have talked with a colleague of mine and he suggested I use the following R packages: rvest, Rselenium and httr. Additionally, he also suggested I have a look at this page for having an idea of what to do.

However, the structure of the website (the way the articles are shown) is pretty unusual and I am having a lot of problems in even conceiving the code that I could use... I have looked everywhere on the internet, asked on ChatGPT, and so on, but I haven't found a source that can actually be helpful.

So, after some research, I have found a code, but written for Phyton (for which my knowledge is even less). The code is developed is the following one: Link

I have edited the code according to my needs:

from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
import pyautogui
import os.path

def starttoend(start, end, year, month, day):
    s_year = start[0:4]
    s_mon = start[4:6]
    s_day = start[6:8]
    e_year = end[0:4]
    e_mon = end[4:6]
    e_day = end[6:8]
    ret = []
    for i in range(year.index(s_year), year.index(e_year)+1):
        for j in range(month.index(s_mon), month.index(e_mon)+1):
            if i == year.index(s_year) and j == month.index(s_mon):
                for k in range(day.index(s_day), 31):
                    ret.append(year[i]+month[j]+day[k])
            elif i == year.index(e_year) and j == month.index(e_mon):
                for k in range(0, day.index(e_day)+1):
                    ret.append(year[i]+month[j]+day[k])
            else:
                for k in range(31):
                    ret.append(year[i]+month[j]+day[k])
    return ret


#name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail = 0
dates = []

year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021","2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]


date_tul = starttoend(start, end, year, months, days)

dates.append(date_tul)

index = list(range(25))

#set up to save print as PDF file
settings = {
    "appState": {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local"
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2
    }
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}

#change chrome printing option to minimize work.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')

#traverse through all papers
for i in range(len(papernames)):
    #traverse through dates
    for j in dates[i]:
        count = 1
        dobreak = False
        for k in index:
            if(dobreak):
                break
            try:
                #run driver.
                driver = webdriver.Chrome(r'C:\Users\cmosca\Desktop\python\packages\chromedriver-win32\chromedriver.exe', chrome_options=chrome_options)
                driver.get("https://www.pressreader.com/" + papernames[i] +"/"+j+"/page/1/textview")
                actions1 = webdriver.common.action_chains.ActionChains(driver)
                actions2 = webdriver.common.action_chains.ActionChains(driver)

                WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))

                bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')

                bottom_button.click()

                time.sleep(2)

                all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
                all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')

                news = all_news[k]
                first = True

                article_id = news.get_attribute("article-id")
                print(article_id)
                actions1.move_to_element(news).perform()
                news.click()



                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//article[@aid="'+str(article_id)+'"]')))
                time.sleep(2)
                arti = driver.find_element_by_xpath('//article[@aid="'+str(article_id)+'"]')
                head = arti.find_element_by_tag_name("hgroup")
                time.sleep(1)
                actions2.move_to_element(head).perform()
                time.sleep(1)
                actions2.context_click(head).perform()

                time.sleep(2)
                printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
                printbutton.click()

                time.sleep(1)

                printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
                printtext.click()

                time.sleep(4)
                name = ""
                if(count < 10):
                    name = papernames[i]+"_"+j +"_"+"0"+ str(count)
                    pyautogui.typewrite(papernames[i]+"_"+j +"_"+"0"+ str(count))
                else:
                    name = papernames[i]+ "_"+j +"_" + str(count)
                    pyautogui.typewrite(papernames[i]+ "_"+j +"_" + str(count))


                time.sleep(1)
                pyautogui.press('enter')
                print("saved" + name)

                time.sleep(10)

                count+=1
                cont_fail = 0
                if k == len(all_news)-1:
                    driver.quit()
                    dobreak = True
                    break
                driver.quit()
                time.sleep(1)
            except:
                cont_fail += 1
                print("failed on" + papernames[i]+j+str(k))
                driver.quit()
                if cont_fail > 5:
                    break
                continue

But I keep getting the following error:

C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 85, in <module>
failed onlibero200801010
    driver = webdriver.Chrome(r'C:\Users\cmosca\Desktop\python\packages\chromedriver-win32\chromedriver.exe', chrome_options=chrome_options)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: WebDriver.__init__() got an unexpected keyword argument 'chrome_options'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 156, in <module>
    driver.quit()
    ^^^^^^
NameError: name 'driver' is not defined. Did you mean: 'webdriver'?

Process finished with exit code 1

I hope that one of you will be able to help me. Thanks a lot to everyone in advance.

Have a nice day!

EDIT:

I have tried to follow the suggestion by @Shawn:

except:
               cont_fail += 1
               print("failed on" + papernames[i]+j+str(k))
               driver.quit()
driver = webdriver.Chrome(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe',
                         chrome_options=chrome_options)
if cont_fail > 5:
                   break
               continue

but now what I get is the following error:

C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
 File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 161
   except:
          ^
IndentationError: unindent does not match any outer indentation level

Process finished with exit code 1

Does anyone has an idea on how to fix it? Thanks in advance.

EDIT:

I have tried to do as said and, in addition, I have download the last version of chromedriver and added the path of the folder on OS. This is my current code:


from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome import service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.chrome.service import Service



import pyautogui
import os.path

import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



def starttoend(start, end, year, month, day):
   s_year = start[0:4]
   s_mon = start[4:6]
   s_day = start[6:8]
   e_year = end[0:4]
   e_mon = end[4:6]
   e_day = end[6:8]
   ret = []
   for i in range(year.index(s_year), year.index(e_year) + 1):
       for j in range(month.index(s_mon), month.index(e_mon) + 1):
           if i == year.index(s_year) and j == month.index(s_mon):
               for k in range(day.index(s_day), 31):
                   ret.append(year[i] + month[j] + day[k])
           elif i == year.index(e_year) and j == month.index(e_mon):
               for k in range(0, day.index(e_day) + 1):
                   ret.append(year[i] + month[j] + day[k])
           else:
               for k in range(31):
                   ret.append(year[i] + month[j] + day[k])
   return ret


# name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail: int = 0
dates = []

year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021",
       "2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18",
       "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]

date_tul = starttoend(start, end, year, months, days)

dates.append(date_tul)

index = list(range(25))

# set up to save print as PDF file
settings = {
   "appState": {
       "recentDestinations": [{
           "id": "Save as PDF",
           "origin": "local"
       }],
       "selectedDestinationId": "Save as PDF",
       "version": 2
   }
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}

#service = Service(executable_path=r'""C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe""')

#service_obj = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#service = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#driver = webdriver.Chrome(service = service)


# change chrome printing option to minimize work.
chrome_options: Options = webdriver.ChromeOptions()
#chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
Service(executable_path=r'""C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe""')
driver: WebDriver = webdriver.Chrome(service = service,
                         options=chrome_options)

# traverse through all papers
for i in range(len(papernames)):
   # traverse through dates
   for j in dates[i]:
       count = 1
       dobreak = False
       for k in index:
           if (dobreak):
               break

           try:

               driver.get("https://www.pressreader.com/ita/" + papernames[i] + "/" + j + "/page/1/textview")
               actions1 = webdriver.common.action_chains.ActionChains(driver)
               actions2 = webdriver.common.action_chains.ActionChains(driver)

               WebDriverWait(driver, 60).until(
                   EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))

               bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')

               bottom_button.click()

               time.sleep(2)

               all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
               all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')

               news = all_news[k]
               first = True

               article_id = news.get_attribute("article-id")
               print(article_id)
               actions1.move_to_element(news).perform()
               news.click()

               WebDriverWait(driver, 20).until(
                   EC.presence_of_element_located((By.XPATH, '//article[@aid="' + str(article_id) + '"]')))
               time.sleep(2)
               arti = driver.find_element_by_xpath('//article[@aid="' + str(article_id) + '"]')
               head = arti.find_element_by_tag_name("hgroup")
               time.sleep(1)
               actions2.move_to_element(head).perform()
               time.sleep(1)
               actions2.context_click(head).perform()

               time.sleep(2)
               printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
               printbutton.click()

               time.sleep(1)

               printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
               printtext.click()

               time.sleep(4)
               name = ""
               if (count < 10):
                   name = papernames[i] + "_" + j + "_" + "0" + str(count)
                   pyautogui.typewrite(papernames[i] + "_" + j + "_" + "0" + str(count))
               else:
                   name = papernames[i] + "_" + j + "_" + str(count)
                   pyautogui.typewrite(papernames[i] + "_" + j + "_" + str(count))

               time.sleep(1)
               pyautogui.press('enter')
               print("saved" + name)

               time.sleep(10)

               count += 1
               cont_fail = 0
               if k == len(all_news) - 1:
                   driver.quit()
                   dobreak = True
                   break
               driver.quit()
               time.sleep(1)


           except:

               cont_fail += 1

               print("failed on" + papernames[i] + j + str(k))

               driver.quit()

           if cont_fail > 5:
               break

           continue

And I keep receiving this error:

C:\Users\cmosca\AppData\Local\Programs\Python\Python311\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
 File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 99, in <module>
   driver: WebDriver = webdriver.Chrome(service = service,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
   super().__init__(
 File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
   self.service.path = DriverFinder.get_path(self.service, options)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 36, in get_path
   path = service.path
          ^^^^^^^^^^^^
AttributeError: module 'selenium.webdriver.chrome.service' has no attribute 'path'

Process finished with exit code 1

I have the last version of the Selenium package installed. Can anyone help me? Thank you in advance.

score -1 · Answer 1 · answered Aug 21 '23 at 12:59

driver.quit()

Above line inside the except block doesn't have access to the driver object declared within try block.

Place the below code outside try block, so that it can be accessed within except block.

driver = webdriver.Chrome(r'C:\Users\cmosca\Desktop\python\packages\chromedriver-win32\chromedriver.exe', chrome_options=chrome_options)

Webscraping of news articles on press reader with Python

1 Answers1