
I am having trouble getting this code to work. I am trying to download documents from the FAO website at the URL below. Can someone please help me? I am on macOS and my Chrome version is 106.0.5249.103 (Official Build) (x86_64).

library(rvest)
library(httr2)
library(RSelenium)
library(stringr)

url <- "https://www.fao.org/faolex/country-profiles/general-profile/en/?iso3=NAM"
base_url <- "https://www.fao.org"

# collect the links to the individual document pages from the country profile
ids <- read_html(url) %>%
  html_elements(".doclink > a") %>%
  html_attr("href") %>%
  paste0(base_url, .)
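
# Optional sanity check (just a debugging step I added): print the first
# few collected URLs to confirm they are absolute links into FAOLEX
head(ids)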

grab_link <- function(page_url, s_ctl) {
  # load the target url
  s_ctl$navigate(page_url)
  
  # wait for the page load to complete
  Sys.sleep(4)
  
  # getPageSource returns a list with html as the first element
  page <- s_ctl$getPageSource()[[1]]
  
  # parse the rendered HTML and pull the document link from the result's
  # title anchor; the target file URL sits in the link's `url` query
  # parameter (this assumes one matching link per page, since
  # httr2::url_parse() expects a single URL)
  read_html(page) %>%
    html_elements(".item-title > a") %>%
    html_attr("href") %>%
    url_parse() %>%
    purrr::pluck("query", "url")
}


# start a Selenium server and a Chrome client matching my browser version
selenium_driver <- rsDriver(
  browser = "chrome",
  chromever = "106.0.5249.61",
  port = 4444L,
  verbose = FALSE,
  check = FALSE
)

# control the client browser
ctl_browser <- selenium_driver[["client"]]
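
# Optional: try the helper on a single page first, to confirm it returns
# a document URL before mapping over everything
grab_link(ids[[1]], ctl_browser)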

# visit each document page and pull out the direct file link
links <- purrr::map_chr(ids, grab_link, s_ctl = ctl_browser)


# Close the browser and stop the Selenium server
ctl_browser$close()
selenium_driver[["server"]]$stop()
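
If the links all resolve, my plan was then to download each file along these lines (just a sketch, assuming the resolved URLs point directly at downloadable files; "fao_docs" is a placeholder folder name):

# create a local folder and save each resolved document into it
dir.create("fao_docs", showWarnings = FALSE)
for (link in links) {
  dest <- file.path("fao_docs", basename(link))
  download.file(link, destfile = dest, mode = "wb")
  Sys.sleep(1)  # pause briefly between requests to be polite to the server
}
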
  • Thanks @KJ, but I don't understand what you mean. Would I need to do this for each of the subpages, or do I substitute the first link with the one you suggested? It's my first time trying to scrape. – Biandri Oct 11 '22 at 13:25

0 Answers