
I am using RSelenium to submit forms on the UN treaty collection website and save the results. Everything works fine, except that the names of the treaties are truncated in my final table. Is this because there is a limit on the number of characters that readHTMLTable can read, or am I doing something else wrong?

Here is a (hopefully) reproducible example:

###
###   RSelenium scraping of UN treaty collection
###

# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html

rm(list=ls())

###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")

library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")


# Start Selenium Server --------------------------------------------------------

checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## connect server
Sys.sleep(5) 
#remDrv$getStatus()  ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------

## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## check out what is of interest:

## additional attributes
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement()  ## match all the elements
Sys.sleep(5) 

## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn")    ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5) 

## results per page more in detail
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement()  ## number of pages
Sys.sleep(5) 

## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount")    ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")

## find first element of interest: text search
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")    ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5) 

df_all <- data.frame()
###### need to run search for multiple countries
country_list <- c("Morocco", "Italy", "France")

for (i in country_list){
  Sys.sleep(5) 
  ## define keys to search
  keys <- paste(i, "Agreement promotion investment", sep=" ")

  ## search for files, one by one and save results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20) 

  # check if the table is there
  doc<-htmlParse(remDrv$getPageSource()[[1]])  ## now parse html so that we can search it
  tables = readHTMLTable(doc)  ## extract all tables
  #names(tables)   ## names of all tables

  tableexists <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
  yes_no <- all(tableexists==F)
  yes_no

  if(yes_no==FALSE){
  ## copy table
  table <- remDrv$findElement(using = 'xpath', "//*[@id = 'ctl00_ContentPlaceHolder1_dgSearch']")
  table$getElementAttribute("class")
  table$getElementAttribute("type")
  table$getElementAttribute("id")

  ## extract table of interest
  tabledat <-readHTMLTable(doc, stringsAsFactors = F , skip.rows=c(1))[[37]]
  df_all <- rbind(tabledat, df_all)
  }else{print("caccadicane")}
}

write.csv(df_all[,-(7:ncol(df_all))], ("un_bits.csv"))  

The result is:

       V1                                     V2          V3         V4         V5        V6
1 I-42051 Agreement between the Government of... See Details 08/07/1996 27/07/2000 Bilateral
2 I-35582 Agreement between the Government of... See Details 11/10/1995 22/06/1997 Bilateral
3 I-35481 Agreement between the Government of... See Details 30/11/1995 30/05/1997 Bilateral
4 I-23169 Agreement concerning the establishm... See Details 28/06/1980 28/06/1980 Bilateral
5 I-29086 Exchange of notes constituting an a... See Details 12/08/1985 12/08/1985 Bilateral
6 I-43258 Agreement on the promotion and prot... See Details 27/01/1999 08/05/2001 Bilateral

Why are the strings in V2 truncated?

1 Answer


OK, after some time I found out that even if there is a limit on the readHTMLTable command, it is not the reason why the text in this example is truncated. By inspecting the HTML more carefully, I found that the visible text is already truncated in the page itself, while the full names are stored in the "title" attribute of each cell.
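
A quick way to confirm this (just a minimal sketch, assuming doc has already been parsed with htmlParse(remDrv$getPageSource()[[1]]) as in the question) is to read the title attributes of the name column directly from the parsed page:

## sketch: the "title" attribute of the second column holds the untruncated names
full_names <- xpathSApply(doc,
    "//table[@id='ctl00_ContentPlaceHolder1_dgSearch']//tr/td[2]",
    xmlGetAttr, "title")
head(full_names)   ## header cells without a title attribute come back as NULL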

The solution is therefore to read each "title" attribute to get the full names of the agreements. Below is the code, if anyone is interested, with some other additions.

###
###   RSelenium scraping of UN treaty collection
###

# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html

rm(list=ls())

###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")

library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")


# Start Selenium Server --------------------------------------------------------

checkForServer()  ## check if the server is there
startServer() ## start the selenium server
remDrv <- remoteDriver() ## connect server
remDrv$open() ## connect server
Sys.sleep(5) 
#remDrv$getStatus()  ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------

## go to page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## check out what is of interest:

## additional attributes
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$getElementAttribute("class")
webElem1$getElementAttribute("type")
webElem1$getElementAttribute("id")
webElem1$clickElement()  ## match all the elements
Sys.sleep(5) 

## results per page
webElem2 <- remDrv$findElement(using = 'css selector', "select.basicPullDn")    ## . denotes class
webElem2$getElementAttribute("class")
webElem2$getElementAttribute("type")
webElem2$getElementAttribute("id")
Sys.sleep(5) 

## results per page more in detail
webElem3 <- remDrv$findElement(using = 'xpath', "//*/select[@class = 'basicPullDn']/option[@value = '500']")
webElem3$getElementAttribute("class")
webElem3$getElementAttribute("type")
webElem3$getElementAttribute("id")
webElem3$clickElement()  ## number of pages
Sys.sleep(5) 

## element to get total count
webElem5 <- remDrv$findElement(using = 'css selector', "span.RecordCount")    ## . denotes class
webElem5$getElementAttribute("class")
webElem5$getElementAttribute("type")
webElem5$getElementAttribute("id")

## find first element of interest: text search
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")    ## . denotes class
webElem0$getElementAttribute("class")
webElem0$getElementAttribute("type")
webElem0$getElementAttribute("id")
Sys.sleep(5) 

df_all <- data.frame()
###### need to run search for multiple countries
#country_list <- c("Morocco", "Italy", "France")
names <- read.csv("participants_clean.csv")
country_list <- names$names
current_search  <- length(country_list)

for (i in country_list){

  #i <- "Morocco" 
  print("-------------------------")
  print("-------------------------")
  text <- paste("Still", current_search, "searches to do... ", sep=" ")
  print(text)
  text0 <- paste("Now looking for treaties signed by...  ", i , " ----------------------->>" , sep=" ")
  print(text0)
  Sys.sleep(5) 
  ## define keys to search
  keys <- paste(i, "Agreement promotion investment", sep=" ")

  ## search for files, one by one and save results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20) 

  # check if the table is there
  doc<-htmlParse(remDrv$getPageSource()[[1]])  ## now parse html so that we can search it
  tables = readHTMLTable(doc)  ## extract all tables
  #names(tables)   ## names of all tables

  tableexists <- grepl("ctl00_ContentPlaceHolder1_dgSearch", names(tables))
  yes_no <- all(tableexists==F)
  yes_no

  if(yes_no==FALSE){
  ## copy table
  table <- remDrv$findElement(using = 'xpath', "//*[@id = 'ctl00_ContentPlaceHolder1_dgSearch']")
  table$getElementAttribute("class")
  table$getElementAttribute("type")
  table$getElementAttribute("id")

  ## extract table of interest
  tabledat <-readHTMLTable(doc, stringsAsFactors = F )[[37]]
  treatfou <-nrow(tabledat)
  text1 <- paste("Amazing, I just found", treatfou - 1, "treaties !!", sep=" ")
  print(text1)

  ## now need to extract the real names of the treaties: start from 2 to treatfound
  names_new <- vector(mode="character",length = treatfou)
  urls <- vector(mode="character",length = treatfou)

  for (jj in 2:treatfou) {
    cell_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[2]", sep="")
    cell_table <- remDrv$findElement(using = 'xpath', cell_add)
    names_new[[jj]] <- as.character(cell_table$getElementAttribute("title"))
  }

  ## now substitute in the real titles:
  names_new <- as.vector(unlist(names_new))
  tabledat$title <- names_new
  tabledat$party <- i

  ## get the link
  for (jj in 2:treatfou) {
    url_add <- paste("//*[@id='ctl00_ContentPlaceHolder1_dgSearch']/tbody/tr[", jj, "]/td[3]/a", sep="")
    url_add <- remDrv$findElement(using = 'xpath', url_add)
    gio <- unlist(url_add$getElementAttribute("href"))
    gio <- gsub("javascript:void%20window.open\\('","",gio)   ## need to excape the parenthesis with \\
    gio <- gsub("\\'.*", "", gio)  ## cancel everything after '
    urls[[jj]] <- paste0("https://treaties.un.org/Pages/",gio)
  }
  tabledat$url <-urls


  df_all <- rbind(tabledat[-(1),], df_all)
  }else{print("Too bad, there is nothing, I'll try with the next one :) " )}
  current_search <- current_search -1
}

write.csv(df_all[,-(7:10)], ("un_bits.csv"))
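
As a side note, the two per-row findElement() loops are the slow part of the script; in principle the same title and href cleanup can be done in one pass on the doc that is already parsed inside the loop. This is only a sketch of that idea (reusing the same table id and the same gsub cleanup), not part of the tested script above:

## sketch: collect all detail links from the parsed doc in one go
raw_hrefs <- xpathSApply(doc,
    "//table[@id='ctl00_ContentPlaceHolder1_dgSearch']//tr/td[3]/a",
    xmlGetAttr, "href")
## strip the javascript:void wrapper, as in the loop above
clean_hrefs <- sub("'.*$", "", gsub("javascript:void%20window.open\\('", "", raw_hrefs))
urls_all <- paste0("https://treaties.un.org/Pages/", clean_hrefs)
## note: the header row has no link, so check lengths before combining with the table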