0

I haven't scraped anything in at least a couple of years, so I'm hoping for pointers on how to scrape the roughly 260k-row table linked below. It seems to be an irregular kind of table, apparently generated by ESRI/ArcGIS (I see 'esri' when inspecting the elements). I'm not sure what type of table it is, or whether rvest is even an appropriate library for this page. I tried SelectorGadget to find the name of the table element, but was unsuccessful.

https://gispublic.waterboards.ca.gov/portal/home/item.html?id=2d6b184566c740c988b9a2f1b2a8d4a3#data

library(rvest)

# Download and parse the HTML of the item page.
html <- read_html(
  "https://gispublic.waterboards.ca.gov/portal/home/item.html?id=2d6b184566c740c988b9a2f1b2a8d4a3#data"
)

# The selector string below is a placeholder: a working CSS selector / XPath
# for the table element has not been found yet.
table <- html_table(
  html_element(html, "unable to find the CSS Selector/X Copy path")
)

Am I on the right track at all?

dbo
  • 1,174
  • 1
  • 11
  • 19
  • See this on how to read data from ArcGIS instead of scraping: https://stackoverflow.com/questions/50161492/how-do-i-scrape-data-from-an-arcgis-online-map – harre Apr 27 '23 at 20:20
  • 1
    It is probably easier to use the REST API than to try to scrape the data from the web frontend. See https://gispublic.waterboards.ca.gov/portalserver/sdk/rest/index.html#/Get_started/02ss00000048000000/ for an overview. In case you are willing to use Python then there is a package that offers download functionality: https://developers.arcgis.com/python/guide/download-data. – Marijn Apr 27 '23 at 20:20
  • And here is the link to the GAMA REST API: https://gispublic.waterboards.ca.gov/portalserver/rest/services/GAMA/All_Wells_on_the_GAMA_Groundwater_Information_System/MapServer/0 – harre Apr 27 '23 at 20:21
  • I cannot collect all in one go, so you'll probably need to use the fields ResultRecordCount and ResultOffset and get the data over several calls :) – harre Apr 27 '23 at 20:51

1 Answer

0

I have been able to extract information from the table with the following code :

library(RSelenium)

# Start a standalone Firefox Selenium server in Docker, publishing container
# port 4444 on local port 4446. system() is used rather than shell() because
# shell() is only available on Windows builds of R.
system("docker run -d -p 4446:4444 selenium/standalone-firefox")
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4446L,
  browserName = "firefox"
)

url <- "https://gispublic.waterboards.ca.gov/portal/home/item.html?id=2d6b184566c740c988b9a2f1b2a8d4a3#data"
remDr$open()
remDr$navigate(url)

# Fixed wait before looking up the grid element.
# NOTE(review): presumably the grid is filled in client-side after page load;
# 40 seconds is an empirical value, adjust for slower connections.
Sys.sleep(40)

# "dgrid_0" is the id of the element that holds the data table.
web_Obj <- remDr$findElement("id", "dgrid_0")

# getElementText() returns a list; unlist() it so strsplit() receives a plain
# character vector. Each non-empty line of the grid's text becomes one entry
# (header labels first, then cell values).
text <- strsplit(unlist(web_Obj$getElementText()), "\n")[[1]]
text <- text[text != ""]

# Release the browser session now that the text has been captured. The Docker
# container keeps running and has to be stopped separately.
remDr$close()

# head() returns at most 100 entries and, unlike text[1:100], does not pad the
# result with NA when fewer than 100 entries were captured.
head(text, 100)

  [1] "DATASET NAME"                               "WELL CATEGORY"                              "DATA SOURCE"                               
  [4] "WELL ID"                                    "LATITUDE"                                   "LONGITUDE"                                 
  [7] "WELL DEPTH (FT)"                            "TOP DEPTH OF SCREEN (FT)"                   "BOTTOM DEPTH OF SCREEN (FT)"               
 [10] "COUNTY"                                     "DWR BASIN"                                  "REGIONAL BOARD"                            
 [13] "SENATE DISTRICT"                            "HVA"                                        "GAMA STUDY AREA"                           
 [16] "ASSEMBLY DISTRICT"                          "GSA"                                        "DWR REGION"                                
 [19] "globalid"                                   " "                                          "OBJECTID"                                  
 [22] "WB_ILRP"                                    "DOMESTIC"                                   "GeoTracker"                                
 [25] "AGW080010456-WELL_#25"                      "36.953669"                                  "-120.115053"                               
 [28] "MADERA"                                     "SAN JOAQUIN VALLEY - MADERA (5-022.06)"     "CENTRAL VALLEY"                            
 [31] "12"                                         "SAN JOAQUIN VALLEY REGION NORTH"            "MADERA/CHOWCHILLA"                         
 [34] "5"                                          "San Joaquin River"                          "{8DF0AE26-E11D-40C4-897F-960A9D1C6643}"    
 [37] "1"                                          "WB_ILRP"                                    "DOMESTIC"                                  
 [40] "GeoTracker"                                 "AGW080010470-SHOP"                          "37.042592"                                 
 [43] "-120.470131"                                "MADERA"                                     "SAN JOAQUIN VALLEY - CHOWCHILLA (5-022.05)"
 [46] "CENTRAL VALLEY"                             "12"                                         "MADERA/CHOWCHILLA"                         
 [49] "5"                                          "San Joaquin River"                          "{8386980C-BA92-4E61-9B78-37263E9068FB}"    
 [52] "2"                                          "WB_ILRP"                                    "DOMESTIC"                                  
 [55] "GeoTracker"                                 "AGW080010512-JB_HOME"                       "36.930254"                                 
 [58] "-120.076970"                                "MADERA"                                     "SAN JOAQUIN VALLEY - MADERA (5-022.06)"    
 [61] "CENTRAL VALLEY"                             "12"                                         "SAN JOAQUIN VALLEY REGION NORTH"           
 [64] "MADERA/CHOWCHILLA"                          "5"                                          "San Joaquin River"                         
 [67] "{220B3E77-705B-4156-8266-0DBAFBFC563C}"     "3"                                          "WB_ILRP"                                   
 [70] "DOMESTIC"                                   "GeoTracker"                                 "AGW080010551-SHOP"                         
 [73] "37.076500"                                  "-120.173900"                                "MADERA"                                    
 [76] "SAN JOAQUIN VALLEY - MADERA (5-022.06)"     "CENTRAL VALLEY"                             "12"                                        
 [79] "SAN JOAQUIN VALLEY REGION NORTH"            "MADERA/CHOWCHILLA"                          "5"                                         
 [82] "San Joaquin River"                          "{4F46FFB7-7135-4629-8817-0AE7F23A7670}"     "4"                                         
 [85] "WB_ILRP"                                    "DOMESTIC"                                   "GeoTracker"                                
 [88] "AGW080010702-HOUSE WELL"                    "36.931400"                                  "-120.082200"                               
 [91] "MADERA"                                     "SAN JOAQUIN VALLEY - MADERA (5-022.06)"     "CENTRAL VALLEY"                            
 [94] "12"                                         "SAN JOAQUIN VALLEY REGION NORTH"            "MADERA/CHOWCHILLA"                         
 [97] "5"                                          "San Joaquin River"                          "{0B3DD4D9-A516-42E7-B869-FED986A4FD50}"    
[100] "5"

I also have been able to extract information from the table with the following code :

library(RDCOMClient)

# Drive an Internet Explorer instance through COM automation and make its
# window visible.
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[["Visible"]] <- TRUE

# `url` is not assigned in this snippet; it must already exist in the session.
IEApp$Navigate(url)

# Fixed 40-second wait before reading the document.
Sys.sleep(40)

# Look up the grid element by id and read its inner text.
doc <- IEApp$Document()
web_Obj <- doc$getElementById("dgrid_0")

# Split the text on Windows line endings and discard empty entries.
text <- strsplit(web_Obj$innerText(), "\r\n")[[1]]
text <- text[nzchar(text)]

# Show the first 100 entries.
text[seq_len(100)]

[1] "DATASET NAME"                               "WELL CATEGORY"                              "DATA SOURCE"                               
  [4] "WELL ID"                                    "LATITUDE"                                   "LONGITUDE"                                 
  [7] "WELL DEPTH (FT)"                            "TOP DEPTH OF SCREEN (FT)"                   "BOTTOM DEPTH OF SCREEN (FT)"               
 [10] "COUNTY"                                     "DWR BASIN"                                  "REGIONAL BOARD"                            
 [13] "SENATE DISTRICT"                            "HVA"                                        "GAMA STUDY AREA"                           
 [16] "ASSEMBLY DISTRICT"                          "GSA"                                        "DWR REGION"                                
 [19] "globalid"                                   " "                                          "OBJECTID"                                  
 [22] "WB_ILRP"                                    "DOMESTIC"                                   "GeoTracker"                                
 [25] "AGW080010456-WELL_#25"                      "36.953669"                                  "-120.115053"                               
 [28] "MADERA"                                     "SAN JOAQUIN VALLEY - MADERA (5-022.06)"     "CENTRAL VALLEY"                            
 [31] "12"                                         "SAN JOAQUIN VALLEY REGION NORTH"            "MADERA/CHOWCHILLA"                         
 [34] "5"                                          "San Joaquin River"                          "{8DF0AE26-E11D-40C4-897F-960A9D1C6643}"    
 [37] "1"                                          "WB_ILRP"                                    "DOMESTIC"                                  
 [40] "GeoTracker"                                 "AGW080010470-SHOP"                          "37.042592"                                 
 [43] "-120.470131"                                "MADERA"                                     "SAN JOAQUIN VALLEY - CHOWCHILLA (5-022.05)"
 [46] "CENTRAL VALLEY"                             "12"                                         "MADERA/CHOWCHILLA"                         
 [49] "5"                                          "San Joaquin River"                          "{8386980C-BA92-4E61-9B78-37263E9068FB}"    
 [52] "2"                                          "WB_ILRP"                                    "DOMESTIC"                                  
 [55] "GeoTracker"                                 "AGW080010512-JB_HOME"                       "36.930254"                                 
 [58] "-120.076970"                                "MADERA"                                     "SAN JOAQUIN VALLEY - MADERA (5-022.06)"    
 [61] "CENTRAL VALLEY"                             "12"                                         "SAN JOAQUIN VALLEY REGION NORTH"           
 [64] "MADERA/CHOWCHILLA"                          "5"                                          "San Joaquin River"                         
 [67] "{220B3E77-705B-4156-8266-0DBAFBFC563C}"     "3"                                          "WB_ILRP"                                   
 [70] "DOMESTIC"                                   "GeoTracker"                                 "AGW080010551-SHOP"                         
 [73] "37.076500"                                  "-120.173900"                                "MADERA"                                    
 [76] "SAN JOAQUIN VALLEY - MADERA (5-022.06)"     "CENTRAL VALLEY"                             "12"                                        
 [79] "SAN JOAQUIN VALLEY REGION NORTH"            "MADERA/CHOWCHILLA"                          "5"                                         
 [82] "San Joaquin River"                          "{4F46FFB7-7135-4629-8817-0AE7F23A7670}"     "4"                                         
 [85] "WB_ILRP"                                    "DOMESTIC"                                   "GeoTracker"                                
 [88] "AGW080010702-HOUSE WELL"                    "36.931400"                                  "-120.082200"                               
 [91] "MADERA"                                     "SAN JOAQUIN VALLEY - MADERA (5-022.06)"     "CENTRAL VALLEY"                            
 [94] "12"                                         "SAN JOAQUIN VALLEY REGION NORTH"            "MADERA/CHOWCHILLA"                         
 [97] "5"                                          "San Joaquin River"                          "{0B3DD4D9-A516-42E7-B869-FED986A4FD50}"    
[100] "5"
Emmanuel Hamel
  • 1,769
  • 7
  • 19