4

I would like to get the output of a POST request, made with httr, to the following site:

http://www.e-grunt.ba

You can see the submission form when you click "ZK Ulošci".

I would like to send a POST request to that form and get the output. For example, you can select anything from the drop-down lists, enter 1 in the field "Broj Uloška", and then click "Traži".

Here is my try:

library(httr)
library(tidyverse)
library(rvest)

    # Send the search-form fields straight to home.jsf as a form-encoded POST.
    # NOTE(review): the body contains no javax.faces.ViewState field and the
    # call does not reuse cookies from an earlier GET of the page; presumably
    # that is why the server answers with the home page - confirm against the
    # hidden inputs of the real form.
    output <- httr::POST(
      "http://www.e-grunt.ba/home.jsf",
      body = list(
        "form:court_focus" = "440",
        "form:cuTransferLast" = "17.07.2019",
        "form:municipality_input" = "4400000001",
        "form:mpart_focus" = "44000087",
        "form:folder" = 1,
        # Placeholder value; the real token changes with every request.
        `recaptcha-token` = "some token",
        submit = "form:j_idt61"
        ),
      add_headers(Referer = "http://www.e-grunt.ba/"),
      encode = "form",
      # Print request/response details to the console for debugging.
      verbose()
    )

But this just returns the content of the home page.

I know it is easier with (R)Selenium, but I would like to do it with httr and POST if it is possible.

kstew
  • 1,104
  • 6
  • 21
Mislav
  • 1,533
  • 16
  • 37
  • When I look at it there is a captcha. And correctly completing the captcha feeds into the POST request sent. – QHarr Jul 21 '19 at 19:30
  • 1
    Yes, there is a captcha. I have a `recaptcha-token` argument in the POST body, but it changes with every request, so I couldn't put in a fixed key. – Mislav Jul 21 '19 at 19:46
  • The point of the captcha is presumably to stop scraping. Have you checked the terms and conditions for the site? – QHarr Jul 21 '19 at 20:00
  • 2
    They don't have terms and conditions. That's the reason why I think it's OK to scrape the site. Second, it is a publicly available register. Third, there is a law that says that publicly available government information is open. Maybe they added the captcha to cope with too many simultaneous requests. – Mislav Jul 21 '19 at 20:04
  • 2
    You are not going to get very much help on this site. First, the captcha is a clear indicator that the website does not allow scraping. Second, if you claim the data is public record then you should just request the full dataset you want from the source of the data directly instead of trying to circumvent their website protections. – Adam Sampson Aug 22 '19 at 18:40
  • 1
    As already said, you will not be able to get past the captcha. If you are interested, I can also explain the other issues in your approach, but this would be just for learning; I will not give you a solution. And even if I could, I wouldn't, because if the provider implements such a security system, it is exactly to prevent what you are trying to do. – Chelmy88 Aug 23 '19 at 21:36
  • @Chelmy88 please explain what the issues with my approach are. Is it even possible to scrape without Selenium? – Mislav Aug 27 '19 at 10:47

1 Answers1

0

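I have found a way to scrape this site (judging by `home.jsf` and the `javax.faces.*` fields it is a JavaServer Faces application rather than ASP.NET). I am providing the code in case somebody needs something similar: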

#' Open a session on e-grunt.ba and request the "content" panel
#'
#' Loads the landing page with a desktop-browser user agent, reads the
#' `value` attribute of the second `<input>` element on that page (sent
#' later as `javax.faces.ViewState`), and posts a partial-AJAX request whose
#' source is `j_idt8:j_idt15` and which asks the server to render `content`.
#'
#' @return The session object returned by the POST, with the ViewState string
#'   stored in its `"viewState"` attribute.
start_session <- function() {
  site_url <- "http://www.e-grunt.ba"
  browser_ua <- user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
  session <- html_session(site_url, browser_ua)

  # NOTE(review): the ViewState field is picked by position (second <input>),
  # so this relies on the current page layout - verify if the page changes.
  input_nodes <- html_nodes(session, "input")
  view_state <- html_attr(input_nodes[[2]], "value")

  form_fields <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "j_idt8:j_idt15",
    "javax.faces.partial.execute" = "@all",
    "javax.faces.partial.render" = "content",
    "j_idt8:j_idt15" = "j_idt8:j_idt15",
    "j_idt8" = "j_idt8",
    "javax.faces.ViewState" = view_state
  )
  session <- rvest:::request_POST(
    session,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = site_url),
    body = form_fields,
    encode = "form"
  )

  # Keep the ViewState on the session so later requests can reuse it.
  attr(session, "viewState") <- view_state
  session
}

# EXTRACT METADATA --------------------------------------------------------

# Open the session; start_session() stores the page's ViewState in the
# "viewState" attribute of the returned object.
p <- start_session()
#' Extract value/label pairs from the children of matched HTML nodes
#'
#' Parses `html`, selects the nodes matching `css`, and returns one row per
#' child element holding the child's `value` attribute and its text. Rows
#' whose value is "-1" are dropped (presumably the "please select"
#' placeholder - confirm against the page).
#'
#' @param html Anything `read_html()` accepts; the script passes the raw
#'   response body of the session.
#' @param css CSS selector of the parent node(s) whose children are read.
#' @param cnames Character vector of length 2: names for the value column and
#'   the text column, in that order.
#' @return A data.frame with two character columns named by `cnames`; zero
#'   rows when nothing matches.
name_value_pairs <- function(html, css, cnames) {
  # Parse the document once and reuse the same node set for both columns.
  children <- html_children(html_nodes(read_html(html), css))
  df <- data.frame(
    x = html_attr(children, "value"),
    y = html_text(children),
    stringsAsFactors = FALSE
  )
  # `%in%` never returns NA, so a child without a `value` attribute keeps its
  # row and label. With `!=` the NA index produced an all-NA row instead.
  df <- df[!(df[[1]] %in% "-1"), , drop = FALSE]
  colnames(df) <- cnames
  df
}
# Options of the court selector (id + label), read from the body of the
# session's most recent response.
courts <- name_value_pairs(
  html = p$response$content,
  css = '[id="form:court_input"]',
  cnames = c("court_id", "court")
)

#' Post the "change" event of the court selector
#'
#' Sends a partial-AJAX request whose source is `form:court`, asking the server
#' to re-render the message areas, `form:municipality`, `form:mpart` and
#' `form:cuTransferLast`. All other form fields are sent empty.
#'
#' Note: a later definition in this script reuses the name `metadata_post`
#' with a different signature; this version is the one in effect while the
#' metadata loop runs.
#'
#' @param session_zk Session object to post with.
#' @param view_state Value sent as `javax.faces.ViewState`.
#' @param id Value sent as `form:court_input`.
#' @return The session object returned by `rvest:::request_POST()`.
metadata_post <- function(session_zk, view_state, id) {
  form_fields <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "form:court",
    "javax.faces.partial.execute" = "form:court",
    "javax.faces.partial.render" = "msgs msgsBottom form:municipality form:mpart form:cuTransferLast",
    "javax.faces.behavior.event" = "change",
    "javax.faces.partial.event" = "change",
    "form" = "form",
    "g-recaptcha-response" = "",
    "form:court_focus" = "",
    "form:court_input" = id,
    "form:cuTransferLast" = "",
    "form:municipality_focus" = "",
    "form:mpart_focus" = "",
    "form:folder" = "",
    "form:parcel" = "",
    "form:parcelSub" = "",
    "javax.faces.ViewState" = view_state
  )
  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = form_fields,
    encode = "form"
  )
}

#' Post the "change" event of the municipality selector
#'
#' Sends a partial-AJAX request whose source is `form:municipality`, asking the
#' server to re-render the message areas and `form:mpart`. The selected court
#' and municipality are sent; the remaining form fields are sent empty.
#'
#' @param session_zk Session object to post with.
#' @param view_state Value sent as `javax.faces.ViewState`.
#' @param id Value sent as `form:court_input`.
#' @param muni_id Value sent as `form:municipality_input`.
#' @return The session object returned by `rvest:::request_POST()`.
muni_post <- function(session_zk, view_state, id, muni_id) {
  form_fields <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "form:municipality",
    "javax.faces.partial.execute" = "form:municipality",
    "javax.faces.partial.render" = "msgs msgsBottom form:mpart",
    "javax.faces.behavior.event" = "change",
    "javax.faces.partial.event" = "change",
    "form" = "form",
    "g-recaptcha-response" = "",
    "form:court_focus" = "",
    "form:court_input" = id,
    "form:cuTransferLast" = "",
    "form:municipality_focus" = "",
    "form:municipality_input" = muni_id,
    "form:mpart_focus" = "",
    "form:folder" = "",
    "form:parcel" = "",
    "form:parcelSub" = "",
    "javax.faces.ViewState" = view_state
  )
  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = form_fields,
    encode = "form"
  )
}


# Build the lookup table of court / municipality / ko combinations.
#
# For every court the court-change event is posted and the municipality
# options are read from the response. If there is more than one municipality,
# each one is posted in turn and its ko options (form:mpart_input) are read;
# otherwise the ko options are taken from the court-change response itself.
# A missing municipality or ko list is recorded as a single NA row, so one
# empty dropdown does not abort the whole run.
#
# Plain `for` loops are kept on purpose: `p` is threaded through every request
# and `i` / `j` stay defined after the loop.
metadata_i <- vector("list", nrow(courts))
for (i in seq_along(courts$court_id)) {
  print(i)
  p <- metadata_post(p, attributes(p)$viewState, courts$court_id[i])
  muni <- name_value_pairs(p$response$content, css = '[id="form:municipality_input"]', cnames = c("muni_id", "muni"))

  if (nrow(muni) > 1) {
    muni_ko <- vector("list", nrow(muni))
    for (j in seq_along(muni$muni_id)) {
      # print(j)
      p <- muni_post(p, attributes(p)$viewState, courts$court_id[i], muni$muni_id[j])
      ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
      if (nrow(ko) == 0) {
        ko <- data.frame(ko_id = NA_character_, ko = NA_character_, stringsAsFactors = FALSE)
      }
      muni_ko[[j]] <- cbind.data.frame(muni[j, ], ko, stringsAsFactors = FALSE)
    }
    metadata_i[[i]] <- cbind.data.frame(courts[i, ], do.call(rbind, muni_ko), stringsAsFactors = FALSE)
  } else {
    ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
    # cbind.data.frame() fails when a 1-row frame meets a 0-row frame, so empty
    # lists get the same NA placeholder used in the multi-municipality branch.
    if (nrow(muni) == 0) {
      muni <- data.frame(muni_id = NA_character_, muni = NA_character_, stringsAsFactors = FALSE)
    }
    if (nrow(ko) == 0) {
      ko <- data.frame(ko_id = NA_character_, ko = NA_character_, stringsAsFactors = FALSE)
    }
    meta <- cbind.data.frame(courts[i, ], muni, stringsAsFactors = FALSE)
    metadata_i[[i]] <- cbind.data.frame(meta, ko, stringsAsFactors = FALSE)
  }
}
metadata <- do.call(rbind, metadata_i)

#' Submit the search form
#'
#' Posts the full (non-AJAX) form to home.jsf with the chosen court,
#' municipality, ko and folder number plus a reCAPTCHA response. The field
#' `form:j_idt61` is sent empty (presumably the submit button - confirm).
#'
#' Note: this definition replaces the earlier `metadata_post` defined in this
#' script, which had a different signature.
#'
#' @param session_zk Session object to post with.
#' @param view_state Value sent as `javax.faces.ViewState`.
#' @param recaptcha Value sent as `g-recaptcha-response`.
#' @param court Value sent as `form:court_input`.
#' @param date Value sent as `form:cuTransferLast`; defaults to the date four
#'   days before today, formatted as dd.mm.yyyy.
#' @param muni Value sent as `form:municipality_input`.
#' @param ko Value sent as `form:mpart_input`.
#' @param zk Value sent as `form:folder`.
#' @return The session object returned by `rvest:::request_POST()`.
metadata_post <- function(session_zk, view_state, recaptcha, court,
                          date = as.character(format.Date(Sys.Date() - 4, "%d.%m.%Y")),
                          muni, ko, zk
) {
  form_fields <- list(
    "form" = "form",
    "g-recaptcha-response" = recaptcha,
    "form:court_focus" = "",
    "form:court_input" = court,
    "form:cuTransferLast" = date,
    "form:municipality_focus" = "",
    "form:municipality_input" = muni,
    "form:mpart_focus" = "",
    "form:mpart_input" = ko,
    "form:folder" = zk,
    "form:parcel" = "",
    "form:parcelSub" = "",
    "form:j_idt61" = "",
    "javax.faces.ViewState" = view_state
  )
  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = form_fields,
    encode = "form"
  )
}

# example
# NOTE(review): break_captcha() is not defined anywhere in this snippet; its
# result is passed on as the 'g-recaptcha-response' form field.
# NOTE(review): `i` and `j` are not set here - as written they hold whatever
# the metadata loop left behind. `i` picks a row of `metadata` and `j` is sent
# as the 'form:folder' value; set both explicitly before running this.
result <- break_captcha()
p <- metadata_post(session_zk = p, view_state = attributes(p)$viewState, 
                   recaptcha = result, court = metadata$court_id[i],
                   muni = metadata$muni_id[i], ko =  metadata$ko_id[i], zk = j)
Mislav
  • 1,533
  • 16
  • 37