I am trying to download some data from Athena (AWS); my account is managed by SAML authentication. I managed to get the authentication in place correctly (my credentials are stored in a profile called 'saml'). My problem is with the actual fetching of the data: the table I am querying has only 12,000 records, so the query should normally be very quick, but it turns out to be very slow.
What could be my problem? Is there something wrong with my code?
pkgs <- c('tidyverse',
          'aws.s3',
          'rJava',
          'RJDBC')
# uncomment if needed
#install.packages(pkgs)
#install.packages("aws.signature", repos = c(cloudyr = "http://cloudyr.github.io/drat", getOption("repos")))
# load the packages
lapply(pkgs, library, character.only = TRUE)
URL <- 'https://s3.amazonaws.com/athena-downloads/drivers/JDBC/SimbaAthenaJDBC_2.0.9/AthenaJDBC42_2.0.9.jar'
# library(rJava)
# .jinit()
# .jclassLoader()$setDebug(1L)
fil <- basename(URL)
if (!file.exists(fil)) download.file(URL, fil)
drv <- JDBC(driverClass="com.simba.athena.jdbc.Driver", fil, identifier.quote="'")
provider <- "com.simba.athena.amazonaws.auth.profile.ProfileCredentialsProvider"
#provider <- "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
Sys.setenv("AWS_ACCESS_KEY_ID" = aws.signature::locate_credentials()$key)
Sys.setenv("AWS_SECRET_ACCESS_KEY" = aws.signature::locate_credentials()$secret)
Sys.setenv("AWS_SESSION_TOKEN" = aws.signature::locate_credentials()$session_token)
conn <- dbConnect(drv, 'jdbc:awsathena://athena.eu-west-1.amazonaws.com:443/',
                  s3_staging_dir = "<QUERY-RESULTS-S3-BUCKET>",
                  aws_credentials_provider_class = provider,
                  user = Sys.getenv("AWS_ACCESS_KEY_ID"),
                  password = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
                  aws_credentials_provider_arguments = 'saml')
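To rule out the connection and authentication side, this is a rough sanity check I can run at this point (nothing assumed beyond the `conn` object above): timing a trivial query that returns a single value.
# a trivial query: if even this is slow, the delay is not about data volume
system.time(dbGetQuery(conn, "SELECT 1"))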
# get a list of all tables currently in Athena
# dbListTables(conn)
# enrich `dbGetQuery` to fetch in batch
setMethod("dbGetQuery", signature(conn="JDBCConnection", statement="character"), def=function(conn, statement, ...) {
r <- dbSendQuery(conn, statement, ...)
on.exit(.jcall(r@stat, "V", "close"))
if (conn@jc %instanceof% "com.amazonaws.athena.jdbc.AthenaConnection") fetch(r, -1, 999) # Athena can only pull 999 rows at a time
else fetch(r, -1)
})
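One thing I am unsure about: the class name tested with `%instanceof%` above comes from the old Athena JDBC driver, while I am loading the Simba driver, so I don't actually know whether the 999-row branch is ever taken. A small check I can run once connected (only the `conn` object above is assumed):
# which Java class does the connection really have,
# and does it match the class tested in the method above?
.jclass(conn@jc)
conn@jc %instanceof% "com.amazonaws.athena.jdbc.AthenaConnection"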
fetch_data <- function(query, path, cache = TRUE) {
  dataset <- dbGetQuery(conn, query)
  if (cache) {
    dataset %>%
      write_rds(path)
  } else {
    return(dataset)
  }
}
# 0. Initial setup
clean_schema <- "clean_tst."
# 1. table_1 ----
query <- str_glue("SELECT * FROM ",
clean_schema,
"table_1")
path = "00_Data/athena/table_1.rds"
fetch_data(query, path)
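In case it helps, here is a sketch of how I would split the timing between running the query on Athena and pulling the rows over JDBC (only `conn` and `query` from above are assumed; the 999 block size mirrors the method above):
# time query execution and row fetching separately
exec_time  <- system.time(r <- dbSendQuery(conn, as.character(query)))
fetch_time <- system.time(dat <- fetch(r, -1, 999))
dbClearResult(r)
nrow(dat)   # should be ~12,000
exec_time
fetch_time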