Trying to create R UDF in Vertica

Question

So I have a long function that I run in R every month. My goal is to create a Vertica UDF using vertica's ability to run functions written in R. My hope is that this can then be automated from my companies' data warehouse. I've looked all over the internet for an example that resembles mine but cannot find one. My function takes two dataframes as input and one dataframe as output. Below is the function and factory function code.

Any help would be appreciated. Thanks, Ben

TV_Attribution_Function <- function(MAP.data, tv_data) {
    # Inputs are two queries
    # MAP.data = visits data
    # tv_data = Data with TV spots
  MAP.data$Date_time <- as.POSIXct(as.character(MAP.data$Date_time), format="%F %R")
  tv_data$IMPRESSIONS <- as.numeric(tv_data$IMPRESSIONS)
  tv_data$Date_time <- as.POSIXct(as.character(tv_data$Date_time), format="%F %R")

  missing <- tv_data[tv_data$IMPRESSIONS <= 0,]


  tv_data[which(tv_data$IMPRESSIONS == 0),'IMPRESSIONS'] <- 1

  #replace nas w/ 0
  tv_data[is.na(tv_data)] <- 0
  MAP.data[is.na(MAP.data)] <- 0



  for(i in c(1:8,10:12)){
    if(class(tv_data[,i])[1] == 'character'){tv_data[,i] <- as.factor(tv_data[,i])}
  }
  tv_data$Feed <- factor(tv_data$Feed, levels = c("",unique(tv_data$Feed)))
  tv_data$SPOT_LENGTH <- as.integer(tv_data$SPOT_LENGTH)

  for(i in 2:9){
    if(class(MAP.data[,i])[1] == 'character'){MAP.data[,i] <- as.factor(MAP.data[,i])}
    if(typeof(MAP.data[,i])[1] == 'double'){MAP.data[,i] <- as.integer(MAP.data[,i])}


  }

  visits_data <- MAP.data

  span = 0.2
  sd_x = 1.0
  span_b = 0.35
  sd_b = 1.0
  minutes_gap = 5

  days <- unique(visits_data$Day_Only, na.rm=TRUE)
  final_outcome <- data.frame()

  for (i in 1:length(days)){
    outcome_day <- subset(visits_data, Day_Only == days[i])
    outcome_2 <- outcome_day$Visits_Count
    outcome_2[which(is.na(outcome_2))] <- 0
    t <- 1:nrow(outcome_day)
    output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
    v.lo <- loess(outcome_2 ~ t, span = span)
    v.sd <- sd(v.lo$residuals)
    baseline <- v.lo$fitted + sd_x * v.sd
    direct_response <- outcome_2 - baseline
    direct_response[direct_response < 0] <- 0
    direct_response <- data.frame(direct_response)
    colnames(direct_response) <- c('mapped_visits')
    output_set <- cbind(output_set, direct_response)
    final_outcome <- rbind(final_outcome, output_set)
  }
  colnames(final_outcome)[1] <- c("Date_time")



  temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
  t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
  t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))

  for (i in 1:nrow(t_imps_plus)){
    t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
  }

  tv_data$mapped_visits <- rep(0,nrow(tv_data))

  for (n in (1:nrow(tv_data))){

    num <- match(tv_data[n,1], t_imps_plus$Date_time)
    if (is.na(num) == FALSE) {   
      tv_data$mapped_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap * sum(t_imps_plus$mapped_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])     
    } else {tv_data$mapped_visits[n] <- NA}  
  }


  #Next for visits that resulted in a seeker signup

  final_outcome <- data.frame()

  # Loop for each day to create baseline visits and detect spikes
  for(i in 1:length(days)){
    outcome_day <- subset(visits_data, Day_Only == days[i])
    outcome_2 <- outcome_day$new_seekers
    outcome_2[which(is.na(outcome_2))] <- 0
    t <- c(1:length(outcome_2))
    output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
    v.lo <- loess(outcome_2 ~ t, span = span_b)
    v.sd <- sd(v.lo$residuals)
    baseline <- v.lo$fitted + sd_b * v.sd
    #shouldnt direct response be outcome_2 - baseline, not fitted???
    direct_response <- outcome_2 - v.lo$fitted
    direct_response[direct_response < 0] <- 0
    direct_response <- data.frame(direct_response)
    colnames(direct_response) <- c('mapped_ns_visits')
    output_set <- cbind(output_set, direct_response)
    final_outcome <- rbind(final_outcome, output_set)
  }
  colnames(final_outcome)[1] <- c("Date_time")


  temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
  #basically left join
  t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
  t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus)) 
  for (i in 1:nrow(t_imps_plus)){
    t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
  }


  tv_data$mapped_ns_visits <- rep(0,nrow(tv_data))
  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], t_imps_plus$Date_time)
    if (is.na(num) == FALSE) {
      tv_data$mapped_ns_visits[n] <- sum(t_imps_plus$mapped_ns_visits[num:(num+minutes_gap-1)]) * tv_data$IMPRESSIONS[n]*minutes_gap / sum(t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])     
    } else {n}
  }



  final_outcome <- data.frame()

  # Loop for each day to create baseline visits and detect spikes
  for (i in 1:length(days)){
    outcome_day <- subset(visits_data, Day_Only == days[i])
    outcome_2 <- outcome_day$new_sitters
    outcome_2[which(is.na(outcome_2))] <- 0
    t <- c(1:nrow(outcome_day))
    output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
    v.lo <- loess(outcome_2 ~ t, span = span_b)
    v.sd <- sd(v.lo$residuals)
    baseline <- v.lo$fitted + sd_b * v.sd
    direct_response <- outcome_2 - baseline
    direct_response[direct_response < 0] <- 0
    direct_response <- data.frame(direct_response)
    colnames(direct_response) <- c('mapped_np_visits')
    output_set <- cbind(output_set, direct_response)
    final_outcome <- rbind(final_outcome, output_set)
  }
  colnames(final_outcome)[1] <- c("Date_time")

  #Fill in total impressions including lags 
  temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
  t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
  t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
  for (i in 1:nrow(t_imps_plus)){
    t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
  }

  #Add mapped visits data to the tv_data table and account for any overlapping spots using ratio of impressions
  tv_data$mapped_np_visits <- rep(0,nrow(tv_data))  
  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], t_imps_plus$Date_time)  # mapped new members when spot aired
    if (is.na(num) == FALSE) {
      tv_data$mapped_np_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap *
        sum(t_imps_plus$mapped_np_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])     
    } else {n}
  }

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$new_seekers[num:(num+minutes_gap-1)]
      tv_data$total_ns_visits[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$total_ns_visits[n] <- 0}  
  }

  ## Add total new provider visits ####

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$new_sitters[num:(num+minutes_gap-1)]
      tv_data$total_np_visits[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$total_np_visits[n] <- 0}
  }


  ### add total day1 prems #####

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$day1_premiums[num:(num+minutes_gap-1)]
      tv_data$day1_premiums[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$day1_premiums[n] <- 0}

  }

  # add week1 prems ###########

  for (n in 1:nrow(tv_data)){
    num <- match(tv_data[n,1], visits_data$Date_time)

    if(is.na(num) == FALSE){
      temp <- visits_data$week1_premiums[num:(num+minutes_gap-1)]
      tv_data$week1_premiums[n] <- sum(temp, na.rm = T)

    } else if(is.na(num) == TRUE){tv_data$week1_premiums[n] <- 0}

  }

  tv_data$attr_premiums <- tv_data$week1_premiums * tv_data$mapped_ns_visits / tv_data$total_ns_visits
  return(tv_data)
}


# Factory Function
TV_Attribution_Function_Factory <- function() {
list (
name = TV_Attribution_Function
,udxtype=c("scalar")
,intype = c("any")
,outtype = c("any")
)
}

Which kind of error/warning you get when you run your UDx? Also, you did code it as a SCALAR function. These UDx have in output: (1) exactly the same number of rows as the ones in input, (2) exactly one value per row. Is this your case? — mauro, Apr 12 '16 at 04:18
So scalars should have the same number of rows in the output as in the input? I have two inputs that have differing number of rows (call them A and B). The output has the same number of rows as A, but more than B. Does this mean I'd want to change the function udxtype? Is it now a transform? — ben890, Apr 12 '16 at 20:48
Also would I need to write a Outtypecallback Function for this? — ben890, Apr 12 '16 at 20:50
Yes, it should be a UDTF and yes, I guess you might want to define a outtyprcallback function (very easy and short) even if I didn't read your code here above (too long). — mauro, Apr 13 '16 at 02:26

Trying to create R UDF in Vertica

0 Answers0