So I have a long function that I run in R every month. My goal is to create a Vertica UDF using vertica's ability to run functions written in R. My hope is that this can then be automated from my companies' data warehouse. I've looked all over the internet for an example that resembles mine but cannot find one. My function takes two dataframes as input and one dataframe as output. Below is the function and factory function code.
Any help would be appreciated. Thanks, Ben
TV_Attribution_Function <- function(MAP.data, tv_data) {
# Inputs are two queries
# MAP.data = visits data
# tv_data = Data with TV spots
MAP.data$Date_time <- as.POSIXct(as.character(MAP.data$Date_time), format="%F %R")
tv_data$IMPRESSIONS <- as.numeric(tv_data$IMPRESSIONS)
tv_data$Date_time <- as.POSIXct(as.character(tv_data$Date_time), format="%F %R")
missing <- tv_data[tv_data$IMPRESSIONS <= 0,]
tv_data[which(tv_data$IMPRESSIONS == 0),'IMPRESSIONS'] <- 1
#replace nas w/ 0
tv_data[is.na(tv_data)] <- 0
MAP.data[is.na(MAP.data)] <- 0
for(i in c(1:8,10:12)){
if(class(tv_data[,i])[1] == 'character'){tv_data[,i] <- as.factor(tv_data[,i])}
}
tv_data$Feed <- factor(tv_data$Feed, levels = c("",unique(tv_data$Feed)))
tv_data$SPOT_LENGTH <- as.integer(tv_data$SPOT_LENGTH)
for(i in 2:9){
if(class(MAP.data[,i])[1] == 'character'){MAP.data[,i] <- as.factor(MAP.data[,i])}
if(typeof(MAP.data[,i])[1] == 'double'){MAP.data[,i] <- as.integer(MAP.data[,i])}
}
visits_data <- MAP.data
span = 0.2
sd_x = 1.0
span_b = 0.35
sd_b = 1.0
minutes_gap = 5
days <- unique(visits_data$Day_Only, na.rm=TRUE)
final_outcome <- data.frame()
for (i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$Visits_Count
outcome_2[which(is.na(outcome_2))] <- 0
t <- 1:nrow(outcome_day)
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_x * v.sd
direct_response <- outcome_2 - baseline
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
tv_data$mapped_visits <- rep(0,nrow(tv_data))
for (n in (1:nrow(tv_data))){
num <- match(tv_data[n,1], t_imps_plus$Date_time)
if (is.na(num) == FALSE) {
tv_data$mapped_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap * sum(t_imps_plus$mapped_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {tv_data$mapped_visits[n] <- NA}
}
#Next for visits that resulted in a seeker signup
final_outcome <- data.frame()
# Loop for each day to create baseline visits and detect spikes
for(i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$new_seekers
outcome_2[which(is.na(outcome_2))] <- 0
t <- c(1:length(outcome_2))
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span_b)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_b * v.sd
#shouldnt direct response be outcome_2 - baseline, not fitted???
direct_response <- outcome_2 - v.lo$fitted
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_ns_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
#basically left join
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
tv_data$mapped_ns_visits <- rep(0,nrow(tv_data))
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], t_imps_plus$Date_time)
if (is.na(num) == FALSE) {
tv_data$mapped_ns_visits[n] <- sum(t_imps_plus$mapped_ns_visits[num:(num+minutes_gap-1)]) * tv_data$IMPRESSIONS[n]*minutes_gap / sum(t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {n}
}
final_outcome <- data.frame()
# Loop for each day to create baseline visits and detect spikes
for (i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$new_sitters
outcome_2[which(is.na(outcome_2))] <- 0
t <- c(1:nrow(outcome_day))
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span_b)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_b * v.sd
direct_response <- outcome_2 - baseline
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_np_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
#Fill in total impressions including lags
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
#Add mapped visits data to the tv_data table and account for any overlapping spots using ratio of impressions
tv_data$mapped_np_visits <- rep(0,nrow(tv_data))
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], t_imps_plus$Date_time) # mapped new members when spot aired
if (is.na(num) == FALSE) {
tv_data$mapped_np_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap *
sum(t_imps_plus$mapped_np_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {n}
}
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$new_seekers[num:(num+minutes_gap-1)]
tv_data$total_ns_visits[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$total_ns_visits[n] <- 0}
}
## Add total new provider visits ####
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$new_sitters[num:(num+minutes_gap-1)]
tv_data$total_np_visits[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$total_np_visits[n] <- 0}
}
### add total day1 prems #####
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$day1_premiums[num:(num+minutes_gap-1)]
tv_data$day1_premiums[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$day1_premiums[n] <- 0}
}
# add week1 prems ###########
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$week1_premiums[num:(num+minutes_gap-1)]
tv_data$week1_premiums[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$week1_premiums[n] <- 0}
}
tv_data$attr_premiums <- tv_data$week1_premiums * tv_data$mapped_ns_visits / tv_data$total_ns_visits
return(tv_data)
}
# Factory Function
TV_Attribution_Function_Factory <- function() {
list (
name = TV_Attribution_Function
,udxtype=c("scalar")
,intype = c("any")
,outtype = c("any")
)
}