I am training an ML model in R to predict the probability of subjects being assigned to treatment (the propensity score). I am using 5-fold CV; however, my models tend to overfit the data. This is because treatment was assigned at the cluster level, and the cross-validation scheme often uses data from the same cluster to both train and validate the model.
So what I would like to achieve is a CV scheme that builds folds by assigning whole clusters, rather than single observations, to folds.
This is my current code:
# Cross-fitted nuisance estimates.
#
# For every outer fold, two SuperLearner ensembles are fitted on the remaining
# folds and used to predict the held-out rows:
#   * p_mod: treatment model (treatment_cbt, binomial)  -> W.hat
#   * m_mod: outcome model (kessler_zscore_bs)          -> Y.hat
# The held-out predictions are written into `pseudo_all` (column 1 = Y.hat,
# column 2 = W.hat).
#
# Relies on objects created outside this snippet: `dta`, `covariates` and
# `pseudo_all`.

learners <- c("SL.ranger", "SL.xgboost", "SL.nnet")
n_folds <- 5
control <- SuperLearner.CV.control(V = n_folds)

# Name of the column in `dta` that identifies the cluster at which treatment
# was assigned. With NULL (the default) folds are drawn per observation,
# exactly as before. Set it to the cluster id column to keep every cluster
# inside a single fold, both in the outer split and in SuperLearner's internal
# cross-validation.
# NOTE(review): the cluster id column is not visible here -- fill it in, and
# make sure it is not part of `covariates`.
cluster_col <- NULL

if (is.null(cluster_col)) {
  # Row-level folds (original behaviour).
  folds <- createFolds(dta$treatment_cbt, k = n_folds)
} else {
  stopifnot(cluster_col %in% names(dta))
  cluster_ids <- dta[[cluster_col]]

  # One representative row per cluster; treatment is taken from that row.
  # NOTE(review): assumes treatment is constant within a cluster -- confirm.
  first_row <- !duplicated(cluster_ids)
  cluster_levels <- cluster_ids[first_row]
  cluster_treat <- dta$treatment_cbt[first_row]
  stopifnot(length(cluster_levels) >= n_folds)

  # Split the clusters (not the rows) into folds, using the cluster-level
  # treatment as the stratification variable, then map every cluster back to
  # the row indices it owns so that `folds` keeps its list-of-row-indices shape.
  cluster_folds <- createFolds(factor(cluster_treat), k = n_folds)
  folds <- lapply(
    cluster_folds,
    function(idx) which(cluster_ids %in% cluster_levels[idx])
  )
}

for (f in seq_along(folds)) {
  # Held-out fold is predicted; all remaining folds are used for fitting.
  test_rows <- folds[[f]]
  train_rows <- unlist(folds[-f], use.names = FALSE)
  df_main <- dta[test_rows, ]
  df_aux <- dta[train_rows, ]

  # Cluster ids of the training rows; passing them as `id` asks SuperLearner
  # to keep a cluster in one internal validation fold. NULL is the package
  # default.
  sl_id <- if (is.null(cluster_col)) NULL else df_aux[[cluster_col]]

  # Propensity model.
  p_mod <- SuperLearner(
    Y = df_aux$treatment_cbt,
    X = df_aux[, ..covariates],
    newX = df_main[, ..covariates],
    SL.library = learners,
    verbose = FALSE,
    method = "method.NNLS",
    family = binomial(),
    id = sl_id,
    cvControl = control
  )
  # Held-out propensity predictions. The previous overwrite with
  # predict(W.forest)$predictions was dropped: W.forest is not defined in this
  # script and would have replaced the SuperLearner prediction.
  W.hat <- p_mod$SL.predict
  # Optional overlap bounding:
  # W.hat <- pmin(pmax(W.hat, 0.025), 0.975)

  # Outcome model (SuperLearner's default family is used).
  m_mod <- SuperLearner(
    Y = df_aux$kessler_zscore_bs,
    X = df_aux[, ..covariates],
    newX = df_main[, ..covariates],
    SL.library = learners,
    verbose = FALSE,
    method = "method.NNLS",
    id = sl_id,
    cvControl = control
  )
  Y.hat <- m_mod$SL.predict

  ## Collect all nuisance parameters
  # NOTE(review): df_main$ID is used as a position into pseudo_all -- confirm
  # that ID matches the row layout of pseudo_all.
  pseudo_all[, 1][df_main$ID] <- Y.hat
  pseudo_all[, 2][df_main$ID] <- W.hat
}