I have been working on getting a bootstrapped confidence interval around the AUC of a gradient boosting machine (GBM) model with a continuous outcome. Below is the code for a single iteration of the procedure, which needs to be repeated at least 200 times:
- Resample the main dataset with replacement (drawing a bootstrap sample of n=1000 rows)
- Quantile normalize the outcome variable
- Run GBM on the resampled dataset and extract AUC
I know that in SAS I could use a macro to run this code 200 times, creating 200 datasets and extracting 200 AUCs that I could then merge, but I don't know how to loop this in R.
#calling required packages
library("WVPlots")
source('functions.R')
require(gbm)
# Simulated stand-in for the real data: 1132 observations (rows) by 52
# standard-normal variables (columns). Column 1 is named "PctControl"; the
# remaining columns keep their positional names "2" ... "52".
main <- replicate(52, rnorm(1132))
dimnames(main) <- list(
  as.character(1:1132),
  c("PctControl", as.character(2:52))
)
# Creating resampled dataset BS1 (the aim is to repeat this for BS[1:200]).
# Draw 1000 rows of `main` with replacement. Indexing the matrix directly
# makes a separate NA-filled placeholder unnecessary.
BS1 <- main[sample(nrow(main), size = 1000, replace = TRUE), ]
# Resampling repeats row names from `main`; drop them so the data frame built
# below gets plain 1..1000 row names.
rownames(BS1) <- NULL

# Quantile-normalise the outcome (column 1, "PctControl"): each row receives
# the plotting position of its rank, i.e. the same value it would get by
# sorting on the outcome and assigning ppoints() in order.
# The rows are deliberately NOT reordered: the gbm() call that follows uses
# train.fraction, which takes the *first* fraction of rows as training data,
# so rows sorted by outcome would train the model on the lowest outcomes only.
# NOTE(review): tied outcomes (duplicated bootstrap rows) receive distinct
# normalised values, broken by row position - confirm that is intended.
Survival <- as.numeric(BS1[, 1])
outcome_rank <- rank(Survival, ties.method = "first")
NormedSurvival <- ppoints(length(Survival))[outcome_rank]

# Replace the raw outcome column with its normalised version.
BS1 <- as.data.frame(cbind(NormedSurvival, BS1[, -1]))
# Fitting the GBM model: NormedSurvival is regressed on every other column of
# BS1 with "gaussian" (squared-error) loss.
# NOTE(review): per the gbm documentation, train.fraction uses the *first*
# 30% of the rows of BS1 for fitting, so BS1 should be in random row order
# when it reaches this call - confirm.
scwrpxy.fit.gbm <- gbm(
  NormedSurvival ~ .,
  data = BS1,
  distribution = "gaussian",
  n.trees = 50,
  shrinkage = 0.005,
  interaction.depth = 3,
  n.minobsinnode = 10,
  bag.fraction = 0.25, # subsampling fraction (0.5 is probably best)
  train.fraction = 0.3,
  cv.folds = 3,
  verbose = TRUE
)
# Report the relative influence of each predictor.
summary(scwrpxy.fit.gbm)
# Calculating AUC
# Predict on the data stored in the fitted object (no newdata is supplied),
# using every tree that was actually fitted. Asking for more trees than the
# model holds only triggers a warning and a fallback to the fitted count, and
# fitting hyper-parameters (shrinkage, bag.fraction, cv.folds, ...) are not
# prediction arguments, so they are no longer passed here.
predictedGBM <- as.numeric(
  predict(scwrpxy.fit.gbm, n.trees = scwrpxy.fit.gbm$n.trees)
)
observedGBM <- as.numeric(BS1$NormedSurvival)

# Observed and predicted outcomes side by side, one row per observation.
TestTox <- data.frame(observedGBM = observedGBM, predictedGBM = predictedGBM)

# NOTE(review): the fourth positional argument of ROCPlot differs between
# WVPlots versions (title vs. truthTarget) - confirm against the installed one.
ROCPlot(TestTox, "predictedGBM", "observedGBM", "gbm model")

# NOTE(review): pROC expects a two-level response, but observedGBM is
# continuous; the AUC below is presumably not the intended quantity unless the
# outcome is dichotomised first - confirm.
gbmAUC <- as.numeric(pROC::auc(TestTox$observedGBM, TestTox$predictedGBM))
print(gbmAUC)