I have been running bootstraps with rbinom for loops in R, but they take too long to run.
I want to perform the bootstrap on a dataset with 1,500,000 rows.
I want to resample the rows and for each of the resampled rows
- rbinom two probabilities ('prob1' & 'prob2') into 0's and 1's ('prob1_ber' & 'prob2_ber')
- add new column 'paired' with the combined outcome of step 1
- rbinom the unique combinations of the columns 'paired' and 'positive' into 0's and 1's ('prob_final')
- calculate 'pair_FPR' and 'pair_TPR'
This is what my code looks like:
library(boot)
#making example data
set.seed(1)
d2 <- data.frame(prob1=runif(n=1500000, min=1e-50, max=.9999999999),
prob2=runif(n=1500000, min=1e-44, max=.9999999989),
Positive=sample(c(0,1), replace=TRUE, size=1500000))
#making bootstrap function
function_1 <- function(data, i){
d2<-data[i,]
d2$prob1_ber <- rbinom(nrow(d2), 1, d2$prob1) #bernoulli 1 or 0
d2$prob2_ber <- rbinom(nrow(d2), 1, d2$prob2) #bernoulli 1 or 0
d2$paired <- ifelse(d2$prob1_ber == 1 & d2$prob2_ber == 1, '11',
ifelse(d2$prob1_ber == 0 & d2$prob2_ber ==0, '00',
ifelse(d2$prob1_ber == 1 & d2$prob2_ber ==0, '10',
ifelse(d2$prob1_ber == 0 & d2$prob2_ber ==1, '01', NA))))
d2$prob_final <- ifelse(d2$paired == '00',d2$prob1_ber, NA) #if both negative then negative
for (i in which(d2$paired =='11' & d2$Positive==1)) {
d2$prob_final[i] <- rbinom(1,1,0.9)
}
for (i in which(d2$paired =='11' & d2$Positive==0)) {
d2$prob_final[i] <- rbinom(1,1,0.5)
}
for (i in which(d2$paired =='01' & d2$Positive==1)) {
d2$prob_final[i] <- rbinom(1,1,0.8)
}
for (i in which(d2$paired =='01' & d2$Positive==0)) {
d2$prob_final[i] <- rbinom(1,1,0.1)
}
for (i in which(d2$paired =='10' & d2$Positive==1)) {
d2$prob_final[i] <- rbinom(1,1,0.7)
}
for (i in which(d2$paired =='10' & d2$Positive==0)) {
d2$prob_final[i] <- rbinom(1,1,0.2)
}
pair_FPR <- sum(d2[which(d2$Positive==0),]$prob_final) / nrow(d2[which(d2$Positive==0),])*100
pair_TPR <- sum(d2[which(d2$Positive==1),]$prob_final) / nrow(d2[which(d2$Positive==1),])*100
return(c(pair_FPR, pair_TPR))
}
set.seed(1)
boot_out <- boot(d2, function_1, 1000)
print(boot_out)
This bootstrap takes too long to run (n=1000). Is there a way to make it faster?
Many thanks!