Hello my dataset looks like this:
structure(list(pa = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
lon = c(26.953632, 26.914444, 26.854655, 26.377477, 26.653273,
26.739085, 26.732233, 26.67895, 26.6691, 26.925116, 26.771316,
26.952233, 26.934466, 26.9493, 26.948333), lat = c(37.65571,
37.658056, 37.548262, 37.714353, 37.670897, 37.652183, 37.664717,
37.672083, 37.6934, 37.63755, 37.41155, 37.65095, 37.661533,
37.65825, 37.652166), distance = c(2664.205501, 2188.408657,
1309.509802, 2931.223857, 443.7116677, 83.4248179, 1162.349952,
1025.302461, 1447.284772, 156.3081952, 1718.49796, 2120.230705,
2940.015299, 2859.658249, 2179.706853), N = c(2L, 3L, 3L,
4L, 1L, 3L, 3L, 4L, 8L, 7L, 2L, 0L, 10L, 0L, 0L), nh4 = c(0.0911071189102672,
0.0912837530530634, 0.0887604283967188, 0.0809833919295647,
0.0806452852518153, 0.0873989977309376, 0.0854938036251452,
0.0837840217003991, 0.113291559368372, 0.139553981108798,
0.136305334431029, 0.149872598116116, 0.14975582563108, 0.149872598116116,
0.149872598116116), ppn = c(3.13649814951996, 3.38222779366539,
2.5790228332411, 1.68392748415672, 2.80087243875361, 3.2346900728285,
3.17393288172866, 2.63412894585215, 3.14572940860351, 4.80038520203728,
5.83457531216185, 5.10820325640801, 5.14342739916075, 5.10820325640801,
5.10820325640801)), row.names = c(1L, 2L, 3L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 13L, 16L, 17L, 18L, 19L), class = "data.frame")
I'm trying to fit a model with this kind of formula:
mod <- mgcv::gam(data=db, family=binomial(link="logit"), method="REML",
cbind(pa, N) ~ s(lon) + s(lat) + ti(lon, lat, distance, bs = "re") +
s(nh4) + s(ppn, k = 10) )
Where pa is a binomial variable (presence/absence) and N is the number of individuals collected (when when presence has value 1). The problem is when I run the following code to calculate the AUC, R returns errors:
library(mgcv) # library for GAM
library(ggplot2) # for beautiful plots
library(cdata) # data wrangling
library(sigr) # AUC calculation
data <- dplyr::select(db, pa, lon, lat, distance, nh4, ppn, N, season)
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]
performance=function(y,pred){
confmat_test=table(truth=y,predict=pred>0.5)
acc=sum(diag(confmat_test))/sum(confmat_test)
precision=confmat_test[2,2]/sum(confmat_test[,2])
recall=confmat_test[2,2]/sum(confmat_test[2,])
auc=calcAUC(pred,y)
c(acc,precision,recall,auc)
}
# Posterior probability
train$pred=predict(gam_model,newdata = train,type = "response")
test$pred=predict(gam_model,newdata=test,type="response")
# model performance evaluated using training data
perf_train=performance(train$pa_dd,train$pred)
perf_test=performance(test$pa_dd,test$pred)
perf_mat=rbind(perf_train,perf_test)
colnames(perf_mat)=c("accuracy","precision","recall","AUC")
round(perf_mat,4)
Questions are: Is this formula correct? How can I compute AUC? How can I compute each variable's importance?
Thank you in advance.