0

Hello my dataset looks like this:

structure(list(pa = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 
1L, 2L, 1L, 1L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
    lon = c(26.953632, 26.914444, 26.854655, 26.377477, 26.653273, 
    26.739085, 26.732233, 26.67895, 26.6691, 26.925116, 26.771316, 
    26.952233, 26.934466, 26.9493, 26.948333), lat = c(37.65571, 
    37.658056, 37.548262, 37.714353, 37.670897, 37.652183, 37.664717, 
    37.672083, 37.6934, 37.63755, 37.41155, 37.65095, 37.661533, 
    37.65825, 37.652166), distance = c(2664.205501, 2188.408657, 
    1309.509802, 2931.223857, 443.7116677, 83.4248179, 1162.349952, 
    1025.302461, 1447.284772, 156.3081952, 1718.49796, 2120.230705, 
    2940.015299, 2859.658249, 2179.706853), N = c(2L, 3L, 3L, 
    4L, 1L, 3L, 3L, 4L, 8L, 7L, 2L, 0L, 10L, 0L, 0L), nh4 = c(0.0911071189102672, 
    0.0912837530530634, 0.0887604283967188, 0.0809833919295647, 
    0.0806452852518153, 0.0873989977309376, 0.0854938036251452, 
    0.0837840217003991, 0.113291559368372, 0.139553981108798, 
    0.136305334431029, 0.149872598116116, 0.14975582563108, 0.149872598116116, 
    0.149872598116116), ppn = c(3.13649814951996, 3.38222779366539, 
    2.5790228332411, 1.68392748415672, 2.80087243875361, 3.2346900728285, 
    3.17393288172866, 2.63412894585215, 3.14572940860351, 4.80038520203728, 
    5.83457531216185, 5.10820325640801, 5.14342739916075, 5.10820325640801, 
    5.10820325640801)), row.names = c(1L, 2L, 3L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 13L, 16L, 17L, 18L, 19L), class = "data.frame")

I'm trying to fit a model with this kind of formula:

mod <- mgcv::gam(data=db, family=binomial(link="logit"), method="REML",
           cbind(pa, N) ~ s(lon) + s(lat) + ti(lon, lat, distance, bs = "re") +
             s(nh4)  + s(ppn, k = 10) )

Where pa is a binomial variable (presence/absence) and N is the number of individuals collected (when when presence has value 1). The problem is when I run the following code to calculate the AUC, R returns errors:

library(mgcv)   # library for GAM
library(ggplot2)  # for beautiful plots
library(cdata)    # data wrangling
library(sigr)     # AUC calculation

data <- dplyr::select(db, pa, lon, lat, distance, nh4, ppn, N, season) 
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]

performance=function(y,pred){
  confmat_test=table(truth=y,predict=pred>0.5)
  acc=sum(diag(confmat_test))/sum(confmat_test)
  precision=confmat_test[2,2]/sum(confmat_test[,2])
  recall=confmat_test[2,2]/sum(confmat_test[2,])
  auc=calcAUC(pred,y)
  c(acc,precision,recall,auc)
}
# Posterior probability
train$pred=predict(gam_model,newdata = train,type = "response")
test$pred=predict(gam_model,newdata=test,type="response")

# model performance evaluated using training data
perf_train=performance(train$pa_dd,train$pred)
perf_test=performance(test$pa_dd,test$pred)
perf_mat=rbind(perf_train,perf_test)
colnames(perf_mat)=c("accuracy","precision","recall","AUC")
round(perf_mat,4)

Questions are: Is this formula correct? How can I compute AUC? How can I compute each variable's importance?

Thank you in advance.

Shawn Hemelstrand
  • 2,676
  • 4
  • 17
  • 30
Mauri21
  • 11
  • 3
  • Greetings! Usually it is helpful to provide a minimally reproducible dataset for questions here so people can troubleshoot your problems. One way of doing this is by using the `dput` function. You can find out how to use it here: https://youtu.be/3EID3P1oisg – Shawn Hemelstrand Nov 16 '22 at 05:32
  • hi @ShawnHemelstrand I did it – Mauri21 Nov 21 '22 at 16:13
  • Dunno if I know how to help but maybe this may be helpful https://stackoverflow.com/questions/23774091/calculate-auc-and-gam-and-set-a-scale-in-r – Shawn Hemelstrand Nov 21 '22 at 19:51

0 Answers0