1

I want a better approach to this classification problem. I have four dependent variables, var1, var2, var3 and var4, each of which is either TRUE or FALSE to classify whether var5 belongs to the 0, 1, 2, 3 categories. Here is my data.

set.seed(123)
var1 <- sample(c(0,1), replace=TRUE, size=500)
df <- data.frame(var1 = sample(c(0,1), replace=TRUE, size=500),
                 var2 <- sample(c(0,1), replace=TRUE, size=500),
                 var3 <- sample(c(0,1), replace=TRUE, size=500),
                 var4 <- sample(c(0,1), replace=TRUE, size=500),
                 var5 <- sample(c(0, 1, 2, 3), replace=TRUE, size=500))

My attempt is as follows:

library(plyr)
library(readr)
library(dplyr)
library(caret)

names <- c(1, 2, 3, 4, 5)
dat[,names] <- lapply(dat[,names] , factor)
glimpse(df)

set.seed(100)
library(caTools)

spl = sample.split(df$var5, SplitRatio = 0.7)
train = subset(df, spl==TRUE)
test = subset(df, spl==FALSE)

print(dim(train)); print(dim(test))

#Build, Predict and Evaluate the Model
#To fit the logistic regression model, the first step is to instantiate the algorithm. This is done in the first line of code below with the glm() function. The second line prints the summary of the trained model.

model_glm = glm(var5 ~ . , family="binomial", data = train)
summary(model_glm)

Question

Please help me to use my data to classify for four levels of a phenomena and not binary as I did in my trial?

r2evans
  • 141,215
  • 6
  • 77
  • 149
Daniel James
  • 1,381
  • 1
  • 10
  • 28

2 Answers2

2
set.seed(123)
df <- data.frame(var1 = sample(c(0,1), replace=TRUE, size=500),
                 var2 = sample(c(0,1), replace=TRUE, size=500),
                 var3 =sample(c(0,1), replace=TRUE, size=500),
                 var4 = sample(c(0,1), replace=TRUE, size=500),
                 var5 = factor(sample(c(0, 1, 2, 3), replace=TRUE, size=500)))

set.seed(100)
library(caTools)

spl = sample.split(df$var5, SplitRatio = 0.7)
train = subset(df, spl==TRUE)
test = subset(df, spl==FALSE)

print(dim(train)); print(dim(test))

library(nnet)
# Fit the model
fit <- multinom(var5 ~ ., data = train)
Nir Graham
  • 2,567
  • 2
  • 6
  • 10
  • Thanks, your answer is well received. (1) How do I demonstrate by predicting just a few, let me say, five observations from the fitted model? (2) How do I evaluate the potency of the model? For an instance, `BIC`, `MSE`, `AIC` etc. – Daniel James Aug 15 '23 at 17:01
  • @DanielJames Try `AIC(fit)` and `BIC(fit)`. – Rui Barradas Aug 15 '23 at 17:24
  • 1
    and you can use predict as per normal `predict(fit, newdata = test)` – Nir Graham Aug 15 '23 at 17:25
2

The code below follows in great part this (not SO) post.
It fits an ordinal logistic regression model with MASSS::polr. It then computes the coefficients' confidence intervals in the odds ratio scale, predicts from the model with the test data set and runs caret::confusionMatrix on the predictions.

The stars.pval comes from this post by SO user Maël.

Note that the data is different, the ordinal response is an ordered factor, not a factor like in the question.

set.seed(123)
var1 <- sample(c(0,1), replace=TRUE, size=500)
df <- data.frame(var1 = sample(c(0,1), replace=TRUE, size=500),
                 var2 = sample(c(0,1), replace=TRUE, size=500),
                 var3 = sample(c(0,1), replace=TRUE, size=500),
                 var4 = sample(c(0,1), replace=TRUE, size=500),
                 var5 = sample(c(0, 1, 2, 3), replace=TRUE, size=500))

df[, 1:4] <- lapply(df[, 1:4] , factor)
df$var5 <- ordered(df$var5)

library(caTools)
library(MASS)

stars.pval <- function(x){
  stars <- c("***", "**", "*", "")
  var <- c(0, 0.01, 0.05, 0.10, 1)
  i <- findInterval(x, var, left.open = T, rightmost.closed = T)
  stars[i]
}

set.seed(100)
spl <- sample.split(df$var5, SplitRatio = 0.7)
train <- subset(df, spl)
test <- subset(df, !spl)

fit <- polr(var5 ~ ., train, Hess = TRUE)
AIC(fit)
#> [1] 980.9098
BIC(fit)
#> [1] 1007.915

ctable <- coef(summary(fit))
## calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2

## combined table
(ctable <- cbind(ctable, "p value" = p))
#>             Value Std. Error     t value      p value
#> var11 -0.16090820  0.1921110 -0.83757930 4.022670e-01
#> var21  0.19389705  0.1923504  1.00804088 3.134348e-01
#> var31  0.01078333  0.1932800  0.05579122 9.555081e-01
#> var41 -0.01309085  0.1931528 -0.06777458 9.459651e-01
#> 0|1   -0.94816966  0.2171773 -4.36587776 1.266133e-05
#> 1|2    0.02603120  0.2102911  0.12378653 9.014843e-01
#> 2|3    1.13948524  0.2186379  5.21174533 1.870722e-07
cbind.data.frame(ctable, ` ` = stars.pval(p)) |> print(digits = 5)
#>           Value Std. Error   t value    p value    
#> var11 -0.160908    0.19211 -0.837579 4.0227e-01    
#> var21  0.193897    0.19235  1.008041 3.1343e-01    
#> var31  0.010783    0.19328  0.055791 9.5551e-01    
#> var41 -0.013091    0.19315 -0.067775 9.4597e-01    
#> 0|1   -0.948170    0.21718 -4.365878 1.2661e-05 ***
#> 1|2    0.026031    0.21029  0.123787 9.0148e-01    
#> 2|3    1.139485    0.21864  5.211745 1.8707e-07 ***

ci <- confint(fit)
#> Waiting for profiling to be done...
exp(cbind(OR = coef(fit), ci))
#>              OR     2.5 %   97.5 %
#> var11 0.8513702 0.5838532 1.240404
#> var21 1.2139713 0.8328939 1.771160
#> var31 1.0108417 0.6919196 1.476772
#> var41 0.9869945 0.6757277 1.441490

var5_pred <- predict(fit, newdata = test)
caret::confusionMatrix(test$var5, var5_pred)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  0  1  2  3
#>          0 33  0  0  9
#>          1 31  0  0  3
#>          2 29  0  0  8
#>          3 27  0  0 10
#> 
#> Overall Statistics
#>                                           
#>                Accuracy : 0.2867          
#>                  95% CI : (0.2159, 0.3661)
#>     No Information Rate : 0.8             
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.0183          
#>                                           
#>  Mcnemar's Test P-Value : NA              
#> 
#> Statistics by Class:
#> 
#>                      Class: 0 Class: 1 Class: 2 Class: 3
#> Sensitivity            0.2750       NA       NA  0.33333
#> Specificity            0.7000   0.7733   0.7533  0.77500
#> Pos Pred Value         0.7857       NA       NA  0.27027
#> Neg Pred Value         0.1944       NA       NA  0.82301
#> Prevalence             0.8000   0.0000   0.0000  0.20000
#> Detection Rate         0.2200   0.0000   0.0000  0.06667
#> Detection Prevalence   0.2800   0.2267   0.2467  0.24667
#> Balanced Accuracy      0.4875       NA       NA  0.55417

Created on 2023-08-15 with reprex v2.0.2

Rui Barradas
  • 70,273
  • 8
  • 34
  • 66
  • Error when I ran this line: `test <- subset(df, !spl)` `Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) : invalid first argument Error in assign(cacheKey, frame, .rs.CachedDataEnv) : attempt to use zero-length variable name` – Daniel James Aug 15 '23 at 20:22
  • @DanielJames That's a RStudio problem, not the code's. Try terminating and restarting the R session. – Rui Barradas Aug 15 '23 at 21:21
  • @DanielJames A logical variable does not need an explicit comparison to `FALSE/TRUE` since it already is a logical value, that's why I wrote `test <- subset(df, !spl)`, by negating the false values become true and those rows are chosen by `subset`. – Rui Barradas Aug 15 '23 at 21:23