I built a prediction model using logistic regression which works well. But when I analyze the estimates calculated on the test dataset, I can see the variable I used to stratify the split comes up when I want it to be excluded of the model as a predictor. update_role()
doesn't do that...
data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(vital ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(strata_var, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_smote(vital)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05,
0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
In the last plot I can see the strata variable coming up.