# Partition the data:
library(tidymodels)
set.seed(1234)
uni_split <- initial_split(suspicious_match, strata = truth)
uni_train <- training(uni_split)
uni_test <- testing(uni_split)
uni_split
## Build a model recipe :
library(themis)
uni_rec <- recipe(truth ~ lv + lcs + qgram + jaccard + jw + cosine , data = uni_train)%>%
step_normalize(all_numeric()) %>%
step_smote(truth, skip = FALSE)%>%
prep()
uni_rec
bake(uni_rec, new_data = uni_train)
i trained the data with multiple models:(an example)
# Train Logistic Regression :
glm_spec <- logistic_reg()%>%
set_engine("glm")
glm_fit <- glm_spec %>%
fit(truth ~ lv + lcs + qgram + cosine + jaccard + jw , data= juice(uni_rec))
glm_fit
## Model evaluation with resampling :
set.seed(123)
folds <- vfold_cv(juice(uni_rec), strata = truth)
folds
#1: Logistic Reg:
set.seed(234)
glm_rs <- glm_spec%>%
fit_resamples(truth ~ lv + lcs + qgram + cosine + jaccard + jw, folds,
metrics = metric_set(roc_auc, sens, spec, accuracy),
control = control_resamples(save_pred = TRUE))
## Evaluation des modeles :
glm_rs %>% collect_metrics()
> glm_rs %>% collect_metrics()
# A tibble: 4 x 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.851 10 0.00514 Preprocessor1_Model1
2 roc_auc binary 0.898 10 0.00390 Preprocessor1_Model1
3 sens binary 0.875 10 0.00695 Preprocessor1_Model1
4 spec binary 0.827 10 0.00700 Preprocessor1_Model1
but then when i try applying the logistic regression model to the test data i get this error:
> glm_fit %>%
+ predict(new_data = bake(uni_rec, new_data = uni_test),
+ type = "prob")%>%
+ mutate(truth = uni_test$truth)%>%
+ roc_auc(truth, .pred_correct)
Erreur : Problem with `mutate()` input `truth`.
x Input `truth` can't be recycled to size 2022.
i Input `truth` is `uni_test$truth`.
i Input `truth` must be size 2022 or 1, not 1373.
Run `rlang::last_error()` to see where the error occurred.
i figured it's because of the smote step in the recipe but i can't figure out how to fix it please help !!