We can use Tidymodels to tune both recipe parameters and model parameters simultaneously, right? I'm struggling to understand what corrective action I should take based on the message, Error: Some tuning parameters require finalization but there are recipe parameters that require tuning. Please use parameters()
to finalize the parameter ranges.” Any help would be most appreciated.
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(themis))
suppressPackageStartupMessages(library(finetune))
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(titanic))
registerDoParallel()
set.seed(123)
train.df <- titanic_train %>%
mutate(Survived = factor(ifelse(Survived == 1, 'Y', 'N')),
Pclass = factor(Pclass, ordered = TRUE),
Sex = factor(Sex),
Embarked = factor(ifelse(Embarked == '', NA, Embarked))) %>%
select(-c(Name, Ticket, Cabin))
summary(train.df)
#> PassengerId Survived Pclass Sex Age SibSp
#> Min. : 1.0 N:549 1:216 female:314 Min. : 0.42 Min. :0.000
#> 1st Qu.:223.5 Y:342 2:184 male :577 1st Qu.:20.12 1st Qu.:0.000
#> Median :446.0 3:491 Median :28.00 Median :0.000
#> Mean :446.0 Mean :29.70 Mean :0.523
#> 3rd Qu.:668.5 3rd Qu.:38.00 3rd Qu.:1.000
#> Max. :891.0 Max. :80.00 Max. :8.000
#> NA's :177
#> Parch Fare Embarked
#> Min. :0.0000 Min. : 0.00 C :168
#> 1st Qu.:0.0000 1st Qu.: 7.91 Q : 77
#> Median :0.0000 Median : 14.45 S :644
#> Mean :0.3816 Mean : 32.20 NA's: 2
#> 3rd Qu.:0.0000 3rd Qu.: 31.00
#> Max. :6.0000 Max. :512.33
#>
cv.folds <- vfold_cv(train.df, v = 4, strata = Survived)
cv.folds
#> # 4-fold cross-validation using stratification
#> # A tibble: 4 x 2
#> splits id
#> <list> <chr>
#> 1 <split [667/224]> Fold1
#> 2 <split [668/223]> Fold2
#> 3 <split [669/222]> Fold3
#> 4 <split [669/222]> Fold4
#########################################################
# Logistic Regression Model -- This Works
# Tuning Recipe Parameters: Yes
# Tuning Model Hyperparameters: No
recipe.logistic.regression <-
recipe(Survived ~ ., data = train.df) %>%
update_role(PassengerId, new_role = 'ID') %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_impute_knn(all_predictors(), neighbors = tune()) %>%
step_normalize(all_predictors()) %>%
step_downsample(Survived, seed = 456)
spec.logistic.regression <-
logistic_reg() %>%
set_engine("glm")
wf.logistic.regression <-
workflow() %>%
add_recipe(recipe.logistic.regression) %>%
add_model(spec.logistic.regression)
wf.logistic.regression
#> == Workflow ====================================================================
#> Preprocessor: Recipe
#> Model: logistic_reg()
#>
#> -- Preprocessor ----------------------------------------------------------------
#> 4 Recipe Steps
#>
#> * step_dummy()
#> * step_impute_knn()
#> * step_normalize()
#> * step_downsample()
#>
#> -- Model -----------------------------------------------------------------------
#> Logistic Regression Model Specification (classification)
#>
#> Computational engine: glm
rs.logistic.regression <- tune_race_anova(
wf.logistic.regression,
resamples = cv.folds,
grid = 25,
metrics = metric_set(accuracy),
control = control_race(verbose = TRUE, verbose_elim = TRUE,
parallel_over = "everything",
save_pred = TRUE,
save_workflow = TRUE)
)
#> i Racing will maximize the accuracy metric.
#> i Resamples are analyzed in a random order.
#> i Fold4: 1 eliminated; 9 candidates remain.
show_best(rs.logistic.regression)
#> # A tibble: 5 x 7
#> neighbors .metric .estimator mean n std_err .config
#> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 9 accuracy binary 0.791 4 0.0193 Preprocessor01_Model1
#> 2 2 accuracy binary 0.788 4 0.0186 Preprocessor08_Model1
#> 3 4 accuracy binary 0.788 4 0.0190 Preprocessor09_Model1
#> 4 1 accuracy binary 0.787 4 0.0205 Preprocessor05_Model1
#> 5 10 accuracy binary 0.787 4 0.0205 Preprocessor10_Model1
#########################################################
# Random Forest Model A -- This Works
# Tuning Recipe Parameters: No
# Tuning Model Hyperparameters: Yes
recipe.random.forest.a <-
recipe(Survived ~ ., data = train.df) %>%
update_role(PassengerId, new_role = 'ID') %>%
step_impute_knn(all_predictors(),
neighbors = 5) %>% # <-- Manually setting value for neighbors
step_downsample(Survived, seed = 456)
spec.random.forest.a <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = tune()) %>%
set_mode("classification") %>%
set_engine("ranger")
wf.random.forest.a <-
workflow() %>%
add_recipe(recipe.random.forest.a) %>%
add_model(spec.random.forest.a)
wf.random.forest.a
#> == Workflow ====================================================================
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> -- Preprocessor ----------------------------------------------------------------
#> 2 Recipe Steps
#>
#> * step_impute_knn()
#> * step_downsample()
#>
#> -- Model -----------------------------------------------------------------------
#> Random Forest Model Specification (classification)
#>
#> Main Arguments:
#> mtry = tune()
#> trees = tune()
#> min_n = tune()
#>
#> Computational engine: ranger
rs.random.forest.a <- tune_race_anova(
wf.random.forest.a,
resamples = cv.folds,
grid = 25,
metrics = metric_set(accuracy),
control = control_race(verbose = TRUE, verbose_elim = TRUE,
parallel_over = "everything",
save_pred = TRUE,
save_workflow = TRUE)
)
#> i Creating pre-processing data to finalize unknown parameter: mtry
#> i Racing will maximize the accuracy metric.
#> i Resamples are analyzed in a random order.
#> i Fold4: 4 eliminated; 21 candidates remain.
show_best(rs.random.forest.a)
#> # A tibble: 5 x 9
#> mtry trees min_n .metric .estimator mean n std_err .config
#> <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 4 837 18 accuracy binary 0.818 4 0.00685 Preprocessor1_Model~
#> 2 4 1968 16 accuracy binary 0.817 4 0.00738 Preprocessor1_Model~
#> 3 4 1439 25 accuracy binary 0.817 4 0.00664 Preprocessor1_Model~
#> 4 3 1769 10 accuracy binary 0.816 4 0.0130 Preprocessor1_Model~
#> 5 3 1478 13 accuracy binary 0.816 4 0.0109 Preprocessor1_Model~
#########################################################
# Random Forest Model B -- This Does Not Work
# Tuning Recipe Parameters: Yes
# Tuning Model Hyperparameters: Yes
recipe.random.forest.b <-
recipe(Survived ~ ., data = train.df) %>%
update_role(PassengerId, new_role = 'ID') %>%
step_impute_knn(all_predictors(),
neighbors = tune()) %>% # <-- Tuning neighbors
step_downsample(Survived, seed = 456)
spec.random.forest.b <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = tune()) %>%
set_mode("classification") %>%
set_engine("ranger")
wf.random.forest.b <-
workflow() %>%
add_recipe(recipe.random.forest.b) %>%
add_model(spec.random.forest.b)
wf.random.forest.b
#> == Workflow ====================================================================
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> -- Preprocessor ----------------------------------------------------------------
#> 2 Recipe Steps
#>
#> * step_impute_knn()
#> * step_downsample()
#>
#> -- Model -----------------------------------------------------------------------
#> Random Forest Model Specification (classification)
#>
#> Main Arguments:
#> mtry = tune()
#> trees = tune()
#> min_n = tune()
#>
#> Computational engine: ranger
rs.random.forest.b <- tune_race_anova(
wf.random.forest.b,
resamples = cv.folds,
grid = 25,
metrics = metric_set(accuracy),
control = control_race(verbose = TRUE, verbose_elim = TRUE,
parallel_over = "everything",
save_pred = TRUE,
save_workflow = TRUE)
)
#> Error: Some tuning parameters require finalization but there are recipe parameters that require tuning. Please use `parameters()` to finalize the parameter ranges.
#########################################################
sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19041)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] parallel stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] titanic_0.1.0 doParallel_1.0.16 iterators_1.0.13 foreach_1.5.1
#> [5] finetune_0.1.0 themis_0.1.4 yardstick_0.0.8 workflowsets_0.1.0
#> [9] workflows_0.2.3 tune_0.1.6 rsample_0.1.0 recipes_0.1.16
#> [13] parsnip_0.1.7 modeldata_0.1.1 infer_0.5.4 dials_0.0.9
#> [17] scales_1.1.1 broom_0.7.9 tidymodels_0.1.3 forcats_0.5.1
#> [21] stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_2.0.0
#> [25] tidyr_1.1.3 tibble_3.1.3 ggplot2_3.3.5 tidyverse_1.3.1
#>
#> loaded via a namespace (and not attached):
#> [1] minqa_1.2.4 colorspace_2.0-2 ellipsis_0.3.2 class_7.3-19
#> [5] fs_1.5.0 rstudioapi_0.13 listenv_0.8.0 furrr_0.2.3
#> [9] ParamHelpers_1.14 prodlim_2019.11.13 fansi_0.5.0 lubridate_1.7.10
#> [13] ranger_0.13.1 xml2_1.3.2 codetools_0.2-18 splines_4.1.0
#> [17] knitr_1.33 jsonlite_1.7.2 nloptr_1.2.2.2 pROC_1.17.0.1
#> [21] dbplyr_2.1.1 compiler_4.1.0 httr_1.4.2 backports_1.2.1
#> [25] assertthat_0.2.1 Matrix_1.3-4 cli_3.0.1 htmltools_0.5.1.1
#> [29] tools_4.1.0 gtable_0.3.0 glue_1.4.2 RANN_2.6.1
#> [33] parallelMap_1.5.1 fastmatch_1.1-3 Rcpp_1.0.7 cellranger_1.1.0
#> [37] styler_1.5.1 DiceDesign_1.9 vctrs_0.3.8 nlme_3.1-152
#> [41] timeDate_3043.102 mlr_2.19.0 gower_0.2.2 xfun_0.25
#> [45] globals_0.14.0 lme4_1.1-27.1 rvest_1.0.1 lifecycle_1.0.0
#> [49] future_1.21.0 MASS_7.3-54 ipred_0.9-11 hms_1.1.0
#> [53] BBmisc_1.11 yaml_2.2.1 rpart_4.1-15 stringi_1.7.3
#> [57] highr_0.9 checkmate_2.0.0 lhs_1.1.1 boot_1.3-28
#> [61] hardhat_0.1.6 lava_1.6.9 rlang_0.4.11 pkgconfig_2.0.3
#> [65] evaluate_0.14 lattice_0.20-44 tidyselect_1.1.1 parallelly_1.27.0
#> [69] plyr_1.8.6 magrittr_2.0.1 R6_2.5.0 generics_0.1.0
#> [73] DBI_1.1.1 pillar_1.6.2 haven_2.4.3 withr_2.4.2
#> [77] survival_3.2-11 nnet_7.3-16 ROSE_0.0-4 modelr_0.1.8
#> [81] crayon_1.4.1 unbalanced_2.0 utf8_1.2.2 tzdb_0.1.2
#> [85] rmarkdown_2.10 grid_4.1.0 readxl_1.3.1 data.table_1.14.0
#> [89] FNN_1.1.3 reprex_2.0.1 digest_0.6.27 munsell_0.5.0
#> [93] GPfit_1.0-8
Created on 2021-08-07 by the reprex package (v2.0.1)