
Overview

I have produced four models using the tidymodels package with the data frame FID (see below):

  1. General Linear Model
  2. Bagged Tree
  3. Random Forest
  4. Boosted Trees

The data frame contains three predictors:

  1. Year (numeric)
  2. Month (Factor)
  3. Days (numeric)

The dependent variable is Frequency (numeric).

Aim

My aim is to generate model predictions, extracting the class and probability values from all fitted models, each of which has undergone 10-fold cross-validation.

I am attempting to use the functions prep(), juice(), and bake() to generate the correct data objects for model prediction, following the tutorial below.

Tutorial (see screenshots below)

https://meghan.rbind.io/post/tidymodels-intro/

After producing the model prediction values (i.e. class and probability) for all four models, the ultimate aim is to produce confusion matrices and receiver operating characteristic (ROC) curves to evaluate all models. Therefore, I need to bind the true values from the testing data with the class and probability columns extracted from these model predictions.

Issue

I am trying to run the predict() function to produce the class and probability values from the tutorial (see the screenshots below and the link above), but I am getting the error messages below.

Error Messages

##Class prediction object
Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"

##Probability prediction object
Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"

If anyone is able to help, I would be deeply appreciative.

Many thanks in advance.

Screenshots from the tutorial

[Two screenshots from the tutorial illustrating prep()/juice()/bake() and predict()]

R-code

    ##################################################
    ##Model Prediction
    ###################################################
    ##Load the required packages
    library(tidymodels)
    library(tidyverse) # data manipulation
    library(glmnet)    # regularized regression
    library(parsnip)
    library(rpart)
    library(skimr)     # data summaries
    library(baguette)  # bagged trees
    library(future)    # parallel processing to decrease computation time
    library(xgboost)   # boosted trees
    library(ranger)    # random forests
    library(yardstick) # model metrics
    library(purrr)
    library(forcats)

###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

# resample the training data with 10-fold cross-validation
cv <- vfold_cv(train_data, v=10)

###########################################################
##Produce the recipe

rec <- recipe(Frequency ~ ., data = FID) %>% 
          step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove zero-variance predictors
          step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels 
          step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars"))  %>% # replaces missing numeric observations with the median
          step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables

###########################################################
##Create Models
###########################################################

##########################################################
##General Linear Models
#########################################################

##glm
mod_glm<-linear_reg(mode="regression",
                       penalty = 0.1, 
                       mixture = 1) %>% 
                            set_engine("glmnet")

##Create workflow
wflow_glm <- workflow() %>% 
                add_recipe(rec) %>%
                      add_model(mod_glm)

##Fit the model
plan(multisession)

fit_glm <- fit_resamples(
                        wflow_glm,
                        cv,
                        metrics = metric_set(rmse, rsq),
                        control = control_resamples(save_pred = TRUE,
                              extract = function(x) extract_model(x)))

##########################################################
##Bagged Trees
##########################################################

#####Bagged Trees
mod_bag <- bag_tree() %>%
            set_mode("regression") %>%
              set_engine("rpart", times = 10) #10 bootstrap resamples
                

##Create workflow
wflow_bag <- workflow() %>% 
                   add_recipe(rec) %>%
                       add_model(mod_bag)

##Fit the model
plan(multisession)

fit_bag <- fit_resamples(
                      wflow_bag,
                      cv,
                      metrics = metric_set(rmse, rsq),
                      control = control_resamples(save_pred = TRUE,
                              extract = function(x) extract_model(x)))
###################################################
##Random forests
###################################################

mod_rf <-rand_forest(trees = 1e3) %>%
                              set_engine("ranger",
                              num.threads = parallel::detectCores(), 
                              importance = "permutation", 
                              verbose = TRUE) %>% 
                              set_mode("regression") 
                              
##Create Workflow

wflow_rf <- workflow() %>% 
               add_model(mod_rf) %>% 
                     add_recipe(rec)

##Fit the model

plan(multisession)

fit_rf<-fit_resamples(
             wflow_rf,
             cv,
             metrics = metric_set(rmse, rsq),
             control = control_resamples(save_pred = TRUE,
                                         extract = function(x) extract_model(x)))

############################################################
##Boosted Trees
############################################################

mod_boost <- boost_tree() %>% 
                 set_engine("xgboost", nthread = parallel::detectCores()) %>% 
                      set_mode("regression")

##Create Workflow

wflow_boost <- workflow() %>% 
                  add_recipe(rec) %>% 
                    add_model(mod_boost)

##Fit model

plan(multisession)

fit_boost <-fit_resamples(
                       wflow_boost,
                       cv,
                       metrics = metric_set(rmse, rsq),
                       control = control_resamples(save_pred = TRUE,
                                         extract = function(x) extract_model(x)))

##################################################
##Prep the models for model prediction
##################################################

# Extract our prepped training data 
# and "bake" our testing data

rec_prep <- prep(rec)

training_baked <- juice(rec_prep)

testing_baked <- rec_prep %>% bake(new_data = test_data)

# Run the model with our training data
# Find the class predictions from our testing data
# And add back in the true values from testing data

predictions_class <- fit_glm %>%
                              predict(new_data = testing_baked) %>%
                                  bind_cols(testing_baked %>% dplyr::select(Frequency))

##Error message

  Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"
    
# Find the probability predictions
# And add all together

predictions_Prob <- fit_glm %>%
                        predict(testing_baked, type = "prob") %>%
                              bind_cols(predictions_class)

##Error message

  Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"

Data frame - FID

structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015, 
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016, 
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March", 
"April", "May", "June", "July", "August", "September", "October", 
"November", "December"), class = "factor"), Frequency = c(36, 
28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9, 
7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27, 
43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15, 
29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31, 
28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA, 
-36L), class = "data.frame")
Alice Hobbs

2 Answers


If your outcome or dependent variable is numeric, then you will not get out classes or probabilities from prediction; you will get out predicted values for the outcome. It isn't appropriate to make ROC curves or confusion matrices for regression problems; these only apply to classification problems.

Instead, you can make plots where you graph the true value on the x-axis and the predicted value on the y-axis as shown in this chapter.

[Example plot from the chapter: true values on the x-axis, predicted values on the y-axis]
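Since the models were fitted with fit_resamples() and save_pred = TRUE, the out-of-fold predictions can be pulled with collect_predictions() instead of predict(). A minimal sketch, assuming the fit_glm object from the question (the same pattern applies to fit_bag, fit_rf, and fit_boost):

    ## Minimal sketch: works because each model was resampled with
    ## control_resamples(save_pred = TRUE)
    glm_preds <- collect_predictions(fit_glm)
    
    ## One row per held-out observation across the 10 folds:
    ## .pred holds the predicted Frequency, Frequency holds the truth
    ggplot(glm_preds, aes(x = Frequency, y = .pred)) +
        geom_abline(lty = 2) +
        geom_point(alpha = 0.5) +
        labs(x = "Observed Frequency", y = "Predicted Frequency") +
        coord_obs_pred()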

Julia Silge
  • Thank you for your suggestions, Julia. I am attempting to follow the tutorial you recommended (link above) to plot the RMSE and R2 values from my fitted models in the R code above. I feel very confused about deconstructing or un-nesting the fitted models in my R code in order to make model predictions using the predict() function. Do you know how? Is this possible? – Alice Hobbs Nov 26 '20 at 11:51
  • If you have used `fit_resamples()`, then you want to use [`collect_predictions()`](https://tune.tidymodels.org/reference/collect_predictions.html). You can do this because you fitted these models with `save_pred = TRUE`. [Here is an example showing that](https://smltar.com/mlregression.html#firstregressionevaluation). – Julia Silge Nov 29 '20 at 18:37
  • Hey Julia. I would like to thank you for your advice. I tried to follow the tutorial in the question above and I needed to adapt my code a little bit to produce this plot (below). I am not sure if this code is correct. Would it be possible to please ask for your opinion? – Alice Hobbs Dec 13 '20 at 14:59
  • I also adapted this question to extract the k-fold values using the function collect_predictions() to produce a plot of each fold. https://stackoverflow.com/questions/65274459/tidymodels-plotting-predicted-vs-true-values-using-the-functions-collect-predic – Alice Hobbs Dec 13 '20 at 15:00
  • My concern is that my plot below contains ten points and there are 36 rows in the data frame – Alice Hobbs Dec 13 '20 at 15:05

Answer inspired by Julia Silge

    ##################################################
    ##Model Prediction
    ###################################################
    ##Load the required packages
    library(tidymodels)
    library(tidyverse) # data manipulation
    library(glmnet)    # regularized regression
    library(parsnip)
    library(rpart)
    library(skimr)     # data summaries
    library(baguette)  # bagged trees
    library(future)    # parallel processing to decrease computation time
    library(xgboost)   # boosted trees
    library(ranger)    # random forests
    library(yardstick) # model metrics
    library(purrr)
    library(forcats)

###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

# resample the training data with 10-fold cross-validation
cv <- vfold_cv(train_data, v=10)

###########################################################
##Produce the recipe

rec <- recipe(Frequency ~ ., data = FID) %>% 
          step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove zero-variance predictors
          step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels 
          step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars"))  %>% # replaces missing numeric observations with the median
          step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables

###########################################################
##Create Models
###########################################################

##########################################################
##General Linear Models
#########################################################

    ##############################################################################
    ############################# Model Training/Tuning ###########################
    ###############################################################################
    
    ## Define a regularized regression and explicitly leave the tuning parameters
    ## empty for later tuning.
    glm_mod_1 <- 
           parsnip::linear_reg(penalty = tune::tune(), mixture = tune::tune()) %>%
          parsnip::set_engine("glmnet")
    
    ## Construct a workflow that combines your recipe and your model
    ml_wflow <-
              workflows::workflow() %>%
                   workflows::add_recipe(rec) %>%
                        workflows::add_model(glm_mod_1)
    
    # Tune the workflow over a grid of 10 candidate penalty/mixture combinations
    res <-
         ml_wflow %>%
               tune::tune_grid(resamples = cv,
                               grid = 10,
                               metrics = yardstick::metric_set(yardstick::rmse))
    
    ############################# Validation ######################################
    ###############################################################################
    
    best_params <-
             res %>%
                   tune::select_best(metric = "rmse") # rmse is minimized automatically
    
    #Refit using the entire training data
    reg_res <-
          ml_wflow %>%
              tune::finalize_workflow(best_params) %>%
              parsnip::fit(data = train_data)
    
    blue_test_res <- predict(reg_res,
                             new_data = test_data %>% dplyr::select(-Frequency))
    
    blue_test_res <- bind_cols(blue_test_res,
                               test_data %>% dplyr::select(Frequency))
    
    ##Open a plotting window
    
    dev.new()
    
    ##Plot model predictions
    ggplot(blue_test_res, aes(x = Frequency, y = .pred)) + 
      # Create a diagonal line:
        geom_abline(lty = 2) + 
        geom_point(alpha = 0.5) + 
        labs(y = "Predicted Frequency", x = "Observed Frequency") +
      # Scale and size the x- and y-axis uniformly:
      coord_obs_pred()

Plot

[Plot: predicted vs. observed Frequency values for the test set]
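To complement the plot with numeric scores, the same test-set predictions can be summarized with yardstick. A minimal sketch, assuming the blue_test_res tibble built above:

    ## Minimal sketch, assuming blue_test_res from above
    ## (.pred = predicted Frequency, Frequency = observed truth);
    ## for a numeric outcome, metrics() returns RMSE, R-squared, and MAE
    blue_test_res %>%
        yardstick::metrics(truth = Frequency, estimate = .pred)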

Alice Hobbs