-1

Everything works fine, as long as I don't use factor data (my original data contains 8500 rows and more columns):

# Example data (60 rows): numeric outcome `p2p`, numeric predictor `Age`,
# integer predictor `ank_hour`, and a two-level factor `class`
# ("hexp" / "mid").
# The result is assigned to `ml_fall` because the modelling code that follows
# reads the data under that name; without the assignment the data frame is
# only printed and `ml_fall` is undefined.
ml_fall <- data.frame(
  p2p = c(40,69,65,99,27,34,22,24,25,54,54,
          58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
          67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
          26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
          34,28,67),
  Age = c(75,27,27,49,56,14,59,53,57,27,31,
          52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
          81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
          51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
          71,38),
  ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
               6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
               6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
               6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
               9L,6L,6L),
  class = as.factor(c("hexp","hexp","hexp",
                      "hexp","mid","mid","mid","mid","hexp","mid",
                      "mid","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","mid",
                      "hexp","hexp","mid","hexp","mid","mid","mid",
                      "mid","hexp","hexp","hexp","hexp","mid","mid",
                      "mid","mid","mid","mid","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp"))
)
# Fix the RNG so the stratified train/test split is reproducible.
set.seed(1234)
fall_split <- initial_split(ml_fall, strata = p2p)

hc_test <- testing(fall_split)
hc_train <- training(fall_split)


# Linear regression specification using the "lm" engine.
lm_spec <- set_engine(linear_reg(), engine = "lm")
lm_spec

# Dummy-encode every nominal predictor and train the recipe on hc_train.
# `skip = TRUE` is kept on purpose: it is the setting this question is about
# (the step is then not applied when new data are baked, so the dummy
# columns the model was fitted on are missing at prediction time).
fall_rec <- prep(
  step_dummy(
    recipe(p2p ~ ., data = hc_train),
    all_nominal(), -all_outcomes(),
    skip = TRUE
  )
)

# Fit on the processed training set extracted from the trained recipe.
lm_fit <- fit(lm_spec, p2p ~ ., data = juice(fall_rec))

If I then use:

# Predict on the raw training rows (hc_train has not been passed through the
# recipe here).
results_train <- predict(lm_fit, new_data = hc_train)

I get the error: Fehler in eval(predvars, data, env) : Objekt 'class_hexp' nicht gefunden (in English: Error in eval(predvars, data, env) : object 'class_hexp' not found)

I can't find my mistake. Unused levels are dropped, and the column names don't contain '-' ...

Peter Hahn
  • 148
  • 8
  • 1
    Hi @Peter Hahn, did converting the class column and changing the column names solve your issue? – Quinten Jul 17 '22 at 10:02
  • Not really. If I run your example in a separate worksheet, it works. But I have more categorical variables. I don't understand why the issue happens. I build dummy variables from factors. Why doesn't predict find them? – Peter Hahn Jul 17 '22 at 10:09
  • 1
    Is it possible that you share slightly more data? because now you only have one of the factors in one of your splits. – Quinten Jul 17 '22 at 10:20
  • I understand one more part. predict needs the data modified by recipe: train_proc <- bake(fall_rec, new_data = NULL) then it works. But if I apply bake to my test data the recipe doesn't work: dummy variables are not applied – Peter Hahn Jul 17 '22 at 10:41
  • I tried the same thing using workflows. The problem remains the same: the steps of the recipe are not applied to the test data. If I use bake(fall_rec, new_data = hc_train), the result is wrong: the data come back without processing. If I use new_data = NULL, it works. The problem is within tidymodels; processing steps are not applied. I use the newest tidymodels – Peter Hahn Jul 17 '22 at 11:15

2 Answers

1

Finally I used workflows and removed skip = TRUE from the recipe.

library(workflows)

# Fix the RNG so the stratified train/test split is reproducible.
set.seed(1234)
fall_split <- ml_fall %>%
  initial_split(strata = p2p)

hc_train <- training(fall_split)
hc_test <- testing(fall_split)

# Linear regression specification using the "lm" engine.
lm_spec <- linear_reg() %>%
  set_engine(engine = "lm") %>%
  set_mode("regression")
lm_spec


#### Recipe

# Dummy-encode every nominal predictor. `skip` is left at its default so the
# step is applied to new data as well as to the training data.
# The recipe is intentionally NOT prep()ed here: a workflow trains its recipe
# itself inside fit(), and add_recipe() expects an untrained recipe.
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
  step_dummy(all_nominal(), -all_outcomes())
fall_rec


### Workflow

# Bundle preprocessing and model so that fit() and predict() both run the
# recipe on whatever data they are given.
lm_wflow <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(fall_rec)
lm_wflow

lm_fit <- fit(lm_wflow, data = hc_train)
lm_fit

# Note: despite its name, `results_train` holds predictions for the TEST set
# (the name is kept unchanged from the original snippet), with the observed
# outcome added as `truth` for comparison.
results_train <- predict(lm_fit, new_data = hc_test) %>%
  mutate(truth = hc_test$p2p)
Peter Hahn
  • 148
  • 8
  • Peter's solution is best. There's no real need for users to modify the `skip` argument; we've got those defaults set appropriately for the recipe steps that we provide. It's really only there for niche cases (and this is a pretty standard data set). – topepo Jul 25 '22 at 15:11
0

You should convert your "class" column to numeric. The column is named "class_mid" in the fitted model, so you should also rename the column in your training data to "class_mid", like this:

# Example data: outcome `p2p`, predictors `Age` and `ank_hour` (integer),
# and a factor predictor `class` with the values "hexp" and "mid".
ml_fall <- data.frame(
  p2p = c(40,69,65,99,27,34,22,24,25,54,54,
          58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
          67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
          26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
          34,28,67),
  Age = c(75,27,27,49,56,14,59,53,57,27,31,
          52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
          81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
          51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
          71,38),
  ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
               6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
               6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
               6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
               9L,6L,6L),
  class = as.factor(c("hexp","hexp","hexp",
                      "hexp","mid","mid","mid","mid","hexp","mid",
                      "mid","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","mid",
                      "hexp","hexp","mid","hexp","mid","mid","mid",
                      "mid","hexp","hexp","hexp","hexp","mid","mid",
                      "mid","mid","mid","mid","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp"))
)

library(tidymodels)
# Fix the RNG so the stratified train/test split is reproducible.
set.seed(1234)
fall_split <- initial_split(ml_fall, strata = p2p)
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 3 breaks instead.

hc_test <- testing(fall_split)
hc_train <- training(fall_split)

# Linear regression specification: "lm" engine, regression mode.
lm_spec <- set_mode(
  set_engine(linear_reg(), engine = "lm"),
  "regression"
)
lm_spec
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

# Dummy-encode every nominal predictor and train the recipe on hc_train.
# `skip = TRUE` is retained from the question's recipe; with it the step is
# not applied when new data are baked, which is why the dummy column is
# rebuilt by hand further down.
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
  step_dummy(all_nominal(), -all_outcomes(), skip = TRUE) %>%
  prep()

# Fit on the processed training set stored in the trained recipe.
lm_fit <- lm_spec %>%
  fit(p2p ~ .,
      data = bake(fall_rec, new_data = NULL)
  )

# Rebuild the dummy column the model was fitted on.
# - Rename only the `class` column (by name, not by position) to the name
#   used in the fit, `class_mid`.
# - Encode it as a 0/1 indicator (1 = "mid", 0 = "hexp"). Calling
#   as.numeric() directly on the factor would return its level codes (1/2)
#   instead, shifting every prediction by one `class_mid` coefficient.
names(hc_train)[names(hc_train) == "class"] <- "class_mid"
hc_train$class_mid <- as.numeric(hc_train$class_mid == "mid")

results_train <- lm_fit %>%
  predict(new_data = hc_train)

results_train
# NOTE: the printed values below are from the original reprex run, which
# passed the factor's level codes (1/2) to predict(); re-run to refresh them.
#> # A tibble: 45 × 1
#>    .pred
#>    <dbl>
#>  1  51.0
#>  2  49.3
#>  3  48.2
#>  4  46.0
#>  5  43.5
#>  6  48.1
#>  7  47.7
#>  8  26.3
#>  9  31.8
#> 10  37.7
#> # … with 35 more rows

Created on 2022-07-16 by the reprex package (v2.0.1)

Quinten
  • 35,235
  • 5
  • 20
  • 53