Trying to predict the probability of a binary variable being equal to 1 using tidymodels

Question

I am trying to predict the probability of two_year_recid by estimating a logit regression (with no penalty) that includes a flexible list of controls excluding decile_score and race_factor, but I keep getting an error saying

Error in eval_tidy(f[[2]], dat) : object '.' not found

This shows up on the line that starts with fit_full in the code chunk below:

# Preprocessing recipe: model two_year_recid on every other column of `train`.
rec_full <- recipe(
  two_year_recid ~ ., 
  data = train
  ) %>% 
  # NOTE: all_nominal() also selects the factor outcome two_year_recid, so the
  # outcome itself is dummy-encoded here; this is what makes fit() fail below.
  step_dummy(all_nominal()) %>% 
  # Interactions between all predictors.
  step_interact(~ all_predictors() * all_predictors()) %>% 
  # Degree-3 polynomial terms for age.
  step_poly(age, degree = 3) %>% 
  step_normalize(all_predictors()) %>% 
  # Filter near-zero-variance predictors.
  step_nzv(all_predictors())

# Logistic regression specification using the glm engine.
mod_lm <- logistic_reg() %>% 
  set_engine('glm')

# Workflow bundling the recipe and the model specification.
wf_full <- workflow() %>% 
  add_recipe(rec_full) %>% 
  add_model(mod_lm)

# The reported error ("object '.' not found") is raised on this line.
fit_full <- wf_full %>% fit(data = train)

# Overwrites `test` with the outcome column plus the predictions.
# NOTE(review): the working version of this call renames `.pred_class`, not
# `.pred` -- confirm which column predict() returns for this model.
test <- test %>%
  select(two_year_recid) %>% 
  bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred))

The data I am using and the cleaning I did

# Download the ProPublica COMPAS two-year scores file.
raw <- "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv" %>%
  read_csv()

## Main working data
# All conditions must hold for a row to be kept (they are AND-ed together).
df <- filter(
  raw,
  days_b_screening_arrest <= 30,
  days_b_screening_arrest >= -30,
  is_recid != -1,
  c_charge_degree != "O",
  score_text != 'N/A'
)

## clean main working data a bit more
# Derive the modelling columns, then keep only the outcome and the columns
# used downstream (column order matters: feature_names indexes by position).
df <- df %>%
  mutate(
    # Jail stay in days. Columns are referenced bare rather than as df$...,
    # so the values always come from the data flowing through the pipe.
    length_of_stay = as.numeric(as.Date(c_jail_out) - as.Date(c_jail_in)),
    # Missing charge descriptions become an explicit level; levels seen fewer
    # than 30 times are then lumped together.
    charge_factor = fct_lump_min(fct_explicit_na(c_charge_desc), 30),
    # "Caucasian" is moved to the front, making it the reference level.
    race_factor = fct_relevel(fct_explicit_na(race), "Caucasian"),
    sex_factor = factor(sex, levels = c("Female", "Male")),
    # Prior counts capped at 20, as a factor (not retained by select() below).
    priors_factor = factor(ifelse(priors_count > 20, 20, priors_count)),
    two_year_recid = factor(two_year_recid)
  ) %>%
  select(
    two_year_recid, age, sex_factor, juv_fel_count, juv_misd_count,
    juv_other_count, priors_count, c_charge_degree, charge_factor,
    race_factor, decile_score, length_of_stay
  )

# Names of every column of df except positions 1, 10 and 11.
feature_names <- names(df[-c(1, 10, 11)])


# Modelling data: df without decile_score and race_factor.
dfn <- df %>% select(-c(decile_score, race_factor))

# Fix the RNG seed so the train/test partition is reproducible.
set.seed(5281110)

# 75% / 25% train/test partition. The proportion argument is spelled out as
# `prop`; the abbreviated `p` only works through partial argument matching.
split <- initial_split(dfn, prop = 0.75)
train <- training(split)
test  <- testing(split)

And the libraries I am using

library(tidyverse)
library(tidymodels)
library(AER)

score 1 · Accepted Answer · answered Dec 09 '20 at 17:35

When you added the step step_dummy(all_nominal()), it selected your outcome two_year_recid and turned it into a dummy variable, because the outcome is a nominal variable. Be sure to say you do not want to select it, either by excluding it explicitly via -two_year_recid or by using -all_outcomes(). Then your model will fit and predict:

library(tidymodels)
library(tidyverse)

# Download the ProPublica COMPAS two-year scores file.
raw <- "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv" %>%
  read_csv()
#> Warning: Duplicated column names deduplicated: 'decile_score' =>
#> 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   .default = col_character(),
#>   id = col_double(),
#>   compas_screening_date = col_date(format = ""),
#>   dob = col_date(format = ""),
#>   age = col_double(),
#>   juv_fel_count = col_double(),
#>   decile_score = col_double(),
#>   juv_misd_count = col_double(),
#>   juv_other_count = col_double(),
#>   priors_count = col_double(),
#>   days_b_screening_arrest = col_double(),
#>   c_jail_in = col_datetime(format = ""),
#>   c_jail_out = col_datetime(format = ""),
#>   c_offense_date = col_date(format = ""),
#>   c_arrest_date = col_date(format = ""),
#>   c_days_from_compas = col_double(),
#>   is_recid = col_double(),
#>   r_days_from_arrest = col_double(),
#>   r_offense_date = col_date(format = ""),
#>   r_jail_in = col_date(format = ""),
#>   r_jail_out = col_date(format = "")
#>   # ... with 14 more columns
#> )
#> ℹ Use `spec()` for the full column specifications.

## Main working data
# All conditions must hold for a row to be kept (they are AND-ed together).
df <- filter(
  raw,
  days_b_screening_arrest <= 30,
  days_b_screening_arrest >= -30,
  is_recid != -1,
  c_charge_degree != "O",
  score_text != 'N/A'
)

## clean main working data a bit more
# Derive the modelling columns, then keep only the outcome and the columns
# used downstream (column order matters: feature_names indexes by position).
df <- df %>%
  mutate(
    # Jail stay in days. Columns are referenced bare rather than as df$...,
    # so the values always come from the data flowing through the pipe.
    length_of_stay = as.numeric(as.Date(c_jail_out) - as.Date(c_jail_in)),
    # Missing charge descriptions become an explicit level; levels seen fewer
    # than 30 times are then lumped together.
    charge_factor = fct_lump_min(fct_explicit_na(c_charge_desc), 30),
    # "Caucasian" is moved to the front, making it the reference level.
    race_factor = fct_relevel(fct_explicit_na(race), "Caucasian"),
    sex_factor = factor(sex, levels = c("Female", "Male")),
    # Prior counts capped at 20, as a factor (not retained by select() below).
    priors_factor = factor(ifelse(priors_count > 20, 20, priors_count)),
    two_year_recid = factor(two_year_recid)
  ) %>%
  select(
    two_year_recid, age, sex_factor, juv_fel_count, juv_misd_count,
    juv_other_count, priors_count, c_charge_degree, charge_factor,
    race_factor, decile_score, length_of_stay
  )

# Names of every column of df except positions 1, 10 and 11.
feature_names <- names(df[-c(1, 10, 11)])


# Modelling data: df without decile_score and race_factor.
dfn <- df %>% select(-c(decile_score, race_factor))

# Fix the RNG seed so the train/test partition is reproducible.
set.seed(5281110)

# 75% / 25% train/test partition. The proportion argument is spelled out as
# `prop`; the abbreviated `p` only works through partial argument matching.
split <- initial_split(dfn, prop = 0.75)
train <- training(split)
test  <- testing(split)

# Recipe: dummy-encode the nominal predictors (the outcome is excluded from
# the selection), add interactions between all predictors and degree-3
# polynomial terms for age, then normalize and filter near-zero-variance
# predictors.
rec_full <- recipe(two_year_recid ~ ., data = train) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_interact(~ all_predictors() * all_predictors()) %>%
  step_poly(age, degree = 3) %>%
  step_normalize(all_predictors()) %>%
  step_nzv(all_predictors())

# Logistic regression specification using the glm engine.
mod_lm <- set_engine(logistic_reg(), 'glm')

# Workflow bundling the model specification and the recipe.
wf_full <- workflow() %>%
  add_model(mod_lm) %>%
  add_recipe(rec_full)

# Fit the workflow on the training data.
fit_full <- fit(wf_full, data = train)

# Show the observed outcome next to the predicted class; the prediction
# column .pred_class is renamed to `full`.
# NOTE(review): for predicted probabilities rather than classes, parsnip's
# predict() accepts type = "prob" -- confirm the resulting column names
# before renaming.
predict(fit_full, new_data = test) %>%
  rename(full = .pred_class) %>%
  bind_cols(select(test, two_year_recid), .)
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> # A tibble: 1,543 x 2
#>    two_year_recid full 
#>    <fct>          <fct>
#>  1 1              0    
#>  2 0              0    
#>  3 0              0    
#>  4 1              1    
#>  5 1              1    
#>  6 1              1    
#>  7 1              1    
#>  8 1              0    
#>  9 0              0    
#> 10 1              0    
#> # … with 1,533 more rows

Created on 2020-12-09 by the reprex package (v0.3.0.9001)

Trying to predict the probability of a binary variable being equal to 1 using tidymodels

1 Answers1