I am trying to predict the probability of two_year_recid
by estimating a logit regression (with no penalty) that includes a flexible list of controls excluding decile_score
and race_factor
, but I keep getting an error saying
Error in eval_tidy(f[[2]], dat) : object '.' not found
this shows up on the line that starts with fit_full
of the code chunk bellow
rec_full <- recipe(
two_year_recid ~ .,
data = train
) %>%
step_dummy(all_nominal()) %>%
step_interact(~ all_predictors() * all_predictors()) %>%
step_poly(age, degree = 3) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
set_engine('glm')
wf_full <- workflow() %>%
add_recipe(rec_full) %>%
add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test <- test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred))
The data I am using and the cleaning I did
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
## Main working data
df <- raw %>%
filter(days_b_screening_arrest <= 30) %>%
filter(days_b_screening_arrest >= -30) %>%
filter(is_recid != -1) %>%
filter(c_charge_degree != "O") %>%
filter(score_text != 'N/A')
## clean main working data a bit more
df <- df %>%
mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
charge_factor = fct_explicit_na(c_charge_desc),
race_factor = fct_explicit_na(race),
race_factor = fct_relevel(race_factor, "Caucasian"),
charge_factor = fct_lump_min(charge_factor, 30),
sex_factor = factor(sex, levels = c("Female","Male")),
priors_factor = ifelse(priors_count > 20, 20, priors_count),
priors_factor = factor(priors_factor),
two_year_recid = factor(two_year_recid)) %>%
select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1,10,11)]
dfn = subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, p = 0.75)
train <- training(split)
test <- testing(split)
And the libraries I am using
library(tidyverse)
library(tidymodels)
library(AER)