I want to try the LightGBM algorithm using tidymodels and the treesnip package. Some preprocessing first...
# remotes::install_github("curso-r/treesnip")
# install.packages("titanic")
library(tidymodels)
library(stringr)
library(titanic)
data("titanic_train")
df <- titanic_train %>%
  as_tibble() %>%
  # extract the honorific ("Mr.", "Mlle.", ...) from Name and drop the trailing dot
  mutate(title = str_extract(Name, "\\w+\\.") %>% str_replace(fixed("."), "")) %>%
  # collapse the rare titles into a few common levels
  mutate(title = case_when(title %in% c('Mlle', 'Ms') ~ 'Miss',
                           title == 'Mme' ~ 'Mrs',
                           title %in% c('Capt', 'Don', 'Major', 'Sir', 'Jonkheer', 'Col') ~ 'Sir',
                           title %in% c('Dona', 'Lady', 'Countess') ~ 'Lady',
                           TRUE ~ title)) %>%
  mutate(title = as.factor(title),
         Survived = factor(Survived, levels = c(0, 1), labels = c("no", "yes")),
         Sex = as.factor(Sex),
         Pclass = factor(Pclass)) %>%
  select(-c(PassengerId, Ticket, Cabin, Name)) %>%
  mutate(Embarked = as.factor(Embarked))
table(df$title,df$Sex)
trnTst <- initial_split(data = df, prop = .8, strata = Survived)
cv.folds <- training(trnTst) %>%
  vfold_cv(data = ., v = 4, repeats = 1)
cv.folds
rec <- recipe(Survived ~ ., data = training(trnTst)) %>%
  step_nzv(all_predictors()) %>%
  step_knnimpute(Age, neighbors = 3, impute_with = vars(title, Fare, Pclass))
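To confirm the preprocessing itself runs cleanly, the recipe can be prepped and juiced on the training data (this quick check is my addition, not part of the original post):

# sanity check (my addition): prep the recipe and inspect the processed training data
rec %>%
  prep() %>%
  juice() %>%
  glimpse()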
To check that the problem is not in the data, I successfully tuned a random forest model.
m.rf <- rand_forest(trees = 1000, min_n = tune(), mtry = tune()) %>%
  set_mode(mode = 'classification') %>%
  set_engine('ranger')
wf.rf <- workflow() %>% add_recipe(rec) %>% add_model(m.rf)
(cls <- parallel::makeCluster(parallel::detectCores() - 1))
doParallel::registerDoParallel(cl = cls)
tn.rf <- tune_grid(wf.rf, resamples = cv.folds, grid = 20,
                   metrics = metric_set(accuracy, roc_auc))
parallel::stopCluster(cls)  # stop the explicit cluster registered above
autoplot(tn.rf)
wf.rf <- finalize_workflow(x = wf.rf, parameters = select_best(tn.rf, metric = 'roc_auc'))
res.rf <- fit_resamples(wf.rf, resamples = cv.folds, metrics = metric_set(accuracy, roc_auc))
res.rf %>% collect_metrics()
But LightGBM raises an error even without tuning or parallel processing.
According to "How to Use Lightgbm with Tidymodels":
In contrast to XGBoost, both lightgbm and catboost are very capable of handling categorical variables (factors) and so you don’t need to turn variables into dummies (one hot encode), in fact you shouldn’t do it, it makes everything slower and might give you worse performance.
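To make that concrete, here is a minimal sketch (my addition, not from the blog post): an xgboost recipe would need an extra dummy-encoding step, whereas the lightgbm workflow below uses the recipe above unchanged, factors included.

# hypothetical xgboost variant (for contrast only): one-hot encode the nominal predictors
rec_xgb <- rec %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
# for lightgbm/catboost via treesnip, `rec` is used as-is, with the factor columns kept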
library(treesnip) # lightgbm & catboost connector
m.lgbm <- boost_tree() %>%   # trees = tune(), min_n = tune() left out for now
  set_mode(mode = 'classification') %>%
  set_engine('lightgbm')
wf.lgbm <- workflow() %>% add_recipe(rec) %>% add_model(m.lgbm)
res.lgbm <- fit_resamples(wf.lgbm, resamples = cv.folds)
Warning message:
All models failed. See the `.notes` column.
res.lgbm$.notes[[1]]
internal: Error in pkg_list[[1]]: subscript out of bounds
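The failure seems to happen in a package lookup rather than in the model fit itself. One way to narrow it down (my suggestion, not part of the original post) is to check that the lightgbm R package is installed and can fit a small model outside of tidymodels; if this direct call works, the problem is likely on the treesnip/parsnip side rather than in the data.

# diagnostic sketch (my addition): smoke-test the lightgbm installation directly
library(lightgbm)
df.cc <- tidyr::drop_na(df)                          # complete cases only, to keep it simple
x <- model.matrix(Survived ~ . - 1, data = df.cc)    # numeric matrix for the smoke test
y <- as.integer(df.cc$Survived) - 1L                 # 0/1 labels
fit <- lightgbm(data = x, label = y,
                params = list(objective = "binary"),
                nrounds = 10, verbose = -1)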