How do I chart or extract feature importance after applying feature transformations such as the ones below? Or should I have avoided one-hot encoding? Because of the transformation (one-hot encoding), I ended up with more variable-importance values than there are input columns.
Thanks
Feature Transformation
# Unfitted pipeline: categorical encoding -> feature vector -> scaling -> model.
# The order of the stages matters: other code in this script looks stages up
# by position (stage 5 = vector assembler, stage 7 = random forest).
fea_pipeline <- ml_pipeline(sc) %>%
  # Stages 1-3: turn each categorical string column into an indexed column.
  ft_string_indexer(input_col = "sex", output_col = "sex_indexed") %>%
  ft_string_indexer(input_col = "drinks", output_col = "drinks_indexed") %>%
  ft_string_indexer(input_col = "drugs", output_col = "drugs_indexed") %>%
  # Stage 4: one-hot encode the three indexed columns.
  # NOTE(review): presumably each encoded column expands to several vector
  # slots, which would explain getting more importances than input columns.
  ft_one_hot_encoder(
    input_cols = c("sex_indexed", "drinks_indexed", "drugs_indexed"),
    output_cols = c("sex_encoded", "drinks_encoded", "drugs_encoded")
  ) %>%
  # Stage 5: combine the numeric and encoded columns into a single vector.
  ft_vector_assembler(
    input_cols = c(
      "age",
      "sex_encoded",
      "drinks_encoded",
      "drugs_encoded",
      "essay_length"
    ),
    output_col = "features"
  ) %>%
  # Stage 6: scale the assembled vector (centering enabled via with_mean).
  ft_standard_scaler(
    input_col = "features",
    output_col = "features_scaled",
    with_mean = TRUE
  ) %>%
  # Stage 7: random forest on the scaled features, predicting `not_working`.
  ml_random_forest_classifier(
    features_col = "features_scaled",
    label_col = "not_working"
  )
Hyper Parameter Tuning
# ------ Hyper Param Tuning ---------
# Candidate hyper-parameter values for the random forest stage of the pipeline.
grid <- list(
  random_forest = list(
    num_trees = c(5, 10),
    max_depth = c(10, 20)
  )
)

# 5-fold cross-validator wrapping the whole pipeline; candidates are scored
# with a binary classification evaluator on the `not_working` label.
cv <- ml_cross_validator(
  sc,
  estimator = fea_pipeline,
  estimator_param_maps = grid,
  evaluator = ml_binary_classification_evaluator(
    sc,
    label_col = "not_working"
  ),
  num_folds = 5
)

# Run the cross-validation on the training table.
cv_model <- ml_fit(x = cv, dataset = train_tbl)
Print the metrics
# Show the validation metrics recorded during cross-validation.
cv_model %>% ml_validation_metrics()

# Keep the best pipeline model selected by the cross-validator.
fitted <- cv_model$best_model

# ------ Variable Importance ------
# Stage 7 is the random forest classifier, the last of the seven stages
# defined in fea_pipeline; print its feature importances.
fitted %>%
  ml_stage(7) %>%
  ml_tree_feature_importance()
The variable importance output gives me this:
0.673134090 0.023902744 0.021771300 0.015035223 0.012361712 0.016907567 0.011370478 0.007484832 0.014057235 0.013598873 0.012238969 0.178136976
After one-hot encoding the categorical columns, I clearly have more importance values (12) than input columns (5):
# Stage 5 of the pipeline is the vector assembler; list its input columns.
ml_stage(fea_pipeline, 5)[["param_map"]][["input_cols"]]
since the vector assembler only takes these columns as input:
[1] "age" "sex_encoded" "drinks_encoded" "drugs_encoded" "essay_length"
Script to run first in order to reproduce this (it must be run before the Feature Transformation step above):
##Data Download
# download.file(
# "https://github.com/r-spark/okcupid/raw/master/profiles.csv.zip",
# "okcupid.zip")
#
# unzip("okcupid.zip", exdir = "data")
# unlink("okcupid.zip")
#load library
library(sparklyr)
library(ggplot2)
library(dbplot)
library(dplyr)
library(tidyr)
# --------- processing of data ---------
# Open a local Spark connection; `sc` is used by every Spark call below.
sc <- spark_connect(master = "local")

# Read the profiles CSV (multi-line fields enabled, `"` as escape character,
# memory = FALSE) and clean up a few columns.
okc <- spark_read_csv(
  sc,
  "data/profiles.csv",
  escape = "\"",
  memory = FALSE,
  options = list(multiline = TRUE)
) %>%
  mutate(
    height = as.numeric(height),
    # An income of "-1" is recorded as NA instead of a number.
    income = ifelse(income == "-1", NA, as.numeric(income))
  ) %>%
  # Replace NA in the categorical columns with an explicit "missing" value.
  mutate(
    sex = ifelse(is.na(sex), "missing", sex),
    drinks = ifelse(is.na(drinks), "missing", drinks),
    drugs = ifelse(is.na(drugs), "missing", drugs),
    job = ifelse(is.na(job), "missing", job)
  )
# Label column: 1 when the job is student, unemployed or retired, 0 otherwise.
okc <- mutate(
  okc,
  not_working = ifelse(job %in% c("student", "unemployed", "retired"), 1, 0)
)
# Ethnicity values to turn into individual 0/1 indicator columns.
ethnicities <- c(
  "asian", "middle eastern", "black", "native american", "indian",
  "pacific islander", "hispanic / latin", "white", "other"
)

# One unevaluated `ifelse(like(ethnicity, <value>), 1, 0)` expression per
# ethnicity, named "ethnicity_<value>" with whitespace and "/" removed.
ethnicity_vars <- purrr::set_names(
  purrr::map(
    ethnicities,
    function(pattern) expr(ifelse(like(ethnicity, !!pattern), 1, 0))
  ),
  paste0("ethnicity_", gsub("\\s|/", "", ethnicities))
)

# Splice the expressions into mutate() so each becomes a new column.
okc <- okc %>%
  mutate(!!!ethnicity_vars)
# essay_length: character length of essay0..essay9 pasted together.
okc <- mutate(
  okc,
  essay_length = char_length(paste(!!!syms(paste0("essay", 0:9))))
)

# Keep only the label, the model inputs and the essay columns.
# NOTE(review): essay0 is not selected here although it contributes to
# essay_length - confirm this is intended.
okc <- select(
  okc,
  not_working, age, sex, drinks, drugs, essay1:essay9, essay_length
)
# --------- pipeline ---------
# Split the data 70/30 into train and test sets with a fixed seed.
partition <- sdf_random_split(okc, train = 0.7, test = 0.3, seed = 1234)

# References to the two resulting tables.
train_tbl <- partition[["train"]]
test_tbl <- partition[["test"]]