0

How do I chart or extract feature importance after a feature transformation such as the one below? Or should I have avoided one-hot encoding? Because of the transformation (one-hot encoding), I ended up with more variable importance values than there are input columns.

Thanks

Feature Transformation

# Feature pipeline: string-index and one-hot encode the categorical columns,
# assemble them with age and essay_length into a single "features" vector,
# scale that vector, and end with a random forest classifier that predicts
# "not_working" from the scaled vector.
fea_pipeline <- local({
  categorical <- c("sex", "drinks", "drugs")
  indexed <- paste0(categorical, "_indexed")
  encoded <- paste0(categorical, "_encoded")

  # Append one string indexer per categorical column, in order.
  stages <- ml_pipeline(sc)
  for (i in seq_along(categorical)) {
    stages <- ft_string_indexer(
      stages,
      input_col = categorical[[i]],
      output_col = indexed[[i]]
    )
  }

  stages <- ft_one_hot_encoder(
    stages,
    input_cols = indexed,
    output_cols = encoded
  )

  stages <- ft_vector_assembler(
    stages,
    input_cols = c("age", encoded, "essay_length"),
    output_col = "features"
  )

  stages <- ft_standard_scaler(
    stages,
    input_col = "features",
    output_col = "features_scaled",
    with_mean = TRUE
  )

  # The value of the last expression is what gets bound to fea_pipeline.
  ml_random_forest_classifier(
    stages,
    features_col = "features_scaled",
    label_col = "not_working"
  )
})

Hyper Parameter Tuning

# ------ Hyper Param Tuning ---------
# Candidate num_trees and max_depth values, keyed by the "random_forest"
# stage; handed to ml_cross_validator() as estimator_param_maps.
grid <- list(random_forest = list(num_trees = c(5, 10), max_depth = c(10, 20)))

# Cross-validator over the whole feature pipeline: 5 folds, tuned over `grid`,
# scored by a binary classification evaluator on the "not_working" label.
cv <- sc %>%
  ml_cross_validator(
    estimator = fea_pipeline,
    estimator_param_maps = grid,
    evaluator = ml_binary_classification_evaluator(
      sc,
      label_col = "not_working"
    ),
    num_folds = 5
  )

# Fit the cross-validator on the training split.
cv_model <- cv %>% ml_fit(train_tbl)

Print the metrics

# Show the validation metrics of the fitted cross-validator.
cv_model %>% ml_validation_metrics()

# Keep the best model selected by the cross-validator.
fitted <- cv_model[["best_model"]]

# ------  Variable Importance ------
# Stage 7 is the last stage of fea_pipeline, the random forest classifier.
# NOTE(review): the importances come back unnamed, presumably one value per
# slot of the assembled feature vector (each one-hot encoded column expands to
# several slots) rather than one per input column -- confirm before charting.
fitted %>%
  ml_stage(7) %>%
  ml_tree_feature_importance()

The variable importance output gives me this:

0.673134090 0.023902744 0.021771300 0.015035223 0.012361712 0.016907567 0.011370478 0.007484832 0.014057235 0.013598873 0.012238969 0.178136976

I clearly have more importance values than columns after one-hot encoding the categorical columns:

# Input columns of stage 5 of fea_pipeline (the vector assembler).
ml_stage(fea_pipeline, 5)[["param_map"]][["input_cols"]]

as I have only these columns

[1] "age"            "sex_encoded"    "drinks_encoded" "drugs_encoded"  "essay_length" 

Scripts to run to reproduce (run these before the Feature Transformation step above)

##Data Download
# download.file(
#   "https://github.com/r-spark/okcupid/raw/master/profiles.csv.zip",
#   "okcupid.zip")
# 
# unzip("okcupid.zip", exdir = "data")
# unlink("okcupid.zip")

#load library
library(sparklyr)
library(ggplot2)
library(dbplot)
library(dplyr)
library(tidyr)

# --------- processing of data ---------
# Open a Spark connection with master = "local".
sc <- spark_connect(master = "local")

# Read data/profiles.csv through the Spark connection and clean a few columns.
# The file path is passed positionally as the second argument.
okc <- spark_read_csv(
  sc,
  "data/profiles.csv",
  escape = "\"",
  memory = FALSE,
  options = list(multiline = TRUE)
) %>%
  # Cast height to numeric; an income of "-1" becomes NA, anything else is
  # cast to numeric.
  mutate(height = as.numeric(height),
         income = ifelse(income == "-1", NA, as.numeric(income))) %>%
  # Replace missing sex, drinks, drugs and job values with the literal
  # string "missing".
  mutate(sex = ifelse(is.na(sex), "missing", sex)) %>%
  mutate(drinks = ifelse(is.na(drinks), "missing", drinks)) %>%
  mutate(drugs = ifelse(is.na(drugs), "missing", drugs)) %>%
  mutate(job = ifelse(is.na(job), "missing", job))

# Label column: 1 when job is "student", "unemployed" or "retired", else 0.
okc <- mutate(
  okc,
  not_working = ifelse(job %in% c("student", "unemployed", "retired"), 1, 0)
)


# Ethnicity values for which a 0/1 indicator column is created.
ethnicities <- c("asian", "middle eastern", "black", "native american", "indian", 
                 "pacific islander", "hispanic / latin", "white", "other")
# One unevaluated ifelse(like(ethnicity, <value>), 1, 0) expression per value,
# named "ethnicity_<value>" with whitespace and "/" removed from the value.
# NOTE(review): the pattern given to like() carries no wildcards, so this
# looks like a match on the whole ethnicity string -- confirm that is intended.
ethnicity_vars <- ethnicities %>% 
  purrr::map(~ expr(ifelse(like(ethnicity, !!.x), 1, 0))) %>%
  purrr::set_names(paste0("ethnicity_", gsub("\\s|/", "", ethnicities)))

# Splice the named expressions into mutate() so each becomes a new column.
# NOTE(review): the select() applied to okc later in this script does not
# list these ethnicity_* columns -- confirm whether they are still needed.
okc <- mutate(okc, !!!ethnicity_vars)

# essay_length: character length of the columns essay0..essay9 pasted together.
okc <- mutate(
  okc,
  essay_length = char_length(paste(!!!syms(sprintf("essay%d", 0:9))))
)

# Keep only the label, the predictors and the essay columns.
# NOTE(review): essay0 contributes to essay_length but is not selected here
# (the range starts at essay1) -- confirm that is intentional.
okc <- select(
  okc,
  not_working, age, sex, drinks, drugs, essay1:essay9, essay_length
)

# --------- pipeline--------- 

# Random 70/30 train/test split of okc with a fixed seed.
partition <- sdf_random_split(okc, train = 0.7, test = 0.3, seed = 1234)

# Table references for the two splits.
train_tbl <- partition[["train"]]
test_tbl <- partition[["test"]]
Choc_waffles
  • 518
  • 1
  • 4
  • 15

0 Answers0