So far I've been using AUC as my metric for binary classification, and I've decided to give MAP@K a try.
I use LightGBM for the binary classification: I split the data, do feature selection, tune the hyperparameters, and finally return the best model with the best AUC. The first thing I do is train a simple xgboost model, then use it for the feature selection, and proceed to fit LightGBM on the filtered data. This worked perfectly with the AUC
metric. After switching to MAP, my code looks like this (starting with the helper functions):
The packages I load for this: lightgbm, xgboost, caret, purrr, magrittr and pbapply.

library(lightgbm)
library(xgboost)
library(caret)     # createFolds
library(purrr)
library(magrittr)  # %>%
library(pbapply)   # pblapply

Calculate the Mean Average Precision

MAPK = function(pred, true, k) {
  purrr::map2(list(true), list(pred), function(a, p) {
    # rank the true labels by predicted score, highest score first
    a = a[order(p, decreasing = TRUE)]
    n = min(length(a), k)
    # precision at k over the top-ranked samples
    sum(a[1:n] == 1) / n
  }) %>% unlist() %>% mean(na.rm = TRUE)
}
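As a quick sanity check of the helper (toy scores and labels, not my real data), with k = 2 the two highest-scored samples determine the score:

# toy check: values are made up
MAPK(pred = c(0.9, 0.8, 0.1, 0.3), true = c(1, 1, 0, 0), k = 2)  # both of the top 2 are positives -> 1
MAPK(pred = c(0.9, 0.2, 0.7), true = c(0, 1, 1), k = 2)          # one of the top 2 is positive  -> 0.5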
Cross validation
mapk_score_lightgbm = function(i, train_x, train_y, k, folds, tune_grid) {
mapk_cv = rep(0, k)
for (j in 1:k) {
fold_idx = folds[[j]]
dtrain <- lgb.Dataset(data = train_x[-fold_idx,], label = train_y[-fold_idx])
dtest <- lgb.Dataset(data = train_x[fold_idx,], label = train_y[fold_idx])
best_model = lgb.train(
data = dtrain,
params = list(
objective = "binary",
metric = "map_at",
learning_rate = tune_grid$learning_rate[i],
num_leaves = tune_grid$num_leaves[i],
max_depth = tune_grid$max_depth[i],
min_data_in_leaf = tune_grid$min_data_in_leaf[i],
feature_fraction = tune_grid$feature_fraction[i],
bagging_fraction = tune_grid$bagging_fraction[i],
bagging_freq = tune_grid$bagging_freq[i],
lambda_l1 = tune_grid$lambda_l1[i],
lambda_l2 = tune_grid$lambda_l2[i],
min_split_gain = tune_grid$min_split_gain[i],
max_bin = tune_grid$max_bin[i],
min_sum_hessian_in_leaf = tune_grid$min_sum_hessian_in_leaf[i],
extra_trees = tune_grid$extra_trees[i],
path_smooth = tune_grid$path_smooth[i],
boost_from_average = tune_grid$boost_from_average[i],
num_threads = 3
),
valids = list(val = dtest),
nrounds = tune_grid$nrounds[i],
early_stopping_rounds = tune_grid$early_stopping_round[i],
verbose = -1
)
test_pred <- predict(best_model, train_x[fold_idx,])
mapk_cv[j] <- MAPK(test_pred, train_y[fold_idx], k)
}
mean(mapk_cv)
}
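A side note on the design, in case it matters: I think the same fold loop could also be driven by lgb.cv with the pre-built folds object, roughly like the sketch below. I assume it would run into the same metric question, so it's only here to show the shape of what I'm doing, not a fix.

# sketch only: same folds, but letting lightgbm iterate over them
dall = lgb.Dataset(data = train_x, label = train_y)
cv = lgb.cv(
  params = list(objective = "binary", metric = "map", num_threads = 3),
  data = dall,
  nrounds = 500,
  folds = folds,
  early_stopping_rounds = 30,
  verbose = -1
)
# per-iteration validation scores should end up in cv$record_evals, if I read the docs right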
Choose the best hyperparameters
best_model_lightgbm = function(dtrain, dtest, best_params) {
best_model <- lgb.train(
data = dtrain,
params = list(
objective = "binary",
metric = "map_at",
learning_rate = best_params$learning_rate,
num_leaves = best_params$num_leaves,
max_depth = best_params$max_depth,
min_data_in_leaf = best_params$min_data_in_leaf,
feature_fraction = best_params$feature_fraction,
bagging_fraction = best_params$bagging_fraction,
bagging_freq = best_params$bagging_freq,
lambda_l1 = best_params$lambda_l1,
lambda_l2 = best_params$lambda_l2,
min_split_gain = best_params$min_split_gain,
max_bin = best_params$max_bin,
min_sum_hessian_in_leaf = best_params$min_sum_hessian_in_leaf,
extra_trees = best_params$extra_trees,
path_smooth = best_params$path_smooth,
boost_from_average = best_params$boost_from_average,
num_threads = 3
),
valids = list(val = dtest),
nrounds = best_params$nrounds,
early_stopping_rounds = best_params$early_stopping_round,
verbose = -1
)
return(best_model)
}
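One thing I'm not sure about: I build the validation set with a plain lgb.Dataset, while the package examples seem to derive it from the training Dataset via lgb.Dataset.create.valid so both share the same feature binning. A sketch of that variant (assuming I'm reading the examples right), using the same objects as the final call further down, in case it's relevant to the error:

dtrain = lgb.Dataset(data = train_x_filtered, label = train_y)
# derive the validation Dataset from dtrain so both use the same binning
dtest = lgb.Dataset.create.valid(dtrain, data = test_x_filtered, label = test_y)
best_model = best_model_lightgbm(dtrain, dtest, best_params)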
And finally the function that uses all of the above:
mapkTrainModel = function(data = NULL,
trainIndex = NULL,
k.top = 25,
iteration_num = NULL,
case = c(),
using.Gene = FALSE,
algo = c()
) {
train_set <- data[trainIndex, ]
test_set <- data[-trainIndex, ]
if (case != 'Response'){
train_set = train_set[!is.na(train_set$outcome),]
test_set = test_set[!is.na(test_set$outcome),]
}
# Create the train and test sets
outcome_idx = grep("outcome", colnames(train_set))
train_x = data.matrix(train_set[, -outcome_idx])
train_y = train_set[, outcome_idx]
test_x = data.matrix(test_set[, -outcome_idx])
test_y = test_set[, outcome_idx]
# Run the initial xgb model
initial_model = xgboost(verbose=0,
data = data.matrix(train_x), label = train_y,
nrounds = 500, objective = "binary:logistic",
eval_metric = "auc",
eta = 0.001, max_depth = 6,
gamma = 3, colsample_bytree = 0.5,
min_child_weight = 5, subsample = 0.6,
lambda = 3, alpha = 3)
# SHAP-based feature selection (compute_and_filter_features is a helper defined elsewhere)
if (using.Gene == FALSE){
if(iteration_num == 2 | iteration_num == 4){
results = compute_and_filter_features(k_top = k.top, model = initial_model, train.x = train_x, test.x = test_x)
} else {
if(algo %in% c('combo','ciber')){
k_top = ifelse(algo == 'combo', 25, 20)
results = compute_and_filter_features(k_top, model = initial_model, train.x = train_x, test.x = test_x)
} else if ( algo == 'TIDE') {
k_top = 15
results = compute_and_filter_features(k_top, filter_features = TRUE, model = initial_model, train.x = train_x, test.x = test_x)
}
}
train_x_filtered = results[[1]]
test_x_filtered = results[[2]]
top_features = results[[3]]
feature_importance = results[[4]]
} else {
  # no SHAP filtering in this case; keep the full matrices so the later code still has the *_filtered objects
  train_x_filtered = train_x
  test_x_filtered = test_x
  feature_importance = NULL
  top_features = NULL
}
k = 10
folds = createFolds(train_y, k = k, list = TRUE, returnTrain = FALSE)
num_iter = 650
hyperparameters_list = list(
  nrounds = c(50, 100, 200, 500, 1000, 2000, 4000, 7000, 10000, 15000, 20000),
  learning_rate = c(0.005, 0.001, 0.01, 0.1),
  num_leaves = c(3, 4, 5, 10, 15, 31, 40, 63, 80, 120, 200),
  max_depth = c(3, 5, 7, 9, 12, 20, 30, 42, 50, 65),
  min_data_in_leaf = c(3, 7, 10, 15, 30, 40, 50, 60),
  feature_fraction = c(0.1, 0.3, 0.6, 0.9),
  bagging_fraction = c(0.1, 0.3, 0.6, 0.9),
  bagging_freq = c(0, 2, 3, 5, 6, 8, 10),
  lambda_l1 = seq(0, 30, 0.5),
  lambda_l2 = seq(0, 90),
  min_split_gain = c(0.5, 1, 1.5, 2, 3, 5, 10, 20, 40),
  max_bin = c(50, 100, 200, 255, 300, 400, 700, 900),
  min_sum_hessian_in_leaf = c(0.1, 0.3, 1, 3, 5, 10, 20, 30, 50),
  # boosting_type = c('gbdt', 'dart', 'goss', 'rf'),
  extra_trees = c(TRUE, FALSE),
  path_smooth = c(0, 0.5, 1, 2, 5, 10, 15, 25, 40, 55, 72),
  boost_from_average = c(TRUE, FALSE),
  early_stopping_round = c(10, 20, 30, 50, 80, 100)
)
tune_grid = data.frame(matrix(ncol = length(hyperparameters_list), nrow = num_iter))
names(tune_grid) = names(hyperparameters_list)
for (name in names(hyperparameters_list)) {
tune_grid[[name]] = sample(hyperparameters_list[[name]], num_iter, replace = TRUE)
}
mapk_scores = pblapply(1:nrow(tune_grid), function(i) {
  mapk_score_lightgbm(i, train_x_filtered, train_y, k, folds, tune_grid)
})
best_params = tune_grid[which.max(unlist(mapk_scores)), ]
print(best_params)
dtrain = lgb.Dataset(data = train_x_filtered, label = train_y)
dtest = lgb.Dataset(data = test_x_filtered, label = test_y)
best_model = best_model_lightgbm(dtrain, dtest, best_params)
train_pred = predict(best_model, train_x_filtered)
train_mapk = MAPK(train_pred, train_y, k)
cat("Train MAPK:", train_mapk, "\n")
test_pred = predict(best_model, test_x_filtered)
test_mapk = MAPK(test_pred, test_y, k)
cat("Test MAPK:", test_mapk, "\n")
df = data.frame(row.names = rownames(test_x_filtered), pred = test_pred)
return(list(df = df, best_params = best_params, top_features = top_features, feature_importance = feature_importance, test_x_filtered = test_x_filtered, test_mapk = test_mapk))
}
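For completeness, this is roughly how I call the whole thing; the data object and the exact argument values below are placeholders, not my real setup:

# hypothetical call: 'expr_data' is a placeholder data.frame with an 'outcome' column
set.seed(42)
trainIndex = createDataPartition(expr_data$outcome, p = 0.8, list = FALSE)
res = mapkTrainModel(data = expr_data,
                     trainIndex = trainIndex,
                     k.top = 25,
                     iteration_num = 2,
                     case = "Response",
                     using.Gene = FALSE,
                     algo = "combo")
res$test_mapk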
When I run this code, I get this error:
[LightGBM] [Fatal] For MAP metric, there should be query information
Error in try({ : For MAP metric, there should be query information
Error in initialize(...) : lgb.Booster: cannot create Booster handle
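From what I can tell, LightGBM's MAP is a ranking metric and expects the rows of the Dataset to be grouped into queries, something along the lines of the sketch below (the single group covering all rows is just my guess at how to satisfy the check). What I don't understand is what the query groups should be for a plain binary classification problem, or whether MAP@K even makes sense here without them:

# sketch: supplying query/group information when building the Dataset
# (group sizes must sum to the number of rows; one group spanning everything is just a guess)
dtrain = lgb.Dataset(data = train_x_filtered, label = train_y,
                     group = nrow(train_x_filtered))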
EDIT: dput() of my data:
structure(list(Adipocytes = c(0, 2.13946407916669e-21, 4.66482206831665e-21,
0.000165259830460628, 0, 0.0175390316972343, 0.0438401692002902,
0, 0.0406136432387571, 0), B_cells = c(0.150541533832486, 0.0797946569490404,
0.021975112426587, 0.05624939622388, 0.129147219257496, 0.0650987336164303,
0.0164335323879098, 0.0719734321693647, 0.0652893793757585, 0.0417003944430605
), `A-Cells` = c(0.0267267015164665, 0.071084466432001, 0.0101699388615067,
0.15862462116585, 0.0715452135090734, 0.0214644511613815, 0.0326187125833191,
0.0935843293555624, 0.275708014267322, 0.0399108474619834), memory_T_cells = c(0.161566915845489,
0.114361971390893, 0.0592031405781612, 0.0459648417098176, 0.112233894131765,
0.046765393432833, 0.0492390514964867, 0.0609876023593976, 0.0517972815208034,
0.0342755193358782), CD4_naive_T_cells = c(0.0159712488876503,
0.0328286995571421, 0, 0.00320253390101105, 0.0210124483786409,
0.0478339500967737, 0.0425237951739848, 0.0105888253209003, 0,
0), CD4_cells = c(0.0321420494606718, 0.0318134117965785,
0.00945895040963033, 2.81537835197919e-19, 2.09076052207928e-19,
0.000293868301791991, 0, 7.59058001272299e-20, 0, 0.000524138684625728
), CD4_Tcm = c(0, 0, 5.93220475441704e-18, 0.00187255066789526,
0, 0, 0, 5.91601625741038e-20, 0, 0), CD4_Tem = c(0.0578107696879481,
0.0667327726449478, 0, 0.0113771146746561, 0.116559347300661,
0.00975426535061811, 0.00486996162848482, 0, 0, 2.13665165537782e-18
), naive_T_cells = c(0.0842015800486945, 0.0764202256157497,
0.093942286953163, 0.119105793390861, 0.103202618264043, 0.0880813895955731,
0.0940516211190602, 0.0834476895525013, 0.0790839264137833, 0.0911703797202989
), CD8_T_cells = c(0.103827495629516, 0.0394786253454611, 0.00165261519947071,
0.0410673218092668, 0.128654852317512, 0.0685858621474361, 0.0210491379453058,
0.00651140550098654, 0.0812105991379202, 0.00121516269756453)), row.names = c("Nivolumab_2017_p001_ar_8813",
"Nivolumab_2017_p002_ar_8815", "NIVO_2017_p003_ar_8878",
"Nivolumab_2017_p004_ar_8880", "NIVO_2017_p005_ar_8883",
"Nivolumab_2017_p008_ar_8914", "NIVO_2017_p009_ar_8885",
"Nivolumab_2017_p010_ar_8887", "NIVO_2017_p011_ar_8916",
"Nivolumab_2017_p017_ar_8890"), class = "data.frame")
And the target labels for those samples: c(1, 0, 0, 0, 1, 1, 1, 0, 1, 1)
If you guys need more information I'll provide it. Thanks!