1

So far I've been using AUC as my metric for binary classification. I've decided to give MAP-K a try.

I use lightGBM for my binary classification, I do data splitting, feature selection, hyperparameter tunning and finally return the best model with the best AUC. The first thing I do is train a simple xgboost model, then I use it for the feature selection and I proceed to use lightGBM with the filtered data. This worked perfectly with the auc metric. After switching to MAP, my code looks like this (starting with the helper functions):

calculate the Mean Average Precision
MAPK = function(pred, true, k) {
  pred = list(pred)
  true = list(true)
  purrr::map2(true, pred, function(a, p) {
    p = p[order(p, decreasing = TRUE)]
    sum(p[1:min(length(p), k)] %in% a) / min(length(p), k)
  }) %>% unlist() %>% mean(na.rm = TRUE)
}
Cross validation
mapk_score_lightgbm = function(i, train_x, train_y, k, folds, tune_grid, group_info_train, group_info_test) {
  mapk_cv = rep(0, k)
  for (j in 1:k) {
    fold_idx = folds[[j]]
    
    dtrain <- lgb.Dataset(data = train_x[-fold_idx,], label = train_y[-fold_idx])        
    dtest <- lgb.Dataset(data = train_x[fold_idx,], label = train_y[fold_idx])
    
    best_model = lgb.train(
      data = dtrain,
      params = list(
        objective = "binary",
        metric = "map_at", 
        learning_rate = tune_grid$learning_rate[i],
        num_leaves = tune_grid$num_leaves[i],
        max_depth = tune_grid$max_depth[i],
        min_data_in_leaf = tune_grid$min_data_in_leaf[i],
        feature_fraction = tune_grid$feature_fraction[i],
        bagging_fraction = tune_grid$bagging_fraction[i],
        bagging_freq = tune_grid$bagging_freq[i],
        lambda_l1 = tune_grid$lambda_l1[i],
        lambda_l2 = tune_grid$lambda_l2[i],
        min_split_gain = tune_grid$min_split_gain[i],
        max_bin = tune_grid$max_bin[i],
        min_sum_hessian_in_leaf = tune_grid$min_sum_hessian_in_leaf[i],
        extra_trees = tune_grid$extra_trees[i],
        path_smooth = tune_grid$path_smooth[i],
        boost_from_average = tune_grid$boost_from_average[i],
        num_threads = 3
      ),
      valids = list(val = dtest),
      nrounds = tune_grid$nrounds[i],
      early_stopping_rounds = tune_grid$early_stopping_round[i],
      verbose = -1
    )
    
    test_pred <- predict(best_model, train_x[fold_idx,])
    mapk_cv[j] <- MAPK(test_pred, train_y[fold_idx], k)
  }
  mean(mapk_cv)
}
Choose the best hyperparameters
best_model_lightgbm = function(dtrain,dtest,best_params) {
  best_model <- lgb.train(
    data = dtrain,
    params = list(
      objective = "binary",
      metric = "map_at", 
      learning_rate = best_params$learning_rate,
      num_leaves = best_params$num_leaves,
      max_depth = best_params$max_depth,
      min_data_in_leaf = best_params$min_data_in_leaf,
      feature_fraction = best_params$feature_fraction,
      bagging_fraction = best_params$bagging_fraction,
      bagging_freq = best_params$bagging_freq,
      lambda_l1 = best_params$lambda_l1,
      lambda_l2 = best_params$lambda_l2,
      min_split_gain = best_params$min_split_gain,
      max_bin = best_params$max_bin,
      min_sum_hessian_in_leaf = best_params$min_sum_hessian_in_leaf,
      extra_trees = best_params$extra_trees,
      path_smooth = best_params$path_smooth,
      boost_from_average = best_params$boost_from_average,
      num_threads = 3
    ),
    valids = list(val = dtest),
    nrounds = best_params$nrounds,
    early_stopping_rounds = best_params$early_stopping_round,
    verbose = -1
  )
  return(best_model)
}
And finally the function that uses all of the above:
mapkTrainModel = function(data = NULL,
                          trainIndex = NULL,
                          k.top = 25,
                          iteration_num = NULL,
                          case = c(),
                          using.Gene = FALSE,
                          algo = c()
) {
  
  train_set <- data[trainIndex, ]
  test_set <- data[-trainIndex, ]
    
  if (case != 'Response'){
    
    train_set = train_set[!is.na(train_set$outcome),]
    test_set = test_set[!is.na(test_set$outcome),]
    
  }
  
  # Create the train and test sets
  
  outcome_idx = grep("outcome", colnames(train_set))
  train_x = data.matrix(train_set[, -outcome_idx])
  train_y = train_set[, outcome_idx]
  test_x = data.matrix(test_set[, -outcome_idx])
  test_y = test_set[, outcome_idx]
  
  # Run the initial xgb model
  
  initial_model = xgboost(verbose=0,
                          data = data.matrix(train_x), label = train_y,
                          nrounds = 500, objective = "binary:logistic",
                          eval_metric = "auc",
                          eta = 0.001, max_depth = 6,
                          gamma = 3, colsample_bytree = 0.5,
                          min_child_weight = 5, subsample = 0.6,
                          lambda = 3, alpha = 3)
  
  
  # Make the SHAP function that will perform feature selection
  
  if (using.Gene == FALSE){
    
    if(iteration_num == 2 | iteration_num == 4){
      results = compute_and_filter_features(k_top = k.top, model = initial_model, train.x = train_x, test.x = test_x)
    } else {
      
      if(algo %in% c('combo','ciber')){
        k_top = ifelse(algo == 'combo', 25, 20)
        results = compute_and_filter_features(k_top, model = initial_model, train.x = train_x, test.x = test_x)
      } else if ( algo == 'TIDE') {
        
        k_top = 15 
        results = compute_and_filter_features(k_top, filter_features = TRUE, model = initial_model, train.x = train_x, test.x = test_x)
      } 
    }
    train_x_filtered = results[[1]]
    test_x_filtered = results[[2]]
    top_features = results[[3]]
    feature_importance = results[[4]]
    
  } else {
    feature_importance = NULL
    top_features = NULL
  }
    
  k = 10
  folds = createFolds(train_y, k = k, list = TRUE, returnTrain = FALSE)
  
  num_iter = 650
  hyperparameters_list = list(
    nrounds = c(50, 100, 200, 500, 1000, 2000, 4000, 7000, 10000, 15000, 20000),
    learning_rate = c(0.005, 0.001, 0.01, 0.1),
    num_leaves = c(3, 4, 5, 10, 15, 31 ,40, 63, 80, 120, 200),
    max_depth = c(3, 5, 7, 9 ,12, 20, 30, 42, 50, 65),
    min_data_in_leaf = c(3, 7, 10, 15, 30, 40, 50, 60),
    feature_fraction = c(0.1, 0.3, 0.6, 0.9),
    bagging_fraction = c(0.1, 0.3, 0.6, 0.9),
    bagging_freq = c(0,2,3,5,6,8,10),
    lambda_l1 = seq(0,30, 0.5),
    lambda_l2 = seq(0,90),
    min_split_gain = c(0.5,1,1.5,2,3,5,10,20,40),
    max_bin = c(50, 100 ,200, 255, 300, 400, 700, 900),
    min_sum_hessian_in_leaf = c(0.1, 0.3, 1, 3, 5, 10, 20, 30, 50),
    # boosting_type = c('gbdt', 'dart', 'goss', 'rf'),
    extra_trees = c(TRUE, FALSE),
    path_smooth = c(0, 0.5, 1, 2, 5, 10, 15, 25, 40, 55, 72),
    boost_from_average = c(TRUE, FALSE),
    early_stopping_round = c(10, 20, 30, 50, 80, 100)
  )
  
  tune_grid = data.frame(matrix(ncol = length(hyperparameters_list), nrow = num_iter))
  names(tune_grid) = names(hyperparameters_list)
  
  for (name in names(hyperparameters_list)) {
    tune_grid[[name]] = sample(hyperparameters_list[[name]], num_iter, replace = TRUE)
  }
  
  mapk_scores = pblapply(1:nrow(tune_grid), function(i) {
    score = mapk_score_lightgbm(i, train_x_filtered, train_y, k, folds, tune_grid, )
    score
  })
  
  best_params = tune_grid[which.max(unlist(mapk_scores)), ]
  print(best_params)
  
  dtrain = lgb.Dataset(data = train_x_filtered, label = train_y)      
  dtest = lgb.Dataset(data = test_x_filtered, label = test_y)
  
  best_model = best_model_lightgbm(dtrain,dtest,best_params)
  
  train_pred = predict(best_model, train_x_filtered)
  train_mapk = MAPK(train_pred, train_y, k)
  cat("Train MAPK:", train_mapk, "\n")
  
  test_pred = predict(best_model, test_x_filtered)
  test_mapk = MAPK(test_pred, test_y, k)
  cat("Test MAPK:", test_mapk, "\n")
   
  df = data.frame(row.names = rownames(test_x_filtered), pred = test_pred)
  return(list(df = df, best_params = best_params, top_features = top_features, feature_importance = feature_importance, test_x_filtered = test_x_filtered, test_mapk = test_mapk))
  
}

When I run this code, I get this error:

 [LightGBM] [Fatal] For MAP metric, there should be query information
Error in try({ : For MAP metric, there should be query information

Error in initialize(...) : lgb.Booster: cannot create Booster handle

EDIT : dput() of my data

structure(list(Adipocytes = c(0, 2.13946407916669e-21, 4.66482206831665e-21, 
0.000165259830460628, 0, 0.0175390316972343, 0.0438401692002902, 
0, 0.0406136432387571, 0), B_cells = c(0.150541533832486, 0.0797946569490404, 
0.021975112426587, 0.05624939622388, 0.129147219257496, 0.0650987336164303, 
0.0164335323879098, 0.0719734321693647, 0.0652893793757585, 0.0417003944430605
), A-Cells = c(0.0267267015164665, 0.071084466432001, 0.0101699388615067, 
0.15862462116585, 0.0715452135090734, 0.0214644511613815, 0.0326187125833191, 
0.0935843293555624, 0.275708014267322, 0.0399108474619834), memory_T_cells = c(0.161566915845489, 
0.114361971390893, 0.0592031405781612, 0.0459648417098176, 0.112233894131765, 
0.046765393432833, 0.0492390514964867, 0.0609876023593976, 0.0517972815208034, 
0.0342755193358782), CD4_naive_T_cells = c(0.0159712488876503, 
0.0328286995571421, 0, 0.00320253390101105, 0.0210124483786409, 
0.0478339500967737, 0.0425237951739848, 0.0105888253209003, 0, 
0), CD4_cells = c(0.0321420494606718, 0.0318134117965785, 
0.00945895040963033, 2.81537835197919e-19, 2.09076052207928e-19, 
0.000293868301791991, 0, 7.59058001272299e-20, 0, 0.000524138684625728
), CD4_Tcm = c(0, 0, 5.93220475441704e-18, 0.00187255066789526, 
0, 0, 0, 5.91601625741038e-20, 0, 0), CD4_Tem = c(0.0578107696879481, 
0.0667327726449478, 0, 0.0113771146746561, 0.116559347300661, 
0.00975426535061811, 0.00486996162848482, 0, 0, 2.13665165537782e-18
), naive_T_cells = c(0.0842015800486945, 0.0764202256157497, 
0.093942286953163, 0.119105793390861, 0.103202618264043, 0.0880813895955731, 
0.0940516211190602, 0.0834476895525013, 0.0790839264137833, 0.0911703797202989
), CD8_T_cells = c(0.103827495629516, 0.0394786253454611, 0.00165261519947071, 
0.0410673218092668, 0.128654852317512, 0.0685858621474361, 0.0210491379453058, 
0.00651140550098654, 0.0812105991379202, 0.00121516269756453)), row.names = c("Nivolumab_2017_p001_ar_8813", 
"Nivolumab_2017_p002_ar_8815", "NIVO_2017_p003_ar_8878", 
"Nivolumab_2017_p004_ar_8880", "NIVO_2017_p005_ar_8883", 
"Nivolumab_2017_p008_ar_8914", "NIVO_2017_p009_ar_8885", 
"Nivolumab_2017_p010_ar_8887", "NIVO_2017_p011_ar_8916", 
"Nivolumab_2017_p017_ar_8890"), class = "data.frame")

And the target label for those samples: c(1 ,0 ,0 ,0 ,1 ,1 ,1 ,0 ,1 ,1)

If you guys need more information I'll provide it. Thanks!

Programming Noob
  • 1,232
  • 3
  • 14
  • Did you read [that]([https://stackoverflow.com/questions/53104070/what-is-lightgbms-query-information-concept-for-map-metric) – MrSmithGoesToWashington Jul 26 '23 at 12:35
  • I did but I did not understand it entirely. Basically I can't just group the data according to the label of each sample, right? For example here I'm predicting `response` vs `noresponse`, grouping the data according to this is wrong, I should group the samples according to something else, like cancer type or type of treatment or something like that, correct? – Programming Noob Jul 26 '23 at 12:42
  • also, in the data itself, I should group the samples together ? for example in the first 50 rows there should be the first cancer type, the next 100 rows would be cancer type 2, and so on ... am I right ? @MrSmithGoesToWashington – Programming Noob Jul 26 '23 at 12:43
  • 2
    Please provide a [mre] (emphasis on *minimal*). There is no indication that parts like cross-validation and SHAP feature selection are really necessary to demonstrate and reproduce the error. – desertnaut Jul 26 '23 at 12:47
  • @desertnaut I apologize for providing unnecessary stuff. I removed the function that performs feature selection since the error is not there indeed, I just thought you might want to try and run the function. Anyway, I added 10 samples from the data with some features, and also with the labels of those samples. The problem is that the metric is asking me to group or order the samples in someway, I just don't understand how. – Programming Noob Jul 26 '23 at 15:35

1 Answers1

0

It seems that map is for classes ... or groups ... or sets ... because the mean is a mean over classes ... so I am not sure that it is suitable for binary classification. Anyway, if I understand it well (but I probably don't), you can use it in the following way: according to the explanatory variables you can group your data ... for each group the model should predict the same result ... but the truth is that the result is not the same for each observation of a group ... so you can compute the precision on that group ... and so on for each group ... and then a mean of all the precisions ... So grouping by explanatory variables ... but if you have continuous variables, or discrete variables with too many classes, you need to define intervals or classes for those variables in order to group your observations.