I have artificially increased the imbalance ratio to show the impact of different popular scoring metrics on classification performance. I have also artificially added some missing values to check that my pipeline is working properly. However, I see pretty low values for the Matthews correlation coefficient and G-mean, whereas ROC-AUC, average precision, and weighted F1 are pretty high. All of these measures are commonly recommended for evaluating imbalanced classification problems, yet ROC-AUC, average precision, and weighted F1 seem to fail to reflect the class imbalance here. I am curious about possible explanations for this, and I am not sure which metric I should report. I am mostly interested in the positive (minority) cases. As a sanity check, I also ran a trivial majority-class baseline through the same pipeline (sketched after my results below).
from sklearn.metrics import make_scorer
from sklearn.datasets import load_breast_cancer
from imblearn.metrics import geometric_mean_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
X, y = load_breast_cancer(return_X_y=True)
EXT_IMB_RATE = 0.025
#RANDOMLY UNDERSAMPLE DATA SET TO MAKE IT HIGHLY IMBALANCED
minIdx = np.where(y==0)[0]
majIdx = np.where(y==1)[0]
# sample without replacement so no minority row is duplicated
ssMinIdx = np.random.choice(minIdx, int(np.round(len(y) * EXT_IMB_RATE)), replace=False)
# print(len(ssMinIdx))
y_ExtImb = np.append(y[majIdx], y[ssMinIdx])
# print(len(yExtImb))
X_ExtImb = np.concatenate((X[majIdx], X[ssMinIdx]),axis=0)
print(np.bincount(y_ExtImb))
rng = np.random.RandomState(42)
def add_missing_values(X_full, y_full):
    n_samples, n_features = X_full.shape
    # Add one missing value to 25% of the samples, each in a randomly chosen feature
    missing_rate = 0.25
    n_missing_samples = int(n_samples * missing_rate)
    missing_samples = np.zeros(n_samples, dtype=bool)
    missing_samples[:n_missing_samples] = True
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)
    X_missing = X_full.copy()
    X_missing[missing_samples, missing_features] = np.nan
    y_missing = y_full.copy()
    return X_missing, y_missing
X_miss, y_miss = add_missing_values(X_ExtImb, y_ExtImb)
print(np.count_nonzero(np.isnan(X_miss)))  # number of injected NaNs
LR_pipe = Pipeline([("impute", SimpleImputer(strategy='constant',fill_value= 0)),("scale", MaxAbsScaler()),("SVC", SVC())])
gmean = make_scorer(geometric_mean_score, greater_is_better=True)
MCC = make_scorer(matthews_corrcoef, greater_is_better=True)
scores = cross_validate(
    SVC_pipe, X_miss, y_miss, cv=5,
    scoring={'G-mean': gmean, 'F1': 'f1_weighted', 'MCC': MCC,
             'AUC': 'roc_auc', 'Avg_Precision': 'average_precision'},
)
print(sorted(scores.keys()))  # inspect which score keys cross_validate returned
SVC_Gmean = scores['test_G-mean'].mean()
SVC_MCC = scores['test_MCC'].mean()
SVC_AUC = scores['test_AUC'].mean()
SVC_precision = scores['test_Avg_Precision'].mean()
SVC_F1 = scores['test_F1'].mean()
print("MCC: %f" % (SVC_MCC))
print("G-mean: %f" % (SVC_Gmean))
print("F1 score: %f" % (SVC_F1 ))
print("AUC: %f" % (SVC_AUC))
print("Average Precision: %f" % (SVC_precision))
My results:
MCC: 0.552093
G-mean: 0.557539
F1 score: 0.972603
AUC: 0.985915
Average Precision: 0.999365
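For reference, here is the baseline check mentioned at the top: the same imputation/scaling steps and the same scorers, but with a trivial most-frequent-class classifier in place of the SVC. This is only a comparison sketch; DummyClassifier is scikit-learn's built-in baseline, dummy_pipe / dummy_scores are just names I picked, and everything else reuses the objects defined above.

from sklearn.dummy import DummyClassifier

# Same preprocessing as before, but the classifier always predicts the majority class
dummy_pipe = Pipeline([
    ("impute", SimpleImputer(strategy='constant', fill_value=0)),
    ("scale", MaxAbsScaler()),
    ("clf", DummyClassifier(strategy='most_frequent')),
])
dummy_scores = cross_validate(
    dummy_pipe, X_miss, y_miss, cv=5,
    scoring={'G-mean': gmean, 'F1': 'f1_weighted', 'MCC': MCC,
             'AUC': 'roc_auc', 'Avg_Precision': 'average_precision'},
)
for name in ['MCC', 'G-mean', 'F1', 'AUC', 'Avg_Precision']:
    # mean cross-validated test score per metric for the majority-class baseline
    print("Baseline %s: %f" % (name, dummy_scores['test_' + name].mean()))

My reasoning is that if some of these metrics also come out high for a classifier that never predicts the minority class, then those metrics are mostly reflecting the class ratio rather than performance on the minority class.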