I am currently trying to perform LightGBM
probabilities calibration with a custom cross-entropy score and loss function for a binary classification problem. My issue is related to the custom cross-entropy that leads to incompatibility with CalibratedClassifierCV
where I got the following error:
calibrated_model.fit(X, y): too many indices for an array: the array is 1-dimensional, but 2 were indexed.
I made my own calibration function with this tutorial. But I end up with a function that returns calibrated probabilities and not a 'full model' that is compatible with mlflow
. I thus wonder how could I maintain my script in sklearn
and keep something similar to the script below.
Thank you
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.calibration import CalibratedClassifierCV
# Cross-entropy score and loss function from : https://pub.towardsai.net/outline-a-smaller-class-with-the-custom-loss-function-94ff00359698
def first_grad_logreg_beta(predt,y_true, beta = 4):
'''Compute the first derivative for custom logloss function'''
return predt*(beta-beta*y_true+y_true)-y_true
def second_grad_logreg_beta(predt,y_true, beta = 4):
'''Compute the second derivative for custom logloss function'''
predt = np.array(predt)
return (y_true+beta-beta*y_true)*predt*(1-predt)
def logregobj_beta(predt,y_true):
beta = 4
predt = np.array(predt)
y_true = np.array(y_true)
beta = np.array(beta)
'''Custom logloss function update'''
grad=first_grad_logreg_beta(predt,y_true, beta = beta)
hess=second_grad_logreg_beta(predt,y_true, beta = beta)
return grad,hess
def logreg_err_beta(predt,y_true, beta = 4):
predt = np.array(predt)
y_true = np.array(y_true)
'''Custom evaluation metric that should be in line with custom loss function'''
predt=np.clip(predt,10e-7,1-10e-7)
loss_fn=y_true*np.log(predt)
loss_fp=(1.0-y_true)*np.log(1.0-predt)
return np.sum(-(loss_fn+beta*loss_fp))/len(y)
# Analysis---------------------------------------------------
X, y = load_breast_cancer(return_X_y=True)
clf = lgb.LGBMClassifier(objective = logregobj_beta)
clf.fit(X, y, eval_metric = logreg_err_beta)
calibrated_model = CalibratedClassifierCV(clf, method="sigmoid", cv = "prefit")
calibrated_model.fit(X, y) # Issue !!!!
y_pred_calibrated = calibrated_model.predict_proba(X)[:,1]