0

I am currently trying to calibrate LightGBM probabilities using a custom cross-entropy score and loss function for a binary classification problem. The custom cross-entropy leads to an incompatibility with CalibratedClassifierCV, where I get the following error:

calibrated_model.fit(X, y): too many indices for an array: the array is 1-dimensional, but 2 were indexed.

I made my own calibration function following this tutorial, but I ended up with a function that returns calibrated probabilities rather than a "full model" that is compatible with mlflow. I therefore wonder how I could keep my script in sklearn and retain something similar to the script below.

Thank you

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split  
from sklearn.datasets import load_breast_cancer
from sklearn.calibration import CalibratedClassifierCV

#  Cross-entropy score and loss function from : https://pub.towardsai.net/outline-a-smaller-class-with-the-custom-loss-function-94ff00359698
def first_grad_logreg_beta(predt, y_true, beta=4):
    """Gradient (first derivative) of the beta-weighted logloss w.r.t. predt."""
    # Expanded derivative of -(y*log(p) + beta*(1-y)*log(1-p)) in p,
    # with the class-dependent weight factored out for readability.
    weight = beta - beta * y_true + y_true
    return weight * predt - y_true

def second_grad_logreg_beta(predt, y_true, beta=4):
    """Hessian (second derivative) of the beta-weighted logloss w.r.t. predt."""
    p = np.asarray(predt)
    # Same class-dependent weight as the gradient; p*(1-p) is the usual
    # logistic curvature term.
    weight = y_true + beta - beta * y_true
    return weight * p * (1 - p)

def logregobj_beta(predt, y_true):
    """Custom LightGBM objective: beta-weighted logloss.

    Returns the (grad, hess) pair LightGBM expects from a custom
    objective. The beta weight (fixed at 4) penalizes false positives
    more heavily than false negatives.

    NOTE(review): LightGBM's sklearn API documents the custom-objective
    signature as func(y_true, y_pred); verify the argument order here
    matches what LGBMClassifier actually passes — TODO confirm.
    """
    beta = 4
    predt = np.array(predt)
    y_true = np.array(y_true)

    grad = first_grad_logreg_beta(predt, y_true, beta=beta)
    hess = second_grad_logreg_beta(predt, y_true, beta=beta)
    return grad, hess

def logreg_err_beta(predt, y_true, beta=4):
    """Evaluation metric matching logregobj_beta: mean beta-weighted logloss.

    Computes -(y*log(p) + beta*(1-y)*log(1-p)) averaged over the samples,
    so false positives are weighted by beta, consistent with the custom
    objective's gradient/hessian.
    """
    predt = np.array(predt)
    y_true = np.array(y_true)
    # Clip to keep log() finite at p == 0 or p == 1 (10e-7 == 1e-6).
    predt = np.clip(predt, 10e-7, 1 - 10e-7)
    loss_fn = y_true * np.log(predt)
    loss_fp = (1.0 - y_true) * np.log(1.0 - predt)
    # BUG FIX: the original divided by len(y) — a module-level global —
    # instead of the y_true argument, making the function wrong (or a
    # NameError) outside the original script context.
    return np.sum(-(loss_fn + beta * loss_fp)) / len(y_true)

# Analysis---------------------------------------------------
# Fit an LGBMClassifier with the custom beta-weighted objective, then try to
# wrap it in CalibratedClassifierCV (sigmoid/Platt scaling on the prefit model).
X, y = load_breast_cancer(return_X_y=True)
clf = lgb.LGBMClassifier(objective = logregobj_beta)
clf.fit(X, y, eval_metric = logreg_err_beta)
calibrated_model = CalibratedClassifierCV(clf, method="sigmoid", cv = "prefit")
# NOTE(review): this is where the reported IndexError occurs. With a custom
# objective, LightGBM presumably returns raw 1-D scores from predict_proba
# (no sigmoid applied), while CalibratedClassifierCV expects an
# (n_samples, 2) probability array — TODO confirm against LightGBM docs.
calibrated_model.fit(X, y) # Issue !!!!
# Keep only the positive-class column of the calibrated probabilities.
y_pred_calibrated = calibrated_model.predict_proba(X)[:,1]
QHarr
  • 83,427
  • 12
  • 54
  • 101
Sosa
  • 141
  • 9

0 Answers0