problem trying to solve: compress training instances that share the same feature by replacing the label with the weighted mean of the labels and the weight with the sum of the weights, while keeping the weighted binary log loss unchanged (so the compressed data can be trained with the cross_entropy objective). Here is an example, followed by log_loss test cases showing that the binary log loss on the original data is equivalent to the weighted log loss on the compressed data; a small pandas sketch of the compression follows the example.
original_data:                            compressed_data:
feature, label, weight, prediction        feature, label, weight, prediction
x1, 1, 1, 0.8                             x1, 1/3, 3, 0.8
x1, 0, 2, 0.8                         -->
x2, 1, 2, 0.1                             x2, 2/3, 3, 0.1
x2, 0, 1, 0.1
x3, 1, 1, 0.9                             x3, 1, 1, 0.9
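the compression itself can be written as a pandas groupby (a minimal sketch for the toy table above, not the production pipeline; the column names are made up for illustration): group by feature, take the weight-weighted mean of the labels, and sum the weights.
import numpy as np
import pandas as pd
raw = pd.DataFrame({
    'feature': ['x1', 'x1', 'x2', 'x2', 'x3'],
    'label':   [1, 0, 1, 0, 1],
    'weight':  [1, 2, 2, 1, 1],
})
# weighted label mass per row, so the group mean is sum(label*weight)/sum(weight)
raw['label_x_weight'] = raw['label'] * raw['weight']
compressed = raw.groupby('feature', as_index=False).agg(
    label_x_weight=('label_x_weight', 'sum'),
    weight=('weight', 'sum'),
)
compressed['label'] = compressed['label_x_weight'] / compressed['weight']
print(compressed[['feature', 'label', 'weight']])
#   feature     label  weight
# 0      x1  0.333333       3
# 1      x2  0.666667       3
# 2      x3  1.000000       1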
issue: the binary objective is not always equivalent to the cross_entropy objective in lgbm. The change in model performance metrics (log loss, average precision, ROC AUC) is mild, but the differences in the actual predictions and in the prediction distribution are quite significant. Experiment 1 shows the two objectives are equivalent in the binary-label case, while Experiment 2 shows there are cases where the binary objective on the original data does not align with the cross_entropy objective on the compressed data (check out the examples for more details).
first, verify with numpy that the weighted binary log loss on the original data is the same as the weighted log loss on the compressed data
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import lightgbm
# use X of the breast cancer data as training features for both experiments 1 and 2
X, _ = load_breast_cancer(return_X_y=True)
def logloss(y_true, y_pred, weight):
    l = np.mean((-(y_true * np.log(y_pred)) - ((1 - y_true) * np.log(1 - y_pred))) * weight)
    # normalize by the total weight so the result is the weighted mean log loss
    l = l * y_true.shape[0] / weight.sum()
    return l
"""
feature, label, weight, prediction        feature, label, weight, prediction
x1, 1, 1/3, 0.7
x1, 1, 1/3, 0.7                       --> x1, 2/3, 1, 0.7
x1, 0, 1/3, 0.7
"""
l1 = logloss(np.array([1,1,0]), np.array([0.7,0.7,.7]), np.array([1/3,1/3,1/3]))
l2 = logloss(np.array([2/3]), np.array([0.7]), np.array([1]))
"""
feature, label, weight, prediction        feature, label, weight, prediction
x1, 1, 1, 0.8                             x1, 1/3, 3, 0.8
x1, 0, 2, 0.8                         -->
x2, 1, 2, 0.1                             x2, 2/3, 3, 0.1
x2, 0, 1, 0.1
x3, 1, 1, 0.9                             x3, 1, 1, 0.9
"""
l3 = logloss(np.array([1,0,1,0,1]),
             np.array([0.8,0.8,0.1,0.1,0.9]),
             np.array([1,2,2,1,1]))
l4 = logloss(np.array([1/3,2/3,1]), np.array([0.8,0.1,0.9]), np.array([3,3,1]))
np.testing.assert_almost_equal(l1, l2, decimal=4)
np.testing.assert_almost_equal(l3, l4, decimal=4)
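the assertions pass because, within a group that shares one feature and therefore one prediction p, sum_i w_i * (-y_i*log(p) - (1-y_i)*log(1-p)) = W * (-ybar*log(p) - (1-ybar)*log(1-p)), where W = sum_i w_i and ybar = sum_i w_i*y_i / W, which is exactly the compressed row. as an extra sanity check (a sketch, assuming sklearn.metrics.log_loss with sample_weight, which returns the weight-averaged per-instance loss), the custom logloss above agrees with sklearn on the uncompressed binary rows:
from sklearn.metrics import log_loss
sk_l1 = log_loss(np.array([1,1,0]), np.array([0.7,0.7,0.7]),
                 sample_weight=np.array([1/3,1/3,1/3]))
sk_l3 = log_loss(np.array([1,0,1,0,1]), np.array([0.8,0.8,0.1,0.1,0.9]),
                 sample_weight=np.array([1,2,2,1,1]))
np.testing.assert_almost_equal(l1, sk_l1, decimal=4)
np.testing.assert_almost_equal(l3, sk_l3, decimal=4)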
experiment 1 (the binary objective is equivalent to the cross_entropy objective in the binary-label case):
######## data for experiment 1
np.random.seed(42)
n = X.shape[0]
y_binary = np.random.randint(0,2,size=(n))
eps = 1e-2
y_float = np.random.uniform(eps,1-eps,size=(n))
lgbm_params = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 1,
    'importance_type': 'split',
    'learning_rate': 0.06472914709339864,
    'max_depth': 46,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'n_estimators': 20,
    'n_jobs': 1,
    'num_leaves': 178,
    'random_state': 1574094090,
    'reg_alpha': 0.4894283599023894,
    'reg_lambda': 0.09743058458885945,
    'silent': True,
    'subsample': 1,
    # 'subsample_for_bin': 200000,  # try larger values (10M+)
    # 'subsample_freq': 252,
    'min_data_in_bin': 1,
    'min_child_samples': 1,
}
X_train_array, X_test_array, y_train_binary, y_test_binary, y_train_float, y_test_float = \
    train_test_split(X, y_binary, y_float, test_size=0.3, random_state=1)
##### binary-label case in the sklearn API: the binary objective is equivalent to the cross_entropy objective
binary_model1 = LGBMClassifier(objective='binary')
binary_model1.set_params(**lgbm_params)
binary_model1.fit(
    X_train_array,
    y_train_binary,
    sample_weight=np.ones(X_train_array.shape[0])
)
binary_model2 = LGBMRegressor(objective='cross_entropy')
binary_model2.set_params(**lgbm_params)
binary_model2.fit(
    X_train_array,
    y_train_binary,
    sample_weight=np.ones(X_train_array.shape[0])
)
binary_pred_1 = binary_model1.predict_proba(X_test_array)[:,1]
binary_pred_2 = binary_model2.predict(X_test_array)
binary_y_pred_diff = binary_pred_1-binary_pred_2
# with binary labels, the binary and cross_entropy objectives produce the same predictions
np.testing.assert_almost_equal(binary_pred_1, binary_pred_2, decimal=4)
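binary_y_pred_diff is computed above but never inspected; a quick summary (just a sketch) makes the agreement explicit:
print('max abs diff: ', np.abs(binary_y_pred_diff).max())
print('mean abs diff:', np.abs(binary_y_pred_diff).mean())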
experiment 2: training with the cross_entropy objective on compressed data can differ from training with the binary objective on the original data (not sure why)
######## data for experiment 2
def make_compressed_df(X, fixed_ratio=None):
    """
    simulate compressed data: instances that share the same feature are deduplicated,
    the label becomes the mean of the instance labels, and the weight becomes the sum
    of the instance weights.
    args:
        fixed_ratio: int or None; if int, the ratio of pos_count to neg_count is
            constant across rows (the key of the experiment!)
    ex.
    original_data:             compressed_data:
    feature, label, weight     feature, label, pos_count, neg_count, weight
    x1, 1, 1
    x1, 1, 1               --> x1, 2/3, 2, 1, 3
    x1, 0, 1
    -------------------------------------------------
    x2, 0, 1
    x2, 1, 1               --> x2, 1/2, 1, 1, 2
    -------------------------------------------------
    x3, 1, 1
    x3, 1, 1               --> x3, 2/2, 2, 0, 2
    """
    compressed_df = pd.DataFrame(X)
    pos_count = np.random.randint(1, 3, size=(X.shape[0]))
    compressed_df['pos_count'] = pos_count
    if fixed_ratio:
        compressed_df['neg_count'] = int(fixed_ratio) * compressed_df['pos_count']
    else:
        neg_count = np.random.randint(1, 3, size=(X.shape[0]))
        compressed_df['neg_count'] = neg_count
    compressed_df['total_count'] = compressed_df['pos_count'] + compressed_df['neg_count']
    compressed_df['weight'] = compressed_df['pos_count'] + compressed_df['neg_count']
    compressed_df['label'] = compressed_df['pos_count'] / compressed_df['total_count']
    return compressed_df
def restore_data(df):
    """
    restore the original features, labels and weights from pos_count and neg_count.
    each compressed instance is repeated (pos_count + neg_count) times, the labels become
    [1]*pos_count + [0]*neg_count, and the weight becomes weight/(pos_count + neg_count).
    ex.
    compressed_data:                              original_data:
    feature, label, pos_count, neg_count, weight  feature, label, weight
                                                  x1, 1, 1
    x1, 2/3, 2, 1, 3                          --> x1, 1, 1
                                                  x1, 0, 1
    -------------------------------------------------
                                                  x2, 0, 1
    x2, 1/2, 1, 1, 2                          --> x2, 1, 1
    -------------------------------------------------
                                                  x3, 1, 1
    x3, 2/2, 2, 0, 2                          --> x3, 1, 1
    """
    # .copy() so the label assignment below does not trigger SettingWithCopyWarning
    pos_df = df.loc[df.index.repeat(df['pos_count'])].copy()
    pos_df['label'] = 1
    neg_df = df.loc[df.index.repeat(df['neg_count'])].copy()
    neg_df['label'] = 0
    df = pd.concat([pos_df, neg_df], axis=0)
    del pos_df, neg_df
    df['weight'] = df['weight'] / df['total_count']
    df = df.drop(['pos_count', 'neg_count', 'total_count'], axis=1)
    return df
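a quick round-trip check on the two helpers above (just a sketch on a small slice of X; the seed is arbitrary): restoring a compressed frame should preserve the total weight and the weighted label sum.
np.random.seed(0)
_compressed = make_compressed_df(X[:20])
_restored = restore_data(_compressed)
# each compressed row of weight total_count becomes total_count restored rows of weight 1
np.testing.assert_almost_equal(_compressed['weight'].sum(), _restored['weight'].sum())
# the weighted label sum (the positive mass) is preserved as well
np.testing.assert_almost_equal((_compressed['label'] * _compressed['weight']).sum(),
                               (_restored['label'] * _restored['weight']).sum())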
def make_compressed_and_restored_data(X, fixed_ratio):
    np.random.seed(42)
    compressed_df = make_compressed_df(X, fixed_ratio)
    compressed_train_df, compressed_test_df = train_test_split(
        compressed_df, test_size=0.3, random_state=1)
    restored_train_df = restore_data(compressed_train_df)
    restored_test_df = restore_data(compressed_test_df)
    return (compressed_train_df, compressed_test_df), (restored_train_df, restored_test_df)
# when ratio of pos_count/neg_count is not fixed, objectives are different
(compressed_train_random_ratio_df, compressed_test_df), \
    (restored_train_random_ratio_df, restored_test_random_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=None)
model1 = LGBMClassifier(objective='binary')
model1.set_params(**lgbm_params)
model1.fit(
    restored_train_random_ratio_df.iloc[:, :30],
    restored_train_random_ratio_df['label'],
    sample_weight=restored_train_random_ratio_df['weight']
)
model2 = LGBMRegressor(objective='cross_entropy')
model2.set_params(**lgbm_params)
model2.fit(
    compressed_train_random_ratio_df.iloc[:, :30],
    compressed_train_random_ratio_df['label'],
    sample_weight=compressed_train_random_ratio_df['weight']
)
y1 = model1.predict_proba(compressed_test_df.iloc[:,:30])[:,1]
y2 = model2.predict(compressed_test_df.iloc[:,:30])
# this assertion fails
np.testing.assert_almost_equal(y1, y2, decimal=4)
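if the failing assertion above is commented out (or run interactively), the size of the disagreement and its effect on the weighted log loss can be summarized; a sketch reusing the logloss helper defined earlier (fractional labels are fine there):
print('max abs diff: ', np.abs(y1 - y2).max())
print('mean abs diff:', np.abs(y1 - y2).mean())
print('weighted logloss, binary on restored data:         ',
      logloss(compressed_test_df['label'].values, y1, compressed_test_df['weight'].values))
print('weighted logloss, cross_entropy on compressed data:',
      logloss(compressed_test_df['label'].values, y2, compressed_test_df['weight'].values))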
# when ratio of pos_count/neg_count is fixed, objectives are same
(compressed_train_fixed_ratio_df, compressed_test_fixed_ratio_df), \
    (restored_train_fixed_ratio_df, restored_test_fixed_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=2)
model3 = LGBMClassifier(objective='binary')
model3.set_params(**lgbm_params)
model3.fit(
    restored_train_fixed_ratio_df.iloc[:, :30],
    restored_train_fixed_ratio_df['label'],
    sample_weight=restored_train_fixed_ratio_df['weight']
)
model4 = LGBMRegressor(objective='cross_entropy')
model4.set_params(**lgbm_params)
model4.fit(
    compressed_train_fixed_ratio_df.iloc[:, :30],
    compressed_train_fixed_ratio_df['label'],
    sample_weight=compressed_train_fixed_ratio_df['weight']
)
y3 = model3.predict_proba(compressed_test_fixed_ratio_df.iloc[:,:30])[:,1]
y4 = model4.predict(compressed_test_fixed_ratio_df.iloc[:,:30])
# this assertion passes
np.testing.assert_almost_equal(y3, y4, decimal=4)