problem trying to solve: compress training instances that share the same feature by replacing the label with the weighted mean of the labels and the weight with the sum of the weights, while keeping the weighted binary log loss unchanged (so the compressed data can be trained with the cross_entropy objective). Here is an example, followed by log_loss test cases showing that the binary log loss on the original data is equivalent to the weighted log loss on the compressed data; a small pandas sketch of the compression follows the example.
original_data:                            compressed_data:
feature, label, weight, prediction        feature, label, weight, prediction
x1, 1, 1, 0.8                             x1, 1/3, 3, 0.8
x1, 0, 2, 0.8                         -->
x2, 1, 2, 0.1                             x2, 2/3, 3, 0.1
x2, 0, 1, 0.1
x3, 1, 1, 0.9                             x3, 1, 1, 0.9
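the compression itself can be written as a pandas groupby (a minimal sketch for the toy table above, not the production pipeline; the column names are made up for illustration): group by feature, take the weight-weighted mean of the labels, and sum the weights.
import numpy as np
import pandas as pd
raw = pd.DataFrame({
    'feature': ['x1', 'x1', 'x2', 'x2', 'x3'],
    'label':   [1, 0, 1, 0, 1],
    'weight':  [1, 2, 2, 1, 1],
})
# weighted label mass per row, so the group mean is sum(label*weight)/sum(weight)
raw['label_x_weight'] = raw['label'] * raw['weight']
compressed = raw.groupby('feature', as_index=False).agg(
    label_x_weight=('label_x_weight', 'sum'),
    weight=('weight', 'sum'),
)
compressed['label'] = compressed['label_x_weight'] / compressed['weight']
print(compressed[['feature', 'label', 'weight']])
#   feature     label  weight
# 0      x1  0.333333       3
# 1      x2  0.666667       3
# 2      x3  1.000000       1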
issue: the binary objective is not always equivalent to the cross_entropy objective in lgbm. The change in model performance metrics (log loss, average precision, ROC AUC) is mild, but the differences in the actual predictions and in the prediction distribution are quite significant. Experiment 1 shows the two objectives are equivalent in the binary-label case, while Experiment 2 shows there are cases where the binary objective on the original data does not align with the cross_entropy objective on the compressed data (check out the examples for more details).
first, verify with numpy that the weighted binary log loss on the original data is the same as the weighted log loss on the compressed data
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
import lightgbm
# use X of the breast cancer data as training features for both experiments 1 and 2
X, _ = load_breast_cancer(return_X_y=True)
def logloss(y_true, y_pred, weight):
    l = np.mean((-(y_true * np.log(y_pred)) - ((1 - y_true) * np.log(1 - y_pred))) * weight)
    # normalize by the total weight so the result is the weighted mean log loss
    l = l * y_true.shape[0] / weight.sum()
    return l
"""
feature, label, weight, prediction        feature, label, weight, prediction
x1, 1, 1/3, 0.7
x1, 1, 1/3, 0.7                       --> x1, 2/3, 1, 0.7
x1, 0, 1/3, 0.7
"""
l1 = logloss(np.array([1,1,0]), np.array([0.7,0.7,.7]), np.array([1/3,1/3,1/3]))
l2 = logloss(np.array([2/3]), np.array([0.7]), np.array([1]))
"""
feature, label, weight, prediction        feature, label, weight, prediction
x1, 1, 1, 0.8                             x1, 1/3, 3, 0.8
x1, 0, 2, 0.8                         -->
x2, 1, 2, 0.1                             x2, 2/3, 3, 0.1
x2, 0, 1, 0.1
x3, 1, 1, 0.9                             x3, 1, 1, 0.9
"""
l3 = logloss(np.array([1,0,1,0,1]),
             np.array([0.8,0.8,0.1,0.1,0.9]),
             np.array([1,2,2,1,1]))
l4 = logloss(np.array([1/3,2/3,1]), np.array([0.8,0.1,0.9]), np.array([3,3,1]))
np.testing.assert_almost_equal(l1, l2, decimal=4)
np.testing.assert_almost_equal(l3, l4, decimal=4)
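the assertions pass because, within a group that shares one feature and therefore one prediction p, sum_i w_i * (-y_i*log(p) - (1-y_i)*log(1-p)) = W * (-ybar*log(p) - (1-ybar)*log(1-p)), where W = sum_i w_i and ybar = sum_i w_i*y_i / W, which is exactly the compressed row. as an extra sanity check (a sketch, assuming sklearn.metrics.log_loss with sample_weight, which returns the weight-averaged per-instance loss), the custom logloss above agrees with sklearn on the uncompressed binary rows:
from sklearn.metrics import log_loss
sk_l1 = log_loss(np.array([1,1,0]), np.array([0.7,0.7,0.7]),
                 sample_weight=np.array([1/3,1/3,1/3]))
sk_l3 = log_loss(np.array([1,0,1,0,1]), np.array([0.8,0.8,0.1,0.1,0.9]),
                 sample_weight=np.array([1,2,2,1,1]))
np.testing.assert_almost_equal(l1, sk_l1, decimal=4)
np.testing.assert_almost_equal(l3, sk_l3, decimal=4)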
experiment 1 (the binary objective is equivalent to the cross_entropy objective in the binary-label case):
######## data for experiment 1
np.random.seed(42)
n = X.shape[0]
y_binary = np.random.randint(0,2,size=(n))
eps = 1e-2
y_float = np.random.uniform(eps,1-eps,size=(n))
lgbm_params = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 1,
    'importance_type': 'split',
    'learning_rate': 0.06472914709339864,
    'max_depth': 46,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'n_estimators': 20,
    'n_jobs': 1,
    'num_leaves': 178,
    'random_state': 1574094090,
    'reg_alpha': 0.4894283599023894,
    'reg_lambda': 0.09743058458885945,
    'silent': True,
    'subsample': 1,
    # 'subsample_for_bin': 200000,  # try larger values (10M+)
    # 'subsample_freq': 252,
    'min_data_in_bin': 1,
    'min_child_samples': 1,
}
X_train_array, X_test_array, y_train_binary, y_test_binary, y_train_float, y_test_float = \
    train_test_split(X, y_binary, y_float, test_size=0.3, random_state=1)
##### binary-label case in the sklearn API: the binary objective is equivalent to the cross_entropy objective
binary_model1 = LGBMClassifier(objective='binary')
binary_model1.set_params(**lgbm_params)
binary_model1.fit(
    X_train_array,
    y_train_binary,
    sample_weight=np.ones(X_train_array.shape[0])
)
binary_model2 = LGBMRegressor(objective='cross_entropy')
binary_model2.set_params(**lgbm_params)
binary_model2.fit(
    X_train_array,
    y_train_binary,
    sample_weight=np.ones(X_train_array.shape[0])
)
binary_pred_1 = binary_model1.predict_proba(X_test_array)[:,1]
binary_pred_2 = binary_model2.predict(X_test_array)
binary_y_pred_diff = binary_pred_1-binary_pred_2
# with binary labels, the binary and cross_entropy objectives produce the same predictions
np.testing.assert_almost_equal(binary_pred_1, binary_pred_2, decimal=4)
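binary_y_pred_diff is computed above but never inspected; a quick summary (just a sketch) makes the agreement explicit:
print('max abs diff: ', np.abs(binary_y_pred_diff).max())
print('mean abs diff:', np.abs(binary_y_pred_diff).mean())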
experiment 2: training with the cross_entropy objective on compressed data can differ from training with the binary objective on the original data (not sure why)
######## data for experiment 2
def make_compressed_df(X, fixed_ratio=None):
    """
    simulate compressed data: instances that share the same feature are deduplicated,
    the label becomes the mean of the instance labels, and the weight becomes the sum
    of the instance weights.
    args:
        fixed_ratio: int or None; if int, the ratio of pos_count to neg_count is
            constant across rows (the key of the experiment!)
    ex.
    original_data:             compressed_data:
    feature, label, weight     feature, label, pos_count, neg_count, weight
    x1, 1, 1
    x1, 1, 1               --> x1, 2/3, 2, 1, 3
    x1, 0, 1
    -------------------------------------------------
    x2, 0, 1
    x2, 1, 1               --> x2, 1/2, 1, 1, 2
    -------------------------------------------------
    x3, 1, 1
    x3, 1, 1               --> x3, 2/2, 2, 0, 2
    """
    compressed_df = pd.DataFrame(X)
    pos_count = np.random.randint(1, 3, size=(X.shape[0]))
    compressed_df['pos_count'] = pos_count
    if fixed_ratio:
        compressed_df['neg_count'] = int(fixed_ratio) * compressed_df['pos_count']
    else:
        neg_count = np.random.randint(1, 3, size=(X.shape[0]))
        compressed_df['neg_count'] = neg_count
    compressed_df['total_count'] = compressed_df['pos_count'] + compressed_df['neg_count']
    compressed_df['weight'] = compressed_df['pos_count'] + compressed_df['neg_count']
    compressed_df['label'] = compressed_df['pos_count'] / compressed_df['total_count']
    return compressed_df
def restore_data(df):
    """
    restore the original features, labels and weights from pos_count and neg_count.
    each compressed instance is repeated (pos_count + neg_count) times, the labels become
    [1]*pos_count + [0]*neg_count, and the weight becomes weight/(pos_count + neg_count).
    ex.
    compressed_data:                              original_data:
    feature, label, pos_count, neg_count, weight  feature, label, weight
                                                  x1, 1, 1
    x1, 2/3, 2, 1, 3                          --> x1, 1, 1
                                                  x1, 0, 1
    -------------------------------------------------
                                                  x2, 0, 1
    x2, 1/2, 1, 1, 2                          --> x2, 1, 1
    -------------------------------------------------
                                                  x3, 1, 1
    x3, 2/2, 2, 0, 2                          --> x3, 1, 1
    """
    # .copy() so the label assignment below does not trigger SettingWithCopyWarning
    pos_df = df.loc[df.index.repeat(df['pos_count'])].copy()
    pos_df['label'] = 1
    neg_df = df.loc[df.index.repeat(df['neg_count'])].copy()
    neg_df['label'] = 0
    df = pd.concat([pos_df, neg_df], axis=0)
    del pos_df, neg_df
    df['weight'] = df['weight'] / df['total_count']
    df = df.drop(['pos_count', 'neg_count', 'total_count'], axis=1)
    return df
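a quick round-trip check on the two helpers above (just a sketch on a small slice of X; the seed is arbitrary): restoring a compressed frame should preserve the total weight and the weighted label sum.
np.random.seed(0)
_compressed = make_compressed_df(X[:20])
_restored = restore_data(_compressed)
# each compressed row of weight total_count becomes total_count restored rows of weight 1
np.testing.assert_almost_equal(_compressed['weight'].sum(), _restored['weight'].sum())
# the weighted label sum (the positive mass) is preserved as well
np.testing.assert_almost_equal((_compressed['label'] * _compressed['weight']).sum(),
                               (_restored['label'] * _restored['weight']).sum())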
def make_compressed_and_restored_data(X, fixed_ratio):
    np.random.seed(42)
    compressed_df = make_compressed_df(X, fixed_ratio)
    compressed_train_df, compressed_test_df = train_test_split(
        compressed_df, test_size=0.3, random_state=1)
    restored_train_df = restore_data(compressed_train_df)
    restored_test_df = restore_data(compressed_test_df)
    return (compressed_train_df, compressed_test_df), (restored_train_df, restored_test_df)
# when ratio of pos_count/neg_count is not fixed, objectives are different
(compressed_train_random_ratio_df, compressed_test_df), \
    (restored_train_random_ratio_df, restored_test_random_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=None)
model1 = LGBMClassifier(objective='binary')
model1.set_params(**lgbm_params)
model1.fit(
    restored_train_random_ratio_df.iloc[:, :30],
    restored_train_random_ratio_df['label'],
    sample_weight=restored_train_random_ratio_df['weight']
)
model2 = LGBMRegressor(objective='cross_entropy')
model2.set_params(**lgbm_params)
model2.fit(
    compressed_train_random_ratio_df.iloc[:, :30],
    compressed_train_random_ratio_df['label'],
    sample_weight=compressed_train_random_ratio_df['weight']
)
y1 = model1.predict_proba(compressed_test_df.iloc[:,:30])[:,1]
y2 = model2.predict(compressed_test_df.iloc[:,:30])
# this assertion fails
np.testing.assert_almost_equal(y1, y2, decimal=4)
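if the failing assertion above is commented out (or run interactively), the size of the disagreement and its effect on the weighted log loss can be summarized; a sketch reusing the logloss helper defined earlier (fractional labels are fine there):
print('max abs diff: ', np.abs(y1 - y2).max())
print('mean abs diff:', np.abs(y1 - y2).mean())
print('weighted logloss, binary on restored data:         ',
      logloss(compressed_test_df['label'].values, y1, compressed_test_df['weight'].values))
print('weighted logloss, cross_entropy on compressed data:',
      logloss(compressed_test_df['label'].values, y2, compressed_test_df['weight'].values))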
# when ratio of pos_count/neg_count is fixed, objectives are same
(compressed_train_fixed_ratio_df, compressed_test_fixed_ratio_df), \
    (restored_train_fixed_ratio_df, restored_test_fixed_ratio_df) = \
    make_compressed_and_restored_data(X, fixed_ratio=2)
model3 = LGBMClassifier(objective='binary')
model3.set_params(**lgbm_params)
model3.fit(
    restored_train_fixed_ratio_df.iloc[:, :30],
    restored_train_fixed_ratio_df['label'],
    sample_weight=restored_train_fixed_ratio_df['weight']
)
model4 = LGBMRegressor(objective='cross_entropy')
model4.set_params(**lgbm_params)
model4.fit(
    compressed_train_fixed_ratio_df.iloc[:, :30],
    compressed_train_fixed_ratio_df['label'],
    sample_weight=compressed_train_fixed_ratio_df['weight']
)
y3 = model3.predict_proba(compressed_test_fixed_ratio_df.iloc[:,:30])[:,1]
y4 = model4.predict(compressed_test_fixed_ratio_df.iloc[:,:30])
# this assertion passes
np.testing.assert_almost_equal(y3, y4, decimal=4)