I am using scikit-learn's RandomForestClassifier for a binary classification problem with imbalanced classes (negative class: 80%, positive class: 20%). When I apply the model to the training dataset (the same data it was fit on) or to the test dataset, the proportion of predicted positives is significantly lower than the actual proportion in the data (16% vs. 20%). I expected the proportion of predicted positives to always be close to the actual proportion.
Below is a reproducible example:
# Import libraries
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Generate data
X, Y = make_classification(n_samples=100000,
                           n_features=30,
                           n_redundant=0,
                           n_classes=2,
                           random_state=17,
                           weights=[0.8, 0.2])
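A quick way to confirm the realised class balance (as far as I understand, weights in make_classification is only approximate, partly because of the default flip_y label noise):
# Print the realised proportion of positive cases in the generated data
print('Actual % of positive cases: {:.2%}'.format(Y.mean()))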
# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=7)
# Train random forest classifier
model = RandomForestClassifier(n_estimators=50,
                               random_state=17,
                               criterion='entropy',
                               max_features='sqrt',
                               max_depth=10)
model.fit(x_train, y_train)
# Check average predicted score and proportion of predicted positive cases
# Train
y_pred_prob_train = model.predict_proba(x_train)
y_pred_class_train = model.predict(x_train)
print('Train: Average predicted score: {:.2%}'.format(np.mean(y_pred_prob_train[:,1])))
print('Train: Predicted % of positive cases: {:.2%}'.format(y_pred_class_train.sum()/len(y_pred_class_train)))
# Test
y_pred_prob_test = model.predict_proba(x_test)
y_pred_class_test = model.predict(x_test)
print('\nTest: Average predicted score: {:.2%}'.format(np.mean(y_pred_prob_test[:,1])))
print('Test: Predicted % of positive class: {:.2%}'.format(y_pred_class_test.sum()/len(y_pred_class_test)))
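To make the gap explicit, the predicted positive rate can be compared directly with the actual rate; I also verify my assumption that predict() for a binary classifier is equivalent to thresholding the averaged probabilities at 0.5:
# Compare actual vs. predicted positive rates on the test set
print('Actual % of positive cases (test): {:.2%}'.format(y_test.mean()))
print('Predicted % of positive cases (test): {:.2%}'.format(y_pred_class_test.mean()))
# Sanity check: predict() should match a manual 0.5 threshold on predict_proba
manual_pred = (y_pred_prob_test[:, 1] > 0.5).astype(int)
print('predict() equals manual 0.5 threshold: {}'.format(np.array_equal(manual_pred, y_pred_class_test)))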
The model performance looks very similar on the train and test datasets, so it doesn't seem that the model is overfitting.
# Get accuracy score and confusion matrix for train and test datasets
acc_train = model.score(x_train, y_train)
acc_test = model.score(x_test, y_test)
cm_train = confusion_matrix(y_train, y_pred_class_train, normalize = 'true')
cl_report_train = classification_report(y_train, y_pred_class_train)
cm_test = confusion_matrix(y_test, y_pred_class_test, normalize = 'true')
cl_report_test = classification_report(y_test, y_pred_class_test)
# Print results
print('MODEL ACCURACY:\n \
training data: {:.2%}\n \
test data: {:.2%}'.format(acc_train, acc_test))
print('\nCONFUSION MATRIX (train data):\n {}'.format(cm_train.round(3)))
print('\nCLASSIFICATION REPORT (train data):\n {}'.format(cl_report_train))
print('\nCONFUSION MATRIX (test data):\n {}'.format(cm_test.round(3)))
print('\nCLASSIFICATION REPORT (test data):\n {}'.format(cl_report_test))
Why is the proportion of predicted positive cases so different from the actual proportion? I expected the two to be relatively close when using the default threshold of 0.5. Is there a parameter I am missing when fitting the model?
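For context, the only class-balance-related parameter I am aware of is class_weight, which (as far as I understand) reweights classes during training rather than making the predicted proportion match the actual one. This is how I would set it, with model_balanced being just an illustrative name:
# Refit with balanced class weights (classes weighted inversely to their frequency)
model_balanced = RandomForestClassifier(n_estimators=50,
                                        random_state=17,
                                        criterion='entropy',
                                        max_features='sqrt',
                                        max_depth=10,
                                        class_weight='balanced')
model_balanced.fit(x_train, y_train)
print('Predicted % of positive cases (balanced weights): {:.2%}'.format(model_balanced.predict(x_test).mean()))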