0

I printed the best accuracy for my model with print("Best accuracy:", best_accuracy), and the console shows Best accuracy: 0.88, whereas the accuracy reported for my specific model is Accuracy: 0.83.

Is there anything I can change in the code or the parameters to reach the best accuracy?

Best accuracy: 0.8878504672897196
Best model: DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=6,
                       min_samples_split=5)
Accuracy: 0.8333333333333334
Klassifikation Report:
               precision    recall  f1-score   support

           0       0.83      0.99      0.90        83
           1       0.89      0.32      0.47        25

    accuracy                           0.83       108
   macro avg       0.86      0.65      0.69       108
weighted avg       0.84      0.83      0.80       108

The parameters are as follows:

# Training, Validation and Test set split
# The first call holds out 20% of the rows (test_size=0.2); the second call
# halves that hold-out (test_size=0.5) into a validation part and a test part.
X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, test_size=0.2 , random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, test_size=0.5, random_state=42)

# Best model found so far and its accuracy
best_clf = None
best_accuracy = 0.0

# Loop over different max_depths
# NOTE(review): the loop variable max_depth is not used in the body below;
# the classifier is built with the fixed value max_depth=4 on every iteration.
for max_depth in range(1, 20):  

    # Decision Tree Classifier and Training
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=4, min_samples_split=5, min_samples_leaf=6 )
    clf.fit(X_train, Y_train)

See entire code here for more information:

import numpy as np
import pandas as pd
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED
import matplotlib.pyplot as plt

# Load CSV file (semicolon-separated, first row holds the column names)
data = pd.read_csv("Basis_DecisionTree_1106.csv", sep=";", header=0)

# Exclude the personnel number column (PersNr); 'Maschine' is kept and encoded below
data = data.drop(columns=['PersNr'])

# Convert the categorical variable Maschine ('Stock Order' -> 0, 'New Machine' -> 1).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['Maschine']: that in-place call on a column
# selection is chained assignment, which pandas warns about and which does not
# update `data` when Copy-on-Write is enabled.
data['Maschine'] = data['Maschine'].replace(['Stock Order', 'New Machine'], [0, 1])

# Convert the columns 'Gehalt_PY' and 'AU_Detail' from string to float
def convert_currency(val):
    """Parse a currency string such as '1.234,56' into a float.

    Every '.' is removed and every ',' becomes the decimal point before the
    text is handed to float().
    """
    # Single pass over the characters: drop '.', turn ',' into '.'
    normalized = val.translate({ord('.'): None, ord(','): '.'})
    return float(normalized)

def convert_decimal(val):
    """Parse a number written with a decimal comma (e.g. '0,75') into a float."""
    # Splitting on ',' and re-joining with '.' swaps every comma for a point
    return float('.'.join(val.split(',')))

# Parse the numeric columns that are stored as text:
# 'Gehalt_PY' through convert_currency, 'AU_Detail' and 'VZK_PY' through convert_decimal
for column, parser in (
    ('Gehalt_PY', convert_currency),
    ('AU_Detail', convert_decimal),
    ('VZK_PY', convert_decimal),
):
    data[column] = data[column].apply(parser)

# One-hot encode 'Arbeitsplatz_Technologie', then swap the original column
# for the encoded indicator columns
one_hot = pd.get_dummies(data['Arbeitsplatz_Technologie'])
data = data.drop(columns='Arbeitsplatz_Technologie').join(one_hot)

# X & Y variables: the fixed feature columns plus the one-hot columns
# NOTE(review): the columns processed above are named 'Gehalt_PY' and
# 'Maschine', while this list refers to 'Salary_PY' and 'Machine' - confirm
# that every name here matches the CSV header.
feature_names = [
    'Age', 'Company Affiliation', 'AU_Detail', 'VZK_PY', 'P_noTravelDays',
    'Marital Status', 'Children', 'Salary_PY', 'Machine', 'P_TravelDays',
    'AU', 'Presence_PY', 'P_A', 'P_AP', 'P_C', 'P_EW', 'P_EUR', 'P_GER',
    'P_MEA', 'P_NAmerica', 'P_SAmerica', 'Dispatching_Level',
] + list(one_hot.columns)

Y = data['Churn']
X = data[feature_names]

# Training, Validation and Test set split: 20% of the rows are held out, and
# that hold-out is then halved into a validation set and a test set
X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, test_size=0.5, random_state=42)

best_clf = None
best_accuracy = 0.0

# Loop over different max_depths and keep the model that scores best on the
# validation set
for max_depth in range(1, 20):

    # Decision Tree Classifier and Training.
    # The loop variable is passed as max_depth: with the depth hard-coded to 4,
    # every iteration used the same hyperparameters and nothing was searched.
    # random_state is fixed so that ties between equally good splits are
    # broken the same way on every run and the candidates are comparable.
    clf = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=max_depth,
        min_samples_split=5,
        min_samples_leaf=6,
        random_state=42,
    )
    clf.fit(X_train, Y_train)

    # Predictions on the validation set
    Y_val_pred = clf.predict(X_val)

    # Evaluate the predictions. The strict '>' keeps the shallowest tree when
    # several depths tie; the None check guarantees a model is selected even
    # if no candidate scores above 0.0.
    accuracy = accuracy_score(Y_val, Y_val_pred)
    if best_clf is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_clf = clf

print("Best accuracy:", best_accuracy)
print("Best model:", best_clf)


# Predictions on the test set with the best model
Y_test_pred = best_clf.predict(X_test)


# Post-Pruning
def is_leaf(inner_tree, index):
    """Return whether node `index` of `inner_tree` is a leaf.

    A node counts as a leaf when both its left and its right child entry
    equal TREE_LEAF.
    """
    left_child = inner_tree.children_left[index]
    right_child = inner_tree.children_right[index]
    return left_child == TREE_LEAF and right_child == TREE_LEAF

def prune_index(inner_tree, decisions, index=0):
    """Recursively turn nodes into leaves when both children are leaves that
    make the same decision as the node itself.

    Parameters:
        inner_tree: tree structure exposing children_left, children_right and
            feature arrays; it is modified in place.
        decisions: per-node decision, indexed by node index.
        index: node to start from (defaults to 0).

    Returns nothing.
    """
    # Start pruning from the bottom - if we start from the top, we might miss
    # nodes that become leaves during pruning.
    # Do not use this directly - use prune_duplicate_leaves instead.
    # Descend first into every child that is not a leaf yet.
    if not is_leaf(inner_tree, inner_tree.children_left[index]):
        prune_index(inner_tree, decisions, inner_tree.children_left[index])
    if not is_leaf(inner_tree, inner_tree.children_right[index]):
        prune_index(inner_tree, decisions, inner_tree.children_right[index])

    # Prune children if both children are leaves now and make the same decision:
    # (the leaf checks are repeated because the recursion above may have just
    # turned a child into a leaf)
    if (is_leaf(inner_tree, inner_tree.children_left[index]) and
        is_leaf(inner_tree, inner_tree.children_right[index]) and
        (decisions[index] == decisions[inner_tree.children_left[index]]) and 
        (decisions[index] == decisions[inner_tree.children_right[index]])):
        # turn node into a leaf by "unlinking" its children
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
        inner_tree.feature[index] = TREE_UNDEFINED
        ##print("Pruned {}".format(index))

def prune_duplicate_leaves(mdl):
    """Prune sibling leaves of `mdl` that make the same decision as their parent.

    `mdl.tree_` is modified in place; nothing is returned.
    """
    # Decision for each node: position of the largest entry along axis 2 of
    # the tree's value array, flattened into a plain list
    node_values = mdl.tree_.value
    decisions = node_values.argmax(axis=2).flatten().tolist()
    prune_index(mdl.tree_, decisions)
    


# Feature Importance of the selected model
importance = best_clf.feature_importances_

# Pair every feature name with its importance and order the table from the
# most to the least important feature
feature_importance = pd.DataFrame(
    list(zip(feature_names, importance)),
    columns=['Feature', 'Importance'],
).sort_values('Importance', ascending=False)

# Display the sorted Feature Importance
print(feature_importance)

# Plot the Feature Importance as a bar chart with vertical feature labels
plt.bar(feature_importance['Feature'], feature_importance['Importance'])
plt.xticks(rotation='vertical')
plt.show()

# # Feature Importance
# importance = best_clf.feature_importances_

# # summarizing feature importance
# for i,v in enumerate(importance):
#     print('Feature: %s, Score: %.5f' % (feature_names[i],v))

# # plot feature importance
# plt.bar([x for x in range(len(importance))], importance)
# plt.xticks([x for x in range(len(importance))], feature_names, rotation='vertical')
# plt.show()

    
# Evaluate the test-set predictions: compute all three metrics, then print
# them in the order accuracy, classification report, confusion matrix
accuracy = accuracy_score(Y_test, Y_test_pred)
classificationReport = classification_report(Y_test, Y_test_pred)
confusionMatrix = confusion_matrix(Y_test, Y_test_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classificationReport)
print("Confusion Matrix:\n", confusionMatrix)

# Visualizing the decision tree with Graphviz: export the tree as DOT text
# (out_file=None returns it as a string) and render it to a PNG file
dot_data = export_graphviz(
    best_clf,
    out_file=None,
    feature_names=feature_names,
    class_names=["0", "1"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('prediction_tree_pruning.png')

Change of Parameters max_depth=4, min_samples_split=5

Elisa
  • 15
  • 5

1 Answers1

0
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, ...)

The reason "Best accuracy" and "Accuracy" differ is that "Best accuracy" is computed on the validation set (X_val/Y_val):

accuracy_score(Y_val, Y_val_pred)

while "Accuracy" is computed on the test set (X_test/Y_test):

accuracy = accuracy_score(Y_test, Y_test_pred)

Because the data in X_val/Y_val and X_test/Y_test are different, you shouldn't expect the same score on both of them.

linpingta
  • 2,324
  • 2
  • 18
  • 36
  • Thanks a lot for your explanation, @linpingta! Do you see any other way to get a better accuracy on the test set? – Elisa Jun 13 '23 at 08:26
  • You should not do that (I mean accessing the test data during training), because it will lead to overfitting of the model (in reality, you would not know test_y). Using the validation set to guard against overfitting, e.g. with early stopping, is enough – linpingta Jun 13 '23 at 15:36