I printed the "Best accuracy" print("Best accuracy:", best_accuracy)
for my model within the console, and it shows me Best accuracy: 0.88
, whereas the accuracy for my specific model is Accuracy: 0.83
Is there any chance to change anything on code or parameters to find out how to get to the best accuarcy
Best accuracy: 0.8878504672897196
Best model: DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=6,
                                   min_samples_split=5)
Accuracy: 0.8333333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.99      0.90        83
           1       0.89      0.32      0.47        25

    accuracy                           0.83       108
   macro avg       0.86      0.65      0.69       108
weighted avg       0.84      0.83      0.80       108
The parameters are the following:
# Training, validation and test set split
X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, test_size=0.5, random_state=42)

best_clf = None
best_accuracy = 0.0

# Loop over different max_depths
for max_depth in range(1, 20):
    # Decision tree classifier and training
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, min_samples_split=5, min_samples_leaf=6)
    clf.fit(X_train, Y_train)
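I also wondered whether a systematic search over several parameters at once would do better than my single loop over max_depth. Here is a minimal sketch of what I have in mind (assuming scikit-learn's GridSearchCV is applicable here; the grid values below are illustrative guesses, not tuned choices):

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Illustrative grid over the same parameters the loop touches
param_grid = {
    "max_depth": list(range(1, 20)),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 6],
}

search = GridSearchCV(
    DecisionTreeClassifier(criterion="entropy", random_state=42),
    param_grid,
    scoring="accuracy",
    cv=5,  # 5-fold cross-validation on the training set
)
search.fit(X_train, Y_train)
print("Best CV accuracy:", search.best_score_)
print("Best parameters:", search.best_params_)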
See the entire code below for more information:
import numpy as np
import pandas as pd
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED
import matplotlib.pyplot as plt
# Load CSV file
data = pd.read_csv("Basis_DecisionTree_1106.csv", sep=";", header=0)
# Exclude Personalnummer
data = data.drop(columns=['PersNr'])

# Convert the categorical variable Maschine to numeric (0/1)
data['Maschine'] = data['Maschine'].replace(['Stock Order', 'New Machine'], [0, 1])
# Convert the columns 'Gehalt_PY' and 'AU_Detail' from string to float
def convert_currency(val):
    new_val = val.replace('.', '').replace(',', '.')
    return float(new_val)

def convert_decimal(val):
    new_val = val.replace(',', '.')
    return float(new_val)
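# Example of what these helpers do (assuming German-style number
# formatting in the CSV, which is my reading of the replace calls):
#   convert_currency("1.234,56") -> 1234.56
#   convert_decimal("0,85")      -> 0.85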
data['Gehalt_PY'] = data['Gehalt_PY'].apply(convert_currency)
data['AU_Detail'] = data['AU_Detail'].apply(convert_decimal)
data['VZK_PY'] = data['VZK_PY'].apply(convert_decimal)
# Apply One-Hot-Encoding to the column 'Arbeitsplatz_Technologie'
one_hot = pd.get_dummies(data['Arbeitsplatz_Technologie'])
# Drop column 'Arbeitsplatz_Technologie' as it is now encoded
data = data.drop('Arbeitsplatz_Technologie', axis=1)
# Join the encoded df
data = data.join(one_hot)
# X & Y Variables
feature_names = ['Age', 'Company Affiliation', 'AU_Detail', 'VZK_PY', 'P_noTravelDays',
                 'Marital Status', 'Children', 'Gehalt_PY', 'Maschine', 'P_TravelDays',
                 'AU', 'Presence_PY', 'P_A', 'P_AP', 'P_C', 'P_EW', 'P_EUR', 'P_GER',
                 'P_MEA', 'P_NAmerica', 'P_SAmerica', 'Dispatching_Level'] + list(one_hot.columns)
Y = data['Churn']
X = data[feature_names]
# Training, Validation and Test set split
X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, test_size=0.5, random_state=42)
best_clf = None
best_accuracy = 0.0
# Loop over different max_depths
for max_depth in range(1, 20):
    # Decision tree classifier and training
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, min_samples_split=5, min_samples_leaf=6)
    clf.fit(X_train, Y_train)

    # Predictions on the validation set
    Y_val_pred = clf.predict(X_val)

    # Evaluate the predictions
    accuracy = accuracy_score(Y_val, Y_val_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_clf = clf
print("Best accuracy:", best_accuracy)
print("Best model:", best_clf)
# Predictions on the test set with the best model
Y_test_pred = best_clf.predict(X_test)
# Post-Pruning
def is_leaf(inner_tree, index):
    # Check whether the node is a leaf node
    return (inner_tree.children_left[index] == TREE_LEAF and
            inner_tree.children_right[index] == TREE_LEAF)

def prune_index(inner_tree, decisions, index=0):
    # Start pruning from the bottom - if we start from the top, we might miss
    # nodes that become leaves during pruning.
    # Do not use this directly - use prune_duplicate_leaves instead.
    if not is_leaf(inner_tree, inner_tree.children_left[index]):
        prune_index(inner_tree, decisions, inner_tree.children_left[index])
    if not is_leaf(inner_tree, inner_tree.children_right[index]):
        prune_index(inner_tree, decisions, inner_tree.children_right[index])

    # Prune children if both children are leaves now and make the same decision:
    if (is_leaf(inner_tree, inner_tree.children_left[index]) and
        is_leaf(inner_tree, inner_tree.children_right[index]) and
        (decisions[index] == decisions[inner_tree.children_left[index]]) and
        (decisions[index] == decisions[inner_tree.children_right[index]])):
        # Turn the node into a leaf by "unlinking" its children
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
        inner_tree.feature[index] = TREE_UNDEFINED
        ##print("Pruned {}".format(index))

def prune_duplicate_leaves(mdl):
    # Remove sibling leaves that make the same decision
    decisions = mdl.tree_.value.argmax(axis=2).flatten().tolist()  # Decision for each node
    prune_index(mdl.tree_, decisions)
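# Note: prune_duplicate_leaves is defined above but never called, so no
# post-pruning actually happens before the test-set evaluation. A minimal
# sketch of how it could be applied (my assumption about the intended use;
# it prunes best_clf.tree_ in place):
# prune_duplicate_leaves(best_clf)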
# Feature Importance
importance = best_clf.feature_importances_
# Create a DataFrame from Features and their Importance
feature_importance = pd.DataFrame(list(zip(feature_names, importance)),
                                  columns=['Feature', 'Importance'])
# Sort the DataFrame by Importance
feature_importance = feature_importance.sort_values('Importance', ascending=False)
# Display the sorted Feature Importance
print(feature_importance)
# Plot the Feature Importance
plt.bar(feature_importance['Feature'], feature_importance['Importance'])
plt.xticks(rotation='vertical')
plt.show()
# Evaluate the predictions
accuracy = accuracy_score(Y_test, Y_test_pred)
print("Accuracy:", accuracy)
classificationReport = classification_report(Y_test, Y_test_pred)
print("Classification Report:\n", classificationReport)
confusionMatrix = confusion_matrix(Y_test, Y_test_pred)
print("Confusion Matrix:\n", confusionMatrix)
# Visualizing the decision tree with Graphviz
dot_data = export_graphviz(best_clf, out_file=None, feature_names=feature_names,
                           class_names=["0", "1"], filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('prediction_tree_pruning.png')
I have already tried changing the parameters max_depth=4 and min_samples_split=5, but the test accuracy did not reach the best validation accuracy.
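Would it also help to use scikit-learn's built-in cost-complexity pruning instead of the manual pruning helpers above? A minimal sketch of what I mean (choosing ccp_alpha on the validation set is my own idea, not part of the code above):

# Grow the cost-complexity pruning path on the training data, then pick
# the alpha that scores best on the validation set.
path = DecisionTreeClassifier(criterion="entropy", random_state=42).cost_complexity_pruning_path(X_train, Y_train)

best_alpha, best_val_acc = 0.0, 0.0
for alpha in path.ccp_alphas:
    pruned = DecisionTreeClassifier(criterion="entropy", ccp_alpha=alpha, random_state=42)
    pruned.fit(X_train, Y_train)
    acc = accuracy_score(Y_val, pruned.predict(X_val))
    if acc > best_val_acc:
        best_alpha, best_val_acc = alpha, acc

print("Best ccp_alpha:", best_alpha, "validation accuracy:", best_val_acc)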