I'm relatively new to Decision Trees and I'm stuck with my decision tree algorithm. I'm using cross-validation and parameter tuning to optimize the classification, following this example: https://medium.com/@haydar_ai/learning-data-science-day-22-cross-validation-and-parameter-tuning-b14bcbc6b012. However I tune my parameters, I always get results looking like this (here just an example for a small tree):
I don't understand the reason for this behaviour. Why does the tree generate leaves with the same class (here class2)? Why does it not simply stop after a<=0.375 = TRUE and cut off the leaves with the same class (see picture, red rectangle)? Is there a way to prevent this and make the algorithm stop at this point? Or is there a reasonable explanation for this behaviour? Any help or ideas would be highly appreciated! Thanks!
EDIT: Here is my code:
def load_csv(filename):
    """Load a CSV file into a list of rows, skipping blank rows.

    Args:
        filename: Path to the CSV file to read.

    Returns:
        A list of rows, each row a list of string fields.
    """
    dataset = list()
    # newline='' is required by the csv module so that quoted fields
    # containing embedded newlines are parsed correctly.
    with open(filename, 'r', newline='') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            # Blank lines come back as empty lists; skip them.
            if not row:
                continue
            dataset.append(row)
    return dataset
def str_column_to_float(dataset, column):
    """Convert one column of *dataset* from str to float, in place.

    Leading/trailing whitespace is stripped before parsing.
    """
    for record in dataset:
        value = record[column].strip()
        record[column] = float(value)
# --- Load the dataset and split it into features (x) and labels (y) ---
filename = 'C:/Test.csv'
dataset = load_csv(filename)

# The CSV reader yields strings; convert every column to float.
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# The last column holds the class label; all preceding columns are features.
x = [row[:-1] for row in dataset]
y = [row[-1] for row in dataset]

features_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
labels = ['class1', 'class2']
# Parameter combinations tried during tuning (kept for reference) —
# every setting still produced terminal leaves sharing the same class.
"""dtree=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
max_features=8, max_leaf_nodes=None, min_impurity_decrease = 0.0, min_impurity_split = None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='random')"""

# Small illustrative tree: depth capped at 2.
dtree = DecisionTreeClassifier(max_depth=2)
dtree.fit(x, y)

# First render: bare tree structure, written to "Result".
graph = graphviz.Source(tree.export_graphviz(dtree, out_file=None))
graph.render("Result")

# Second render: annotated tree with feature and class names, as a PNG.
dot_data = tree.export_graphviz(
    dtree,
    out_file=None,
    feature_names=features_names,
    class_names=labels,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.format = 'png'
graph.render('Result', view=True)
... and a snapshot of my Data: