I need some help with C4.5 algo. I got one task to write that algo by myself and build a decision tree on a train data and then test it.
Using data from UCI Machine Learning Repository. I wrote some code, got recursion child in child and it built wrong tree.
Here is how i build tree
def build_tree(parent):
for sublist in parent.split_list:
child = Node()
child.parent_atr = parent.atr
child.parent_type = sublist[parent.atr][0]
if entropy(sublist) == 0.0:
child.class_type = sublist[CLASS][0]
parent.children.append(child)
else:
atr, split_list = get_best_attribute(sublist.copy().drop([parent.atr], axis=1))
if atr is None:
child.class_type = round(sum(sublist[CLASS]/len(sublist)))
parent.children.append(child)
else:
child.atr = atr
child.split_list = split_list
parent.children.append(child)
return build_tree(child)
The Node class:
class Node:
children = list()
split_list = pd.DataFrame()
def __int__(self, param_name="", parent_type="", parent_atr="", class_type=None):
self.atr = param_name
self.parent_type = parent_type
self.parent_atr = parent_atr
self.class_type = class_type
and get_best_attribute is
def get_best_attribute(data):
ratios = dict()
dict_list = dict()
if len(data.columns) == 1:
return None, data
for column_name in data:
if column_name == CLASS:
continue
uniq_values = data[column_name].unique()
subarray_list = list()
for value in uniq_values:
subarray_list.append(data.loc[data[column_name] == value].copy())
ratios[column_name] = gain_ratio(data, subarray_list)
dict_list[column_name] = subarray_list
ans = max(ratios.items(), key=operator.itemgetter(1))[0]
return ans, dict_list[ans]
I checkd the formulas (entropy and e.t.c), they work correctly.
When debugging got this recurion.
What I'm doing wrong?