For a school project I've created the following AdaBoost classifier implementation in Python:
import numpy as np

from DecisionStump import *
from sklearn.base import BaseEstimator


class AdaBoost(BaseEstimator):
    def __init__(self, boosting_rounds):
        self.boosting_rounds = boosting_rounds  # number of weak learners used

    def fit(self, x, y):
        # x: DataFrame of features, y: single-column DataFrame of +1/-1 labels
        self.feature_names_in_ = np.array(x.columns)  # BaseEstimator required
        self.n_features_in_ = x.shape[1]              # BaseEstimator required
        self.weak_learners = []                       # [ (learner, amount of say), ... ]
        self.decision_stumps = []
        n_rows = x.shape[0]      # number of examples in training set
        attributes = x.columns   # features in training set

        # dataset example weights, initially uniform
        d = np.full(n_rows, 1 / n_rows)

        # preparing all decision stumps: one per midpoint between
        # consecutive sorted unique values of each feature
        for a in attributes:
            values = x[a].unique()
            values.sort()
            thresholds = []
            for i in range(1, len(values)):
                thresholds.append((values[i] + values[i - 1]) / 2)
            for threshold in thresholds:
                self.decision_stumps.append(DecisionStump(a, threshold))

        for t in range(self.boosting_rounds):
            # choosing the decision stump that minimizes the weighted error
            min_error = float("inf")
            h = None  # chosen weak learner
            for ds in self.decision_stumps:
                ds_error = 0
                predictions = ds.predict(x)
                for i in range(n_rows):
                    if predictions[i] != y.iloc[i, 0]:  # misclassification
                        ds_error += d[i]
                if ds_error < min_error:
                    min_error = ds_error
                    h = ds

            # amount of say
            small = 1e-10
            alpha = 0.5 * np.log((1 - min_error) / (min_error + small))  # small avoids division by 0

            # storing the learner with its corresponding amount of say
            self.weak_learners.append((h, alpha))
            print(f'Added weak learner: decision stump on feature "{h.attribute}" '
                  f'with threshold {h.threshold} and error {min_error}')

            # updating and renormalising the example weights
            wl_predictions = h.predict(x)
            for i in range(len(d)):
                d[i] = d[i] * np.exp(-alpha * y.iloc[i, 0] * wl_predictions[i])
            z = np.sum(d)  # normalisation factor
            for i in range(len(d)):
                d[i] /= z

        return self  # BaseEstimator required

    def predict(self, x):
        ab_prediction = np.zeros(x.shape[0])  # weighted vote of the weak learners
        for wl, alpha in self.weak_learners:
            ab_prediction += wl.predict(x) * alpha
        ab_prediction = np.sign(ab_prediction)
        # casting float items to int
        return [int(p) for p in ab_prediction]

    # Used when plotting learning curves with scikit-learn
    def score(self, x, y):
        predictions = self.predict(x)
        # flatten y so the comparison is element-wise for a Series or a single-column DataFrame
        accuracy = 100 * (np.asarray(predictions) == np.ravel(y)).sum() / len(y)
        return accuracy
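I haven't included my DecisionStump class here; the AdaBoost code only relies on it exposing an attribute, a threshold and a predict() method that returns +1/-1 labels. A minimal sketch of that interface (just an illustration, not my actual implementation) would be:

import numpy as np

class DecisionStump:
    def __init__(self, attribute, threshold):
        self.attribute = attribute  # name of the feature column to split on
        self.threshold = threshold  # split point on that feature

    def predict(self, x):
        # +1 when the feature value is above the threshold, -1 otherwise
        return np.where(x[self.attribute].values > self.threshold, 1, -1)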
As you can see, I create a DecisionStump for every possible threshold of every attribute, so when I call fit() on a dataset containing features with many distinct values (e.g. people's heights) training takes far too long. I think I should use only a subset of all the possible decision stumps, but what criterion should I use to pick them? Or should I instead group the high-cardinality features into classes (e.g. class 1: height between 150 and 155, class 2: height between 155 and 160, ...)?
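To make that second idea concrete, this is roughly what I have in mind (a sketch only, not part of my code; the max_thresholds parameter and the quantile-based binning are assumptions on my side): cap the number of candidate thresholds per feature by taking evenly spaced quantiles instead of every midpoint.

import numpy as np

def candidate_thresholds(column, max_thresholds=20):
    # Sketch only: max_thresholds and the quantile binning are assumptions,
    # not something my current code does.
    values = np.unique(column)
    if len(values) <= max_thresholds + 1:
        # few distinct values: keep every midpoint, as in fit() above
        return (values[1:] + values[:-1]) / 2
    # many distinct values: use evenly spaced quantiles of the data as split points
    qs = np.linspace(0, 1, max_thresholds + 2)[1:-1]
    return np.unique(np.quantile(column, qs))

Inside fit() I would then build the candidate splits with thresholds = candidate_thresholds(x[a].values) instead of enumerating every midpoint. Would something like this be a reasonable criterion, or is there a more standard way to prune the set of stumps?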