I'm trying to implement my own version of a super learner in Python. Here is the code:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import neighbors
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy import optimize
from pandas.plotting import scatter_matrix
import numpy as np
import pandas as pd
class SuperLearner(BaseEstimator, RegressorMixin):
    def __init__(self, base_estimators):
        self.base_estimators = base_estimators
        self.meta_learner = linear_model.LinearRegression(positive=True)
        self.weights = None

    def rss(self, weights, X, y):
        # Residual sum of squares of the weighted combination of base predictions.
        y_pred = np.dot(X, weights)
        return np.sum((y - y_pred) ** 2)

    def constraint(self, weights):
        # Equality constraint: the weights must sum to one.
        return np.sum(weights) - 1

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        meta_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        # TODO: modify the number of folds depending on the number of base estimators and the size of the dataset
        kf = KFold(n_splits=5)
        # Build the out-of-fold prediction matrix: each column holds one base
        # estimator's predictions on the validation fold it was not trained on.
        for i, (train_idx, val_idx) in enumerate(kf.split(X)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            for j, estimator in enumerate(self.base_estimators):
                estimator.fit(X_train, y_train)
                meta_predictions[val_idx, j] = estimator.predict(X_val)

        # Method 1: constrained least squares (weights in [0, 1], summing to 1)
        # solved with SLSQP, starting from a uniform guess.
        guess = np.full(len(self.base_estimators), 1.0 / len(self.base_estimators))
        bounds = [(0, 1)] * len(self.base_estimators)
        result = optimize.minimize(self.rss, guess, args=(meta_predictions, y),
                                   method='SLSQP', bounds=bounds,
                                   constraints={'type': 'eq', 'fun': self.constraint})
        print(result.x, np.sum(result.x))

        # Method 2: plain non-negative least squares.
        result = optimize.nnls(meta_predictions, y)
        print(result[0], np.sum(result[0]))

        # Method 3: sklearn's LinearRegression with positive=True, with the
        # coefficients normalised afterwards so they sum to one.
        self.meta_learner.fit(meta_predictions, y)
        self.weights = self.meta_learner.coef_
        self.weights = self.weights / np.sum(self.weights)
        print(self.weights, np.sum(self.weights))
        return self

    def predict(self, X):
        check_is_fitted(self, 'meta_learner')
        X = check_array(X)
        base_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        # Each base estimator keeps the state from the last CV fold it was fitted on.
        for i, estimator in enumerate(self.base_estimators):
            base_predictions[:, i] = estimator.predict(X)
        return np.dot(base_predictions, self.weights)

def main():
    np.random.seed(100)
    X, y = datasets.make_friedman1(1000)
    ols = linear_model.LinearRegression()
    elastic = linear_model.ElasticNetCV()
    ridge = linear_model.RidgeCV()
    lars = linear_model.LarsCV()
    lasso = linear_model.LassoCV()
    knn = neighbors.KNeighborsRegressor()
    superLearner = SuperLearner([ols, elastic, ridge, lars, lasso, knn])
    superLearner.fit(X, y)
    y_pred = superLearner.predict(X)
    print("MSE: ", np.mean((y_pred - y) ** 2))


if __name__ == "__main__":
    main()
I use three different methods to estimate the weights each model should have in the final prediction. While scipy.optimize.nnls and the optimization I implemented with scipy.optimize.minimize produce similar results, sklearn's LinearRegression produces completely different ones. I even looked at the LinearRegression code on GitHub, and it seems to call the same SciPy function (scipy.optimize.nnls) when the positive parameter is set to True, as it is here. Does anyone know why?
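
In case it helps, here is a minimal, self-contained sketch that isolates the three weight estimators on the same toy prediction matrix (the names Z and y and the toy data are made up for illustration; in my code the matrix is the out-of-fold meta_predictions). Since LinearRegression fits an intercept by default, the sketch also toggles fit_intercept to see whether that alone changes the coefficients:

# Compare the three weight estimators on the same toy "base model" predictions.
# The two columns of Z are noisy copies of the target; purely illustrative data.
import numpy as np
from scipy import optimize
from sklearn import linear_model

rng = np.random.default_rng(0)
y = rng.normal(size=200)
Z = np.column_stack([y + rng.normal(scale=0.5, size=200),
                     y + rng.normal(scale=1.0, size=200)])

# 1) SLSQP with non-negativity bounds and a sum-to-one equality constraint.
rss = lambda w: np.sum((y - Z @ w) ** 2)
w0 = np.full(Z.shape[1], 1.0 / Z.shape[1])
res = optimize.minimize(rss, w0, method="SLSQP",
                        bounds=[(0, 1)] * Z.shape[1],
                        constraints={"type": "eq", "fun": lambda w: np.sum(w) - 1})
print("SLSQP:", res.x, res.x.sum())

# 2) Plain non-negative least squares (no intercept, no sum-to-one constraint).
w_nnls, _ = optimize.nnls(Z, y)
print("NNLS: ", w_nnls, w_nnls.sum())

# 3) LinearRegression(positive=True); fit_intercept defaults to True, so the
#    data are centered before the non-negative fit -- toggling it shows how
#    much of the difference comes from the intercept alone.
for fit_intercept in (True, False):
    lr = linear_model.LinearRegression(positive=True, fit_intercept=fit_intercept)
    lr.fit(Z, y)
    print(f"LinearRegression(fit_intercept={fit_intercept}):",
          lr.coef_, lr.coef_.sum())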