
I'm trying to implement my own version of a super learner in Python. Here is the code:

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import neighbors
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy import optimize
from pandas.plotting import scatter_matrix
import numpy as np
import pandas as pd 

class SuperLearner(BaseEstimator, RegressorMixin):
    
    def __init__(self, base_estimators):
        self.base_estimators = base_estimators
        # Meta-learner that combines the out-of-fold predictions;
        # positive=True constrains its coefficients to be non-negative.
        self.meta_learner = linear_model.LinearRegression(positive=True)
        # self.weights is only set in fit(), so check_is_fitted can
        # detect an unfitted model in predict().

        
    def rss(self, weights, X, y):
        # Residual sum of squares of the combined prediction X @ weights.
        y_pred = np.dot(X, weights)
        return np.sum((y - y_pred)**2)
    
    def constraint(self, weights):
        # Equality constraint for SLSQP: the mixing weights must sum to one.
        return np.sum(weights) - 1
    
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        
        meta_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        #TODO: modify the number of folds depending on the number of base estimators and the size of the dataset
        kf = KFold(n_splits=5)        
        
        # Out-of-fold predictions: each base estimator is trained on k-1 folds
        # and predicts the held-out fold, so meta_predictions is leakage-free.
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            for j, estimator in enumerate(self.base_estimators):
                estimator.fit(X_train, y_train)
                meta_predictions[val_idx, j] = estimator.predict(X_val)
        
        # Start from equal weights; np.empty would leave the initial guess
        # uninitialized, which can send SLSQP off from garbage values.
        guess = np.full(len(self.base_estimators), 1 / len(self.base_estimators))
        bounds = [(0, 1)] * len(self.base_estimators)
        
        result = optimize.minimize(self.rss, guess, args=(meta_predictions, y),
                                   method='SLSQP', bounds=bounds,
                                   constraints={'type': 'eq', 'fun': self.constraint})
        print(result.x, np.sum(result.x))
        result = optimize.nnls(meta_predictions, y)
        print(result[0], np.sum(result[0]))
        
        self.meta_learner.fit(meta_predictions, y)
        self.weights = self.meta_learner.coef_
        self.weights = self.weights / np.sum(self.weights)
        
        print(self.weights, np.sum(self.weights))
        
        # Refit every base estimator on the full dataset; otherwise predict()
        # would use models trained only on the last CV fold.
        for estimator in self.base_estimators:
            estimator.fit(X, y)
        
        return self
    
    def predict(self, X):
        check_is_fitted(self, 'weights')
        X = check_array(X)
        
        base_predictions = np.zeros((X.shape[0], len(self.base_estimators)), dtype=np.float64)
        for i, estimator in enumerate(self.base_estimators):
            base_predictions[:, i] = estimator.predict(X)
            
        return np.dot(base_predictions, self.weights)

def main():
    np.random.seed(100)
    X, y = datasets.make_friedman1(1000)
    
    ols = linear_model.LinearRegression()
    elastic = linear_model.ElasticNetCV()
    ridge = linear_model.RidgeCV()
    lars = linear_model.LarsCV()
    lasso = linear_model.LassoCV()
    knn = neighbors.KNeighborsRegressor()
    
    superLearner = SuperLearner([ols, elastic, ridge, lars, lasso, knn])
    
    superLearner.fit(X, y)
    y_pred = superLearner.predict(X)
    
    print("MSE: ", np.mean((y_pred - y)**2))
    
    
    
if __name__ == "__main__":
    main()

I use three different methods to evaluate the weights every model should have in the final prediction. While scipy.nnls and the optimization method I implemented with scipy.minimize produce similar results, the LinearRegression of sklearn produces completely different ones. I even looked at the LinearRegression code on GitHub, and it seems to call the same scipy function (scipy.nnls) when the positive parameter is set to True, as it is in this case. Does anyone know why?
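
Edit: here is a minimal sketch of the comparison, stripped of the super learner. My guess (from reading the source) is that the intercept handling matters: LinearRegression defaults to fit_intercept=True, which centers X and y before the internal nnls call, so the non-negativity constraint is applied to a different least-squares problem. The data and random_state below are just for illustration:

import numpy as np
from scipy import optimize
from sklearn import datasets, linear_model

X, y = datasets.make_friedman1(1000, random_state=100)

# Raw non-negative least squares: no intercept anywhere.
w_nnls, _ = optimize.nnls(X, y)

# Default sklearn: fit_intercept=True centers X and y before calling nnls.
lr_centered = linear_model.LinearRegression(positive=True).fit(X, y)

# Disabling the intercept should reproduce the raw nnls coefficients.
lr_raw = linear_model.LinearRegression(positive=True, fit_intercept=False).fit(X, y)

print(w_nnls)
print(lr_centered.coef_)  # differs: solved on the centered data
print(lr_raw.coef_)       # should match w_nnls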

  • Try to use `scipy.linalg.lstsq` or `numpy.linalg.lstsq`... Check: https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.lstsq.html#scipy.linalg.lstsq or https://numpy.org/devdocs/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq – Joao_PS May 13 '23 at 11:56
  • @Joao_PS But I need the coefficients of the regression to be non-negative. This can be enforced in sklearn by setting the positive parameter to True; with nnls it is true by default. If I use the function you suggest, I expect to get some negative coefficients, which I don't want. My question, however, is why I get different results from scipy.nnls and the sklearn LinearRegression even though the latter internally calls scipy.nnls (see the sketch below). – Dragos Tanasa May 13 '23 at 14:57
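
A quick check of the point raised in the comments: unconstrained least squares is free to return negative coefficients, while nnls is not. A minimal sketch on the same Friedman #1 data (whether the unconstrained fit actually goes negative here depends on the irrelevant noise features in this dataset):

import numpy as np
from scipy import optimize
from sklearn import datasets

X, y = datasets.make_friedman1(1000, random_state=100)

w_ls, *_ = np.linalg.lstsq(X, y, rcond=None)  # unconstrained: entries may be negative
w_nn, _ = optimize.nnls(X, y)                 # constrained: every entry is >= 0

print("any negative (lstsq):", (w_ls < 0).any())
print("any negative (nnls): ", (w_nn < 0).any())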
