
I need some help or an idea about what is going wrong in my code. I am trying to implement an SGD regressor with L2 regularization, but the bias in my model grows to a very large value whenever alpha is above 10, even though the data are generated with a true intercept of 2. I suspect something is wrong with the gradient calculation.
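For reference, this is the per-sample update I am trying to implement (my own derivation, assuming the half squared-error loss ½(Xᵢθ − yᵢ)² with an L2 penalty that excludes the bias term):

∇J(θ) = Xᵢᵀ(Xᵢθ − yᵢ) + α·[0, θ₁, …, θₙ]ᵀ
θ ← θ − η·∇J(θ)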

This is my SGD class:

class SGDRegression1():
    def __init__(self, n_iter, eta0, seed=None, alpha=0.0) -> None:
        self.n_iter = n_iter
        self.eta0 = eta0
        self.alpha = alpha
        # default_rng(None) seeds from the OS, so self.rng always exists
        self.rng = np.random.default_rng(seed=seed)

    def __add_bias(self, X):
        # prepend a column of ones for the intercept term
        return np.c_[np.ones((X.shape[0], 1)), X]
  
    def learning_schedule_optimal(self, t):
        # decaying step size: eta = 1 / (t + eta0); with eta0 = 0.01 this starts near 1.0
        return 1 / (t + self.eta0)

    def fit(self, X:np.ndarray, y):
        X_b = self.__add_bias(X)
        
        # random initialization of theta = [bias, weights]
        self.theta = self.rng.standard_normal((X_b.shape[1], 1))

        m = len(X_b)
        for epoch in range(self.n_iter):
            for iteration in range(m):
                # pick one training sample at random
                sample_index = self.rng.integers(m)
                Xi = X_b[sample_index: sample_index+1]
                yi = y[sample_index: sample_index+1]

                # L2 penalty; zero the first component so the bias is not regularized
                l2 = self.alpha * self.theta
                l2[0] = 0
                gradient = Xi.T @ (Xi @ self.theta - yi) + l2

                eta = self.learning_schedule_optimal(epoch * m + iteration + 1)

                self.theta -= eta * gradient

        self.intercept_ = self.theta[0]
        self.coef_ = self.theta[1:]

    def predict(self, X):
        return self.__add_bias(X) @ self.theta
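
For comparison, my understanding from the scikit-learn docs is that SGDRegressor's 'optimal' schedule divides the step size by alpha, which my schedule does not. A minimal sketch of that schedule (t0 is a heuristic constant; this is not sklearn's actual code):

def sklearn_optimal_schedule(t, alpha, t0):
    # scikit-learn's 'optimal' schedule: eta = 1.0 / (alpha * (t + t0)),
    # where t0 is chosen by a heuristic proposed by Leon Bottou
    return 1.0 / (alpha * (t + t0))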

Here is the calling code and setup:

import numpy as np
import matplotlib.pyplot as plt

m = 100  # sample size (not defined in my original snippet; any value works)

X = 6 * np.random.rand(m, 1) - 3
# true model: y = 0.25 * x + 2 + noise, so the fitted intercept should be near 2
y = 0.25 * X + np.random.randn(m, 1) + 2

X_test = np.linspace(-3, 3, m).reshape(m, 1)

plt.plot(X, y, "g.", label="Training data")
plt.legend()

sgd = SGDRegression1(n_iter=1, eta0=0.01, seed=42, alpha=10)
sgd.fit(X, y)
print(sgd.coef_, sgd.intercept_)

y_pred = sgd.predict(X_test)

plt.plot(X_test, y_pred, "b-")

plt.grid(True)
plt.show()

I tried changing the learning schedule and the GD update rule, but it made no difference.

How does scikit-learn compute the gradient descent update with an intercept (offset) and L2 regularization?
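
For reference, this is the scikit-learn model I am comparing against (a sketch, assuming the default squared-error loss; alpha and eta0 match my class above):

from sklearn.linear_model import SGDRegressor

# one pass over the data, matching n_iter=1 above (sklearn may warn about convergence)
sk_sgd = SGDRegressor(penalty="l2", alpha=10, eta0=0.01,
                      max_iter=1, random_state=42)
sk_sgd.fit(X, y.ravel())  # sklearn expects y as a 1-D array
print(sk_sgd.coef_, sk_sgd.intercept_)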
