
I need some help or an idea about what is going wrong in my code. I am trying to implement an SGD regressor with L2 regularization, but the bias in my model grows to a very large value whenever alpha is above 10, even though the data are generated with a true intercept of 2. I suspect something is wrong with the gradient calculation.
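For reference, this is the per-sample update I am trying to implement (my own derivation, assuming the half squared-error loss ½(Xᵢθ − yᵢ)² with an L2 penalty that excludes the bias term):

∇J(θ) = Xᵢᵀ(Xᵢθ − yᵢ) + α·[0, θ₁, …, θₙ]ᵀ
θ ← θ − η·∇J(θ)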

This is my SGD class:

class SGDRegression1():
    def __init__(self, n_iter, eta0, seed=None, alpha=0.0) -> None:
        self.n_iter = n_iter
        self.eta0 = eta0
        self.alpha = alpha
        # default_rng(None) seeds from the OS, so self.rng always exists
        self.rng = np.random.default_rng(seed=seed)

    def __add_bias(self, X):
        # prepend a column of ones for the intercept term
        return np.c_[np.ones((X.shape[0], 1)), X]
  
    def learning_schedule_optimal(self, t):
        # decaying step size: eta = 1 / (t + eta0); with eta0 = 0.01 this starts near 1.0
        return 1 / (t + self.eta0)

    def fit(self, X:np.ndarray, y):
        X_b = self.__add_bias(X)
        
        # random initialization of theta = [bias, weights]
        self.theta = self.rng.standard_normal((X_b.shape[1], 1))

        m = len(X_b)
        for epoch in range(self.n_iter):
            for iteration in range(m):
                # pick one training sample at random
                sample_index = self.rng.integers(m)
                Xi = X_b[sample_index: sample_index+1]
                yi = y[sample_index: sample_index+1]

                # L2 penalty; zero the first component so the bias is not regularized
                l2 = self.alpha * self.theta
                l2[0] = 0
                gradient = Xi.T @ (Xi @ self.theta - yi) + l2

                eta = self.learning_schedule_optimal(epoch * m + iteration + 1)

                self.theta -= eta * gradient

        self.intercept_ = self.theta[0]
        self.coef_ = self.theta[1:]

    def predict(self, X):
        return self.__add_bias(X) @ self.theta
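
For comparison, my understanding from the scikit-learn docs is that SGDRegressor's 'optimal' schedule divides the step size by alpha, which my schedule does not. A minimal sketch of that schedule (t0 is a heuristic constant; this is not sklearn's actual code):

def sklearn_optimal_schedule(t, alpha, t0):
    # scikit-learn's 'optimal' schedule: eta = 1.0 / (alpha * (t + t0)),
    # where t0 is chosen by a heuristic proposed by Leon Bottou
    return 1.0 / (alpha * (t + t0))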

Here is the calling code and setup:

import numpy as np
import matplotlib.pyplot as plt

m = 100  # sample size (not defined in my original snippet; any value works)

X = 6 * np.random.rand(m, 1) - 3
# true model: y = 0.25 * x + 2 + noise, so the fitted intercept should be near 2
y = 0.25 * X + np.random.randn(m, 1) + 2

X_test = np.linspace(-3, 3, m).reshape(m, 1)

plt.plot(X, y, "g.", label="Training data")
plt.legend()

sgd = SGDRegression1(n_iter=1, eta0=0.01, seed=42, alpha=10)
sgd.fit(X, y)
print(sgd.coef_, sgd.intercept_)

y_pred = sgd.predict(X_test)

plt.plot(X_test, y_pred, "b-")

plt.grid(True)
plt.show()

I tried changing the learning schedule and the GD update rule, but it made no difference.

How does scikit-learn compute the gradient descent update with an intercept (offset) and L2 regularization?
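
For reference, this is the scikit-learn model I am comparing against (a sketch, assuming the default squared-error loss; alpha and eta0 match my class above):

from sklearn.linear_model import SGDRegressor

# one pass over the data, matching n_iter=1 above (sklearn may warn about convergence)
sk_sgd = SGDRegressor(penalty="l2", alpha=10, eta0=0.01,
                      max_iter=1, random_state=42)
sk_sgd.fit(X, y.ravel())  # sklearn expects y as a 1-D array
print(sk_sgd.coef_, sk_sgd.intercept_)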
