I'm trying to build a multiple linear regression model for the Boston housing dataset in scikit-learn.
I use Stochastic Gradient Descent (SGD) to optimize the model, and it seems like I have to use a very small learning rate (0.000000001) to make the model learn at all. If I use a bigger learning rate, the model fails to learn and the loss diverges to NaN or inf.
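For reference, here is a quick check of the raw feature scales (just a small sketch using the same load_boston data as in my code below, separate from the training code itself): the features span very different ranges, e.g. NOX stays below 1 while TAX goes into the hundreds.

from sklearn import datasets
import numpy as np

dataset = datasets.load_boston()
x = dataset.data
# print the minimum and maximum of each raw feature
for name, lo, hi in zip(dataset.feature_names, x.min(axis=0), x.max(axis=0)):
    print("{:>8s}: min = {:10.3f}, max = {:10.3f}".format(name, lo, hi))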
So, here are my questions:
- Is it okay to use such a small learning rate, or is there a problem in my code below?
- The validation loss seems to decrease, then increase for a while, and then decrease again. Is this a case where my model falls into overfitting but luckily escapes thanks to the instability of SGD compared to batch gradient descent? (There is a small combined-plot sketch after my code below.)
Here's my code:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
def loss(x, y, w):
    # root-mean-squared error (RMSE) of the linear model x @ w
    predict_y = x @ w
    return np.sqrt(np.mean(np.square(y - predict_y)))
def status(w):
    # print the current weights, the losses, and the first few predictions
    w_ = np.squeeze(w)
    print("w = [", end="")
    for i in range(14):  # bias + 13 features
        if i == 13:
            print(w_[i], end="]")
        else:
            print(w_[i], end=", ")
    print()
    training_loss = loss(training_x, training_y, w)
    validation_loss = loss(validation_x, validation_y, w)
    print("Training Loss = " + str(training_loss))
    print("Validation Loss = " + str(validation_loss))
    training_predict_y = training_x @ w
    validation_predict_y = validation_x @ w
    print("{:^40s}|{:^40s}".format("training", "validation"))
    print("{:^20s}{:^20s}|{:^20s}{:^20s}".format("predict_y", "true_y", "predict_y", "true_y"))
    for i in range(10):
        print("{:^20f}{:^20f}|{:^20f}{:^20f}".format(float(training_predict_y[i]), float(training_y[i]), float(validation_predict_y[i]), float(validation_y[i])))
    print()
def plot(title, data):
    plt.title(title)
    plt.plot(range(len(data)), data)
    plt.savefig(title + ".png", dpi=300)
    plt.show()
np.random.seed(2020) # for reproducibility
# data
dataset = datasets.load_boston()  # note: load_boston was removed in scikit-learn 1.2, so this needs an older version
x = dataset.data
y = dataset.target
# reformat the data
x_ = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)  # prepend a column of ones for the bias term (x0 = 1)
y_ = np.expand_dims(y, axis=1)
# divide the data into a training set (first 406 rows) and a validation set (last 100 rows)
training_x = x_[0:406, :]
training_y = y_[0:406, :]
validation_x = x_[406:506, :]
validation_y = y_[406:506, :]
# initialize w
w = np.random.rand(x_.shape[1], 1)
print("Before Training...")
status(w)
# hyperparameters
epochs = 100000
lr = 0.000000001  # = 1e-9; any larger value makes the loss diverge to NaN/inf
training_losses = []
validation_losses = []
data_num = training_x.shape[0]
for epoch in range(epochs):
    for i in range(data_num):
        # take one sample at a time (plain SGD, no shuffling)
        sample = training_x[i:i + 1, :]
        true_y = training_y[i:i + 1, :]
        predict_y = sample @ w
        # calculate the gradient of the squared error for this sample
        gradient = -(2 / sample.shape[0]) * sample.T @ (true_y - predict_y)
        # update w
        w = w - lr * gradient
    # record the losses once per epoch
    training_loss = loss(training_x, training_y, w)
    validation_loss = loss(validation_x, validation_y, w)
    training_losses.append(training_loss)
    validation_losses.append(validation_loss)
print("After Training...")
status(w)
plot("Training Loss - SGD", training_losses)
plot("Validation Loss - SGD", validation_losses)