As part of my homework I was asked to implement stochastic gradient descent to solve a linear regression problem (even though I have only 200 training examples). My problem is that stochastic gradient descent converges too smoothly, almost exactly like batch gradient descent, which brings me to my question: why does it look so smooth, given that it is usually much noisier? Is it because I use only 200 examples?
Convergence plots:
MSE with weights from stochastic gradient descent: 2.78441258841
MSE with weights from gradient descent: 2.78412631451 (identical to MSE with weights from normal equation)
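For reference, the loss and the two update rules I'm implementing below should be (with $X$ the full design matrix, $x_k$ a single row, $\eta$ the learning rate):

$$\mathrm{MSE}(w) = \frac{1}{n}\sum_{i=1}^{n}\bigl(y_i - x_i^{\top}w\bigr)^2$$

$$\text{batch step:}\quad w \leftarrow w - \eta\,\frac{2}{n}\,X^{\top}(Xw - y)$$

$$\text{stochastic step on a random index } k:\quad w \leftarrow w - \eta\,\frac{2}{n}\,x_k\bigl(x_k^{\top}w - y_k\bigr)$$

(note that my stochastic step keeps the same $\frac{2}{n}$ factor as the batch gradient).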
My code:
import numpy as np

def mserror(y, y_pred):
    # mean squared error between targets and predictions
    n = y.size
    diff = y - y_pred
    diff_squared = diff ** 2
    return float(np.sum(diff_squared)) / n

def linear_prediction(X, w):
    # predictions of the linear model, X w
    return np.dot(X, np.transpose(w))

def gradient_descent_step(X, y, w, eta):
    # full-batch gradient of the MSE: (2/n) * X^T (Xw - y)
    n = X.shape[0]
    grad = (2.0 / n) * np.sum(np.transpose(X) * (linear_prediction(X, w) - y), axis=1)
    return w - eta * grad

def stochastic_gradient_step(X, y, w, train_ind, eta):
    # gradient estimated from the single example train_ind; note that it is
    # still scaled by 1/n, i.e. by the size of the whole training set
    n = X.shape[0]
    grad = (2.0 / n) * np.transpose(X[train_ind]) * (linear_prediction(X[train_ind], w) - y[train_ind])
    return w - eta * grad

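Just for contrast (this is not what my assignment asks for, only an illustration of what I expected "noisy" SGD to look like), a hypothetical per-example step that drops the $1/n$ scaling would be:

def stochastic_gradient_step_unscaled(X, y, w, train_ind, eta):
    # hypothetical variant: gradient of the chosen example's own squared error,
    # i.e. factor 2 instead of 2/n
    grad = 2.0 * np.transpose(X[train_ind]) * (linear_prediction(X[train_ind], w) - y[train_ind])
    return w - eta * grad
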
def gradient_descent(X, y, w_init, eta, max_iter):
    w = w_init
    errors = [mserror(y, linear_prediction(X, w))]
    for i in range(max_iter):
        w = gradient_descent_step(X, y, w, eta)
        errors.append(mserror(y, linear_prediction(X, w)))
    return w, errors

def stochastic_gradient_descent(X, y, w_init, eta, max_iter):
    n = X.shape[0]
    w = w_init
    errors = [mserror(y, linear_prediction(X, w))]
    for i in range(max_iter):
        # pick one random training example per iteration
        random_ind = np.random.randint(n)
        w = stochastic_gradient_step(X, y, w, random_ind, eta)
        errors.append(mserror(y, linear_prediction(X, w)))
    return w, errors
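For completeness, a minimal driver along these lines reproduces my setup; the synthetic data, eta and max_iter below are just placeholders, not the assignment's actual data or values:

# minimal sketch with synthetic data; shapes, eta and max_iter are placeholders
np.random.seed(0)
n_samples, n_features = 200, 4
X = np.hstack([np.ones((n_samples, 1)),                 # bias column
               np.random.randn(n_samples, n_features - 1)])
true_w = np.array([1.0, 2.0, -3.0, 0.5])
y = X.dot(true_w) + 0.5 * np.random.randn(n_samples)

w_init = np.zeros(n_features)
w_gd, gd_errors = gradient_descent(X, y, w_init, eta=0.01, max_iter=1000)
w_sgd, sgd_errors = stochastic_gradient_descent(X, y, w_init, eta=0.01, max_iter=10000)

print(mserror(y, linear_prediction(X, w_gd)))
print(mserror(y, linear_prediction(X, w_sgd)))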