I'm trying to implement stochastic gradient descent, and it works as long as the number of samples is greater than the number of features. Otherwise the loss diverges, as seen in the figures, where I compare my loss to that of scikit-learn's SGDRegressor.
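To make the intent of the code below explicit (the notation here is mine; $\eta$ is the learning rate l): for a single sample $i$ I update every weight $w_f$ with

$$\frac{\partial L_i}{\partial w_f} = -\bigl(y_i - w_f\,x_{i,f}\bigr)\,x_{i,f}, \qquad w \leftarrow w - \eta\,\nabla_w L_i,$$

and after each epoch I record the total squared error $\sum_i \bigl(y_i - w\cdot x_i\bigr)^2$ over the whole data set.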
This is my code:
import math
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
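# data is a 4-tuple; only data[2] (the targets y) and data[3] (the samples X) are used below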
def loss_prime_simple(w,node,feature,data):
    x = data[3]
    y = data[2]
    x_f = x[node][feature]
    y_node = y[node]
    return -(y_node - w[feature] * x_f) * x_f
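# one SGD step for a single sample: collect the derivative for every feature, then step w against it
# (predecs and children are not used here)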
def update_weights(w,data,predecs,children,node, learning_rate):
    len_features = len(data[3][0])
    w_new = np.zeros(len_features)
    for feature_ in range(len_features):
        w_new[feature_] = loss_prime_simple(w,node,feature_,data)
    return w - learning_rate * w_new
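# total squared error over the whole data set, only used for the plots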
def loss_simple2(w,data):
    y_p = data[2]
    x = data[3]
    return ((y_p - np.dot(w,np.array(x).T)) ** 2).sum()
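# run the given number of epochs over all samples and record the loss after each epoch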
def learn(data,l,features, iterations):
    a = []
    X = np.array(data[3])
    w = np.random.rand(features)
    for epoch in range(iterations):
        print(epoch)
        for j in range(X.shape[0]):
            w = update_weights(w,data,None,None,j, l)
            # update learning rate
            #learning_rate = learning_rate_0 * (1 + learning_rate_0 * gamma * ( epoch + len(X_standardized) * epoch)) ** (-1)
        a.append( loss_simple2(w, data))
    return a
fig, ax = plt.subplots(2,2)
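# first case: 1000 samples, 496 features (more samples than features)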
X = np.random.randn(1000, 496)
y = np.random.randn(1000)
data = None, None, y, X
l = math.pow(10,-4)
a = learn(data,l,496,200)
ax[0,0].plot(a)
ax[0,0].set_title('my SGD features < samples, learning rate =%s'%l)
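# same data and learning rate with scikit-learn's SGDRegressor, one partial_fit call per sample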
b = []
clf = linear_model.SGDRegressor(learning_rate='constant', eta0=l, shuffle=False)
for epoch in range(200):
    for x,y_i in zip(X,y):
        clf.partial_fit(x.reshape(1, -1),[y_i])
    b.append(((y - clf.predict(X))**2).sum())
ax[0,1].plot(b)
ax[0,1].set_title('Sklearn features < samples, learning rate =%s'%l)
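# second case: only 10 samples for the same 496 features (fewer samples than features)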
X = np.random.randn(10, 496)
y = np.random.randn(10)
data = None, None, y, X
a = learn(data,l,496,1000)
ax[1,0].plot(a)
ax[1,0].set_title('my SGD features > samples, learning rate =%s'%l)
b = []
clf = linear_model.SGDRegressor(learning_rate='constant', eta0=l, shuffle=False)
for epoch in range(1000):
    for x,y_i in zip(X,y):
        clf.partial_fit(x.reshape(1, -1),[y_i])
    b.append(((y - clf.predict(X))**2).sum())
ax[1,1].plot(b)
ax[1,1].set_title('Sklearn features > samples, learning rate =%s'%l)
plt.show()
What am I doing wrong?