I'm trying to implement stochastic gradient descent, and it works as long as the number of samples is greater than the number of features. Otherwise the loss diverges, as seen in the figures, where I compare my loss to that of scikit-learn's SGDRegressor.
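To make the intent of the code below explicit (the notation here is mine; $\eta$ is the learning rate l): for a single sample $i$ I update every weight $w_f$ with

$$\frac{\partial L_i}{\partial w_f} = -\bigl(y_i - w_f\,x_{i,f}\bigr)\,x_{i,f}, \qquad w \leftarrow w - \eta\,\nabla_w L_i,$$

and after each epoch I record the total squared error $\sum_i \bigl(y_i - w\cdot x_i\bigr)^2$ over the whole data set.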
This is my code:
import math
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
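# data is a 4-tuple; only data[2] (the targets y) and data[3] (the samples X) are used below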
def loss_prime_simple(w,node,feature,data):
    x = data[3]
    y = data[2]
    x_f = x[node][feature]
    y_node = y[node]
    return -(y_node - w[feature] * x_f) * x_f
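# one SGD step for a single sample: collect the derivative for every feature, then step w against it
# (predecs and children are not used here)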
def update_weights(w,data,predecs,children,node, learning_rate):
    len_features = len(data[3][0])
    w_new = np.zeros(len_features)
    for feature_ in range(len_features):
        w_new[feature_] = loss_prime_simple(w,node,feature_,data)
    return w - learning_rate * w_new
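# total squared error over the whole data set, only used for the plots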
def loss_simple2(w,data):
    y_p = data[2]
    x = data[3]
    return ((y_p - np.dot(w,np.array(x).T)) ** 2).sum()
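# run the given number of epochs over all samples and record the loss after each epoch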
def learn(data,l,features, iterations):
    a = []
    X = np.array(data[3])
    w = np.random.rand(features)
    for epoch in range(iterations):
        print(epoch)
        for j in range(X.shape[0]):
            w = update_weights(w,data,None,None,j, l)
            # update learning rate
            #learning_rate = learning_rate_0 * (1 + learning_rate_0 * gamma * ( epoch + len(X_standardized) * epoch)) ** (-1)
        a.append( loss_simple2(w, data))
    return a
fig, ax = plt.subplots(2,2)
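# first case: 1000 samples, 496 features (more samples than features)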
X = np.random.randn(1000, 496)
y = np.random.randn(1000)
data = None, None, y, X
l = math.pow(10,-4)
a = learn(data,l,496,200)
ax[0,0].plot(a)
ax[0,0].set_title('my SGD features < samples, learning rate =%s'%l)
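# same data and learning rate with scikit-learn's SGDRegressor, one partial_fit call per sample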
b = []
clf = linear_model.SGDRegressor(learning_rate='constant', eta0=l, shuffle=False)
for epoch in range(200):
    for x,y_i in zip(X,y):
        clf.partial_fit(x.reshape(1, -1),[y_i])
    b.append(((y - clf.predict(X))**2).sum())
ax[0,1].plot(b)
ax[0,1].set_title('Sklearn features < samples, learning rate =%s'%l)
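# second case: only 10 samples for the same 496 features (fewer samples than features)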
X = np.random.randn(10, 496)
y = np.random.randn(10)
data = None, None, y, X
a = learn(data,l,496,1000)
ax[1,0].plot(a)
ax[1,0].set_title('my SGD features > samples, learning rate =%s'%l)
b = []
clf = linear_model.SGDRegressor(learning_rate='constant', eta0=l, shuffle=False)
for epoch in range(1000):
    for x,y_i in zip(X,y):
        clf.partial_fit(x.reshape(1, -1),[y_i])
    b.append(((y - clf.predict(X))**2).sum())
ax[1,1].plot(b)
ax[1,1].set_title('Sklearn features > samples, learning rate =%s'%l)
plt.show()
What am I doing wrong?