I am trying to train a multivariate linear regression model with regularization using TensorFlow. For some reason I cannot get the training step in the code below to reduce the error it should be using for the gradient descent update. Am I doing something wrong in setting up my graph?
import numpy as np
import tensorflow as tf

def normalize_data(matrix):
    # Mean-normalize each feature column and scale by its range
    averages = np.average(matrix, 0)
    mins = np.min(matrix, 0)
    maxes = np.max(matrix, 0)
    ranges = maxes - mins
    return (matrix - averages) / ranges
def run_regression(X, Y, X_test, Y_test, lambda_value=0.1, normalize=False, batch_size=10):
    x_train = normalize_data(X) if normalize else X
    y_train = Y
    x_test = X_test
    y_test = Y_test
    session = tf.Session()

    # Calculate number of features for X and Y
    x_features_length = len(X[0])
    y_features_length = len(Y[0])

    # Build TensorFlow graph parts
    x = tf.placeholder('float', [None, x_features_length], name="X")
    y = tf.placeholder('float', [None, y_features_length], name="Y")
    theta = tf.Variable(tf.random_normal([x_features_length, y_features_length], stddev=0.01), name="Theta")
    lambda_val = tf.constant(lambda_value)

    # Trying to implement it this way: http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=MachineLearning&doc=exercises/ex5/ex5.html
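    # As I understand that exercise, the regularized cost I am after is
    # (with m = batch size):
    #   J(theta) = (1 / (2*m)) * (sum((X.theta - y)^2) + lambda * sum(theta^2))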
    y_predicted = tf.matmul(x, theta, name="y_predicted")
    regularization_cost_part = tf.cast(tf.mul(lambda_val, tf.reduce_sum(tf.pow(theta, 2)), name="regularization_param"), 'float')
    polynomial_cost_part = tf.reduce_sum(tf.pow(tf.sub(y_predicted, y), 2), name="polynomial_sum")

    # Set up some summary info to debug
    with tf.name_scope('cost') as scope:
        cost_func = tf.mul(tf.cast(1/(2*batch_size), 'float'), tf.cast(tf.add(polynomial_cost_part, regularization_cost_part), 'float'))
        cost_summary = tf.scalar_summary("cost", cost_func)

    training_func = tf.train.GradientDescentOptimizer(0.03).minimize(cost_func)

    with tf.name_scope("test") as scope:
        correct_prediction = tf.sub(tf.cast(1, 'float'), tf.reduce_mean(tf.sub(y_predicted, y)))
        accuracy = tf.cast(correct_prediction, "float")
        accuracy_summary = tf.scalar_summary("accuracy", accuracy)

    saver = tf.train.Saver()
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("/tmp/football_logs", session.graph_def)
    init = tf.initialize_all_variables()
    session.run(init)
    for i in range(0, len(x_train) / batch_size):
        session.run(training_func, feed_dict={x: x_train[i*batch_size:i*batch_size+batch_size],
                                              y: y_train[i*batch_size:i*batch_size+batch_size]})
        if i % batch_size == 0:
            result = session.run([merged, accuracy], feed_dict={x: x_test, y: y_test})
            writer.add_summary(result[0], i)
            print "step %d, training accuracy %g" % (i, result[1])

    print "test accuracy %g" % session.run(accuracy, feed_dict={x: x_test, y: y_test})
    save_path = saver.save(session, "/tmp/football.ckpt")
    print "Model saved in file: ", save_path
    session.close()
My output looks like this:
step 0, training accuracy 39.1802
step 10, training accuracy 39.1802
step 20, training accuracy 39.1802
...
step 210, training accuracy 39.1802
test accuracy 39.1802
Model saved in file: /tmp/football.ckpt
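For reference, this is roughly what I expect cost_func to compute. Here is a minimal NumPy sketch of the same regularized cost on a toy batch (the array values are made up, not my real data):

import numpy as np

# Toy batch: 4 examples, 2 input features, 1 output (made-up values)
X_batch = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 4.0], [4.0, 3.0]])
y_batch = np.array([[3.1], [2.9], [7.2], [6.8]])
theta = np.array([[1.0], [1.0]])   # hypothetical weights
lambda_value = 0.1
m = len(X_batch)                   # batch size

residuals = X_batch.dot(theta) - y_batch
cost = (1.0 / (2 * m)) * (np.sum(residuals ** 2) + lambda_value * np.sum(theta ** 2))
print "expected cost: %g" % cost   # I expect the graph's cost_func to match this

The value should shrink as theta improves, but in my TensorFlow run the reported number never changes at all.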