
I'm trying to build a siamese neural network for human pose estimation based on Hash matching.

The basic concept and references are described in my previous post.

I fixed the "no gradients provided for any variable" error, but found that my model's loss value was not improving no matter how long I trained.

I saved checkpoints at the 100th, 10,000th and 500,000th iterations, and the restored models all have the same magnitude of loss.

Currently I see two possible causes of this problem:

1. Bad loss function design:

The design was borrowed from OpenPose: the loss is the difference between the "distance map" and the "label map".

Unlike the OpenPose project, the "distance map" is obtained by computing Hamming distances to the "Hint Hash". This process contains many unconventional operations and a binarization step, which may make the model untrainable (see the first sketch after this list).

2. Wrong training loop design:

During training, I found that every saved checkpoint file was named "XXXX.ckpt.data-00000-of-00001"; the XXXX part changes, but the suffix "00000-of-00001" keeps reappearing.

I suspect there is something wrong in my model or training loop that causes it to repeat the first step of training (though see the note on checkpoint naming after this list).
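
On cause 1, here is a minimal sketch (assuming TF 1.x) of why I think hard binarization can block training: the gradient registered for tf.sign is zero everywhere, so a loss built only from hard-binarized codes sends no gradient back to the convolution weights.

import tensorflow as tf

# toy check: the gradient of tf.sign is zero everywhere
x = tf.placeholder(tf.float32, [None])
code = tf.sign(tf.sigmoid(x))                       # same hard binarization pattern as in the model below
toy_distance = tf.reduce_sum(tf.abs(code - 1.0))    # toy "Hamming-style" distance to an all-ones hint
grad = tf.gradients(toy_distance, x)[0]

with tf.Session() as sess:
    print(sess.run(grad, feed_dict={x: [-2.0, -0.1, 0.3, 5.0]}))   # prints all zeros

On cause 2: as far as I can tell, the "data-00000-of-00001" part is not an iteration counter but the shard index that tf.train.Saver appends to every checkpoint's data file (shard 0 of 1 total), so it reappears on every save; passing global_step to saver.save() would put the step number into the filename instead (e.g. model.ckpt-100.data-00000-of-00001).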

I'm still trying to find resources to fix this problem; any comment may help me a lot!

Here's the code:

import tensorflow as tf
import numpy as np
import time
from imageLoader import getPaddedROI,training_data_feeder
import math
import cv2

tf.reset_default_graph()

def truncated_normal_var(name,shape,dtype):
    return(tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.truncated_normal_initializer(stddev=0.01)))
def zero_var(name,shape,dtype):
    return(tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.constant_initializer(0.0)))

roi_size = 23
image_input_size = 301

#input placeholders
#batch1 hints
inputs_b1h1 = tf.placeholder(tf.float32, ( 16, roi_size, roi_size, 3), name='inputs_b1h1')
#inputs_b1h2 = tf.placeholder(tf.float32, ( 16, roi_size, roi_size, 3), name='inputs_b1h2')

inputs_s = tf.placeholder(tf.float32, (None, image_input_size, image_input_size, 3), name='inputs_s')
labels = tf.placeholder(tf.float32,(16,76,76), name='labels')

#define the model

def paraNet(inputs, inputs_s , ground_truth_labels ):
    with tf.variable_scope('conv'):
        out_l1 = tf.layers.conv2d(inputs, 16, [3, 3],strides=(2, 2), padding ='valid' ,name='para_conv_1')
        out_l1r = tf.nn.relu(out_l1)
        out_l2 = tf.layers.conv2d(out_l1r, 48, [3, 3],strides=(2, 2), padding ='valid' ,name='para_conv_2')
        out_l2r = tf.nn.relu(out_l2)
        out_l3 = tf.layers.conv2d(out_l2r, 96, [5, 5],strides=(1, 1), padding ='valid' ,name='para_conv_3')
        out_l3r = tf.nn.relu(out_l3)
        out_l4 = tf.layers.conv2d(out_l3r, 32, [1, 1],strides=(1, 1), padding ='valid' ,name='para_conv_4')
        hint = tf.squeeze(  tf.sign( tf.sigmoid(out_l4) ) )

    with tf.variable_scope('conv', reuse=tf.AUTO_REUSE ):
        out_2_l1 = tf.layers.conv2d(inputs_s,  16, [3, 3],strides=(2, 2), padding ='same' ,name='para_conv_1')
        out_2_l1r = tf.nn.relu(out_2_l1)
        out_2_l2 = tf.layers.conv2d(out_2_l1r, 48, [3, 3],strides=(2, 2), padding ='same' ,name='para_conv_2')
        out_2_l2r = tf.nn.relu(out_2_l2)
        out_2_l3 = tf.layers.conv2d(out_2_l2r, 96, [5, 5],strides=(1, 1), padding ='same' ,name='para_conv_3')
        out_2_l3r = tf.nn.relu(out_2_l3)
        out_2_l4 = tf.layers.conv2d(out_2_l3r, 32, [1, 1],strides=(1, 1), padding ='same' ,name='para_conv_4')
        sample =tf.sign( tf.sigmoid(out_2_l4))

    map0 = tf.reduce_sum ( tf.abs (tf.subtract( hint[0] , sample ) ) , axis=3 )  
    map1 = tf.reduce_sum ( tf.abs (tf.subtract( hint[1] , sample ) ) , axis=3 )  
    map2 = tf.reduce_sum ( tf.abs (tf.subtract( hint[2] , sample ) ) , axis=3 )  
    map3 = tf.reduce_sum ( tf.abs (tf.subtract( hint[3] , sample ) ) , axis=3 )  
    map4 = tf.reduce_sum ( tf.abs (tf.subtract( hint[4] , sample ) ) , axis=3 )  
    map5 = tf.reduce_sum ( tf.abs (tf.subtract( hint[5] , sample ) ) , axis=3 )  
    map6 = tf.reduce_sum ( tf.abs (tf.subtract( hint[6] , sample ) ) , axis=3 )  
    map7 = tf.reduce_sum ( tf.abs (tf.subtract( hint[7] , sample ) ) , axis=3 )  
    map8 = tf.reduce_sum ( tf.abs (tf.subtract( hint[8] , sample ) ) , axis=3 )  
    map9 = tf.reduce_sum ( tf.abs (tf.subtract( hint[9] , sample ) ) , axis=3 )  
    map10 = tf.reduce_sum ( tf.abs (tf.subtract( hint[10] , sample ) ) , axis=3 )  
    map11 = tf.reduce_sum ( tf.abs (tf.subtract( hint[11] , sample ) ) , axis=3 )  
    map12 = tf.reduce_sum ( tf.abs (tf.subtract( hint[12] , sample ) ) , axis=3 )  
    map13 = tf.reduce_sum ( tf.abs (tf.subtract( hint[13] , sample ) ) , axis=3 )  
    map14 = tf.reduce_sum ( tf.abs (tf.subtract( hint[14] , sample ) ) , axis=3 )  
    map15 = tf.reduce_sum ( tf.abs (tf.subtract( hint[15] , sample ) ) , axis=3 )  

    totoal_map =tf.div( tf.concat([map0, map1, map2, map3, map4, map5, map6, map7,
                               map8, map9, map10,map11,map12, map13, map14, map15], 0) , 64)
    loss = tf.nn.l2_loss( totoal_map -  ground_truth_labels , name = 'loss'  )

    return loss, totoal_map

loss, totoal_map = paraNet(inputs_b1h1, inputs_s, labels)
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

init =  tf.global_variables_initializer()

saver = tf.train.Saver()

with tf.Session() as sess:    
    #writer = tf.summary.FileWriter("./variable_graph",graph = sess.graph)
    sess.run(init)

    #load image from dataset(train set)
    joint_data_path = "./custom_data.json"
    train_val_path = "./train_val_indices.json"
    imgpath = "./000/"
    input_size = 301
    hint_roi_size = 23

    #saver.restore(sess, "./temp_model/model5.ckpt")


    for i in range(5000):

        #load data
        hintSet01,hintSet02,t_img,t_label_norm = training_data_feeder(joint_data_path, train_val_path, imgpath, input_size, hint_roi_size )
        #Normalize the image pixel values to 0~1
        hintSet01_norm = []
        hintSet02_norm = []

        t_img =[ np.float32(t_img /255.0) ]

        for rois in hintSet01:
            tmp = np.float32(rois / 255.0)
            hintSet01_norm.append(tmp.tolist())
        for rois in hintSet02:
            tmp = np.float32(rois / 255.0)
            hintSet02_norm.append(tmp.tolist())
        loss_val, _ = sess.run([loss, train_step] , 
                      feed_dict = {inputs_s:  t_img, 
                                   inputs_b1h1: hintSet01_norm, 
                                   labels: t_label_norm })
        if i % 50 == 0:
            print(loss_val)

    #save_path = saver.save(sess, "./temp_model/model" + '5' + ".ckpt")
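
As a diagnostic sketch (assuming the graph above has just been built; the zero-filled arrays are only hypothetical stand-ins with the right shapes), the gradients of the loss with respect to every trainable variable can be inspected directly. If they all come back as zeros or None, the binarization is cutting the gradient path:

grad_list = tf.gradients(loss, tf.trainable_variables())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # hypothetical zero batch, just to make the shapes match the placeholders
    feed = {inputs_b1h1: np.zeros((16, roi_size, roi_size, 3), np.float32),
            inputs_s: np.zeros((1, image_input_size, image_input_size, 3), np.float32),
            labels: np.zeros((16, 76, 76), np.float32)}
    for var, g in zip(tf.trainable_variables(), grad_list):
        if g is None:
            print(var.name, "-> no gradient path")
        else:
            print(var.name, "-> max |grad| =", np.abs(sess.run(g, feed)).max())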

Here are the GitHub repo and the dataset links on GitHub.

2018.11.9 Update:

I borrowed a binarization method from "Binarized Neural Network".

Here's the code:

from tensorflow.python.framework import ops  # needed for ops.name_scope

def ste_binarize(value):
    """
    Binarize a tensor with tf.sign, using the straight-through estimator (STE) for the gradient.
    The gradient of tf.sign(x) is zero everywhere, so this uses
        g.gradient_override_map({"Sign": "Identity"})
    to replace the gradient of tf.sign(x) with the gradient of x itself (the identity).
    """
    g = tf.get_default_graph()

    with ops.name_scope("Binarized"):
        with g.gradient_override_map({"Sign": "Identity"}):
            return tf.sign(value)

This method works well in the original project on GitHub. I replaced the original tf.sign() with this method and tried training again. Sadly, it didn't work in my case. I'll change the structure of my model and try to record the gradient values. If I make any progress, I'll update this post.
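
To record the gradient values, here is a minimal sketch (assuming TF 1.x and the ste_binarize above) that checks whether the identity override actually lets a gradient flow through the binarization:

x = tf.placeholder(tf.float32, [None])
code = ste_binarize(x)
probe_loss = tf.reduce_sum(code)
probe_grad = tf.gradients(probe_loss, x)[0]   # with the identity override this should be all ones

with tf.Session() as sess:
    print(sess.run(probe_grad, feed_dict={x: [-2.0, -0.1, 0.3, 5.0]}))
    # prints [1. 1. 1. 1.] with the STE; plain tf.sign would give all zeros

The same tf.gradients call can be pointed at the conv variables of the full model and run inside the training loop to log the gradient magnitudes per step.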

2018.11.22 Update: I changed the model structure and trained it again. This time I discovered a strange behavior of the loss value: while training, the loss actually went up and then froze at a certain value.

Here's the new model:

import tensorflow as tf

from tensorflow.python.framework import ops

from tensorflow.python.platform import gfile


from progress.bar import Bar

import numpy as np
import time
from datetime import datetime
import math
import matplotlib.pyplot as plt
import cv2

from imageLoader import getPaddedROI,training_batch_generator

tf.reset_default_graph()



#load image from dataset(train set)
joint_data_path = "./custom_data.json"
train_val_path = "./train_val_indices.json"
imgpath = "./000/"
input_size = 301
hint_roi_size = 23

roi_size = 23
image_input_size = 301
batch_number =10

def truncated_normal_var(name,shape,dtype):
    return(tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.truncated_normal_initializer(stddev=0.01)))
def zero_var(name,shape,dtype):
    return(tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.constant_initializer(0.0)))

def ste_binarize(value):
    """
    Binarize a tensor with tf.sign, using the straight-through estimator (STE) for the gradient.
    The gradient of tf.sign(x) is zero everywhere, so this uses
        g.gradient_override_map({"Sign": "Identity"})
    to replace the gradient of tf.sign(x) with the gradient of x itself (the identity).
    """
    g = tf.get_default_graph()

    with ops.name_scope("Binarized"):
        with g.gradient_override_map({"Sign": "Identity"}):
            return tf.sign(value)

#define the model
def paraNet(hint_inputs, sample_inputs):
    def paraConv(inputs):
        out_l1 = tf.layers.conv2d(inputs , 16, [3, 3],strides=(2, 2), padding ='valid' ,name='para_conv_1')
        out_l1r = tf.nn.relu(out_l1)
        out_l2 = tf.layers.conv2d(out_l1r, 48, [3, 3],strides=(2, 2), padding ='valid' ,name='para_conv_2')
        out_l2r = tf.nn.relu(out_l2)
        out_l3 = tf.layers.conv2d(out_l2r, 96, [5, 5],strides=(1, 1), padding ='valid' ,name='para_conv_3')
        out_l3r = tf.nn.relu(out_l3)
        out_l4 = tf.layers.conv2d(out_l3r, 32, [1, 1],strides=(1, 1), padding ='valid' ,name='para_conv_4')
        hint =tf.reshape( tf.squeeze(  ste_binarize( out_l4 ) ), [batch_number, 1 , 1 , 32] )
        return hint

    with tf.variable_scope('conv'):
        hint00 = paraConv(hint_inputs[:,0,:,:,:])
    with tf.variable_scope('conv', reuse=True):
        hint01 = paraConv(hint_inputs[:,1,:,:,:])
        hint02 = paraConv(hint_inputs[:,2,:,:,:])
        hint03 = paraConv(hint_inputs[:,3,:,:,:])
        hint04 = paraConv(hint_inputs[:,4,:,:,:])
        hint05 = paraConv(hint_inputs[:,5,:,:,:])
        hint06 = paraConv(hint_inputs[:,6,:,:,:])
        hint07 = paraConv(hint_inputs[:,7,:,:,:])
        hint08 = paraConv(hint_inputs[:,8,:,:,:])
        hint09 = paraConv(hint_inputs[:,9,:,:,:])
        hint10 = paraConv(hint_inputs[:,10,:,:,:])
        hint11 = paraConv(hint_inputs[:,11,:,:,:])
        hint12 = paraConv(hint_inputs[:,12,:,:,:])
        hint13 = paraConv(hint_inputs[:,13,:,:,:])
        hint14 = paraConv(hint_inputs[:,14,:,:,:])
        hint15 = paraConv(hint_inputs[:,15,:,:,:])

        out_2_l1 = tf.layers.conv2d(sample_inputs, 16, [3, 3], strides=(2, 2), padding='same', name='para_conv_1')
        out_2_l1r = tf.nn.relu(out_2_l1)
        out_2_l2 = tf.layers.conv2d(out_2_l1r, 48, [3, 3], strides=(2, 2), padding='same', name='para_conv_2')
        out_2_l2r = tf.nn.relu(out_2_l2)
        out_2_l3 = tf.layers.conv2d(out_2_l2r, 96, [5, 5], strides=(1, 1), padding='same', name='para_conv_3')
        out_2_l3r = tf.nn.relu(out_2_l3)
        out_2_l4 = tf.layers.conv2d(out_2_l3r, 32, [1, 1], strides=(1, 1), padding='same', name='para_conv_4')
        #sample = tf.sign( tf.sigmoid(out_2_l4))
        sample = ste_binarize(out_2_l4)

        #originalMap = tf.reduce_sum( tf.abs(tf.subtract( hint00, sample )), axis=3 )

        map0 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint00, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map1 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint01, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map2 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint02, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map3 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint03, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map4 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint04, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map5 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint05, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map6 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint06, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map7 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint07, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map8 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint08, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map9 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint09, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map10 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint10, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map11 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint11, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map12 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint12, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map13 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint13, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map14 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint14, sample )), axis=3 ), [batch_number, 1, 76, 76] )
        map15 = tf.reshape( tf.reduce_sum( tf.abs(tf.subtract( hint15, sample )), axis=3 ), [batch_number, 1, 76, 76] )

        totoal_map = tf.div( tf.concat([map0, map1, map2, map3, map4, map5, map6, map7,
                                        map8, map9, map10, map11, map12, map13, map14, map15], 1), 32 )
    return totoal_map




inputs_b1h1 = tf.placeholder(tf.float32, ( None, 16, roi_size, roi_size, 3), name='inputs_b1h1')
inputs_s = tf.placeholder(tf.float32, (None, image_input_size, image_input_size, 3), name='inputs_s')
ground_truth_labels = tf.placeholder(tf.float32,(None, 16,76,76), name='labels')

mtotoal_map = paraNet(inputs_b1h1 , inputs_s )
mloss = tf.nn.l2_loss( ground_truth_labels -  mtotoal_map, name = 'loss'  )
train_step = tf.train.GradientDescentOptimizer(0.0001).minimize(mloss)

init =  tf.global_variables_initializer()

saver = tf.train.Saver()

with tf.Session() as sess:
    #writer = tf.summary.FileWriter("./variable_graph",graph = sess.graph)
    sess.run(init)
    #saver.restore(sess, "./temp_model/model7.ckpt")

    loss_per_iteration = []
    for i in range(100):

        #load data
        hintSet01,hintSet02,t_img,t_label_norm = training_batch_generator(joint_data_path, train_val_path, imgpath, input_size, hint_roi_size,batch_number)

        loss_val, _ = sess.run([mloss, train_step] , 
                      feed_dict = {inputs_s:  t_img, 
                                   inputs_b1h1: hintSet01, 
                                   ground_truth_labels: t_label_norm })
        loss_per_iteration.append(loss_val)
        if i % 5 == 0:
            print(loss_val)

    plt.plot(loss_per_iteration)
    plt.show()
    save_path = saver.save(sess, "./temp_model/model" + '7' + ".ckpt")
  • One issue is that you're taking the sign of the sigmoid, which is always equal to 1, since the sigmoid is always positive (except for quantization error). Another effect of using sign is that its derivative is zero everywhere, so your gradient will always be zero and your model will never be updated. – Cory Nezin Nov 04 '18 at 18:39
  • So in fact I didn't fix my previous problem (which I thought was caused by tf.round, and which I replaced with tf.sign and tf.sigmoid)! I wonder if there's a feasible way to perform a "hashing trick" on the final output of a neural network... – pharmboy Nov 05 '18 at 09:49
