4

I have been trying to replicate the Alex Graves handwriting synthesis model, using TensorFlow and Python on a 1080 Ti GPU with CUDA.

I replicated all of the features explained in the paper exactly, and even clipped the respective gradient values in place, but I am having real difficulty training it.

I also preprocessed the data in the way explained in the paper, including normalizing the x and y offsets, but the problem is that training usually cannot lower the negative log likelihood below about 1000, whereas in the paper it reaches about -1000, and after that I see NaN weights.

The only extra thing I did was to add 0.0000001 to the conditional probability of every stroke to prevent NaN values in the log likelihood.

Any tips or suggestions or experience with such a task?

This is the cell code I use:

class Custom_Cell(RNNCell):
    """Stacked-LSTM RNN cell with a Gaussian attention window and mixture outputs.

    At every step the cell reads the previous attention parameters from the
    state, computes a soft window over ``one_hot_vector``, runs the LSTM stack
    (every layer sees the raw input, the window and the layer below) and
    emits the parameters of a bivariate Gaussian mixture plus a Bernoulli
    end-of-stroke probability.

    State layout (axis 1): ``[c_0, h_0, c_1, h_1, ..., alpha, beta, kappa]``.
    Output layout (axis 1): ``[most_attended, binomial, pi, mu, sigma, rho]``.
    """

    def __init__(self, forget_bias, bias, one_hot_vector, hidden_layer_nums=(700, 700, 700), mixture_num=10, attention_num=4):
        """Create the LSTM layers and all projection variables.

        Args:
            forget_bias: forget-gate bias handed to every ``LSTMCell``.
            bias: sampling bias; scales the mixture logits by ``1 + bias`` and
                shifts the log standard deviations by ``-bias``.
            one_hot_vector: 3-D tensor the attention window is taken over;
                axis 1 is the position axis, axis 2 the per-position encoding.
            hidden_layer_nums: number of units of each LSTM layer.
            mixture_num: number of bivariate Gaussian mixture components.
            attention_num: number of Gaussians in the attention window.
        """
        # Copy so that a caller's sequence (or the default) is never shared.
        hidden_layer_nums = list(hidden_layer_nums)
        self.bias = bias
        # The first positional argument of truncated_normal_initializer is the
        # *mean*; the standard deviation must be passed by keyword, otherwise
        # the LSTM kernels are drawn with mean 0.075 and stddev 1.0.
        self.lstms = [
            LSTMCell(num_units=units, initializer=tf.truncated_normal_initializer(stddev=0.075), dtype=tf.float32, forget_bias=forget_bias)
            for units in hidden_layer_nums
        ]
        self.attention_num = attention_num
        self.mixture_num = mixture_num
        self.attention_var_num = 3*self.attention_num
        # Sizes are kept in private attributes and exposed through the
        # ``state_size`` / ``output_size`` properties below.
        self._state_size = 2*sum(hidden_layer_nums) + self.attention_var_num
        self._output_size = 6*self.mixture_num + 1 + 1
        self.one_hot_vector = one_hot_vector
        self.lstm_num = len(hidden_layer_nums)
        self.hidden_layer_nums = hidden_layer_nums
        self.char_num = self.one_hot_vector.shape[2]

        def weight(name, shape):
            # All hand-made variables share dtype, initializer and trainability.
            return tf.get_variable(name, shape=shape, dtype=tf.float32,
                                   initializer=tf.truncated_normal_initializer(stddev=0.075),
                                   trainable=True)

        self.lstm_to_attention_weights = weight("lstms/first_to_attention_mtrx", [hidden_layer_nums[0], self.attention_var_num])
        self.lstm_to_attention_bias = weight("lstms/first_to_attention_bias", [self.attention_var_num])
        # The projected output excludes ``most_attended``, hence the -1.
        projected_size = self._output_size - 1
        self.all_to_output_mtrx = [
            weight("lstms/to_output_mtrx_" + str(i), [hidden_layer_nums[i], projected_size])
            for i in range(self.lstm_num)
        ]
        self.all_to_output_bias = weight("lstms/output_bias", [projected_size])
        self.i_to_h = []
        self.w_to_h = []
        self.h_to_h = []
        self.prev_h_to_h = []
        self.lstm_bias = []
        for i in range(self.lstm_num):
            units = hidden_layer_nums[i]
            self.i_to_h.append(weight("lstms/i_to_h_" + str(i), [3, units]))
            self.w_to_h.append(weight("lstms/w_to_h_" + str(i), [self.char_num, units]))
            self.h_to_h.append(weight("lstms/h_to_h_" + str(i), [units, units]))
            self.lstm_bias.append(weight("lstms/bias_" + str(i), [units]))
            if i != 0:
                # Connection from the layer below; index i-1 in prev_h_to_h.
                self.prev_h_to_h.append(weight("lstms/prev_h_to_h_" + str(i), [hidden_layer_nums[i-1], units]))

    def __call__(self, inputs, state, scope=None):
        """Run one time step.

        Args:
            inputs: 2-D tensor with 3 columns (it is multiplied by the
                ``[3, units]`` input matrices).
            state: 2-D tensor laid out as described in the class docstring.
            scope: unused; accepted for RNNCell compatibility.

        Returns:
            ``(output, new_state)`` with the layouts from the class docstring.
        """
        # Undo the flat state layout: (c, h) per layer, then the attention vector.
        split_sizes = []
        for units in self.hidden_layer_nums:
            split_sizes.extend([units, units])
        split_sizes.append(self.attention_var_num)
        pieces = tf.split(state, split_sizes, axis=1)
        prev_tuples = [LSTMStateTuple(pieces[2*i], pieces[2*i + 1]) for i in range(self.lstm_num)]
        prev_attention_vec = pieces[2*self.lstm_num]

        attention_sizes = [self.attention_num, self.attention_num, self.attention_num]
        new_attention_vec = 0
        next_states = []
        most_attended = 0
        last_output = 0
        w = None
        for i in range(self.lstm_num):
            prev_h = prev_tuples[i].h
            cell = self.lstms[i]
            if i == 0:
                with tf.name_scope("layer_1"):
                    # The first layer sees the window built from the previous step's attention.
                    w, most_attended = self.gaussian_attention(self.one_hot_vector, prev_attention_vec)
                    input_vec = tf.matmul(inputs, self.i_to_h[0]) + tf.matmul(prev_h, self.h_to_h[0]) + tf.matmul(w, self.w_to_h[0]) + self.lstm_bias[0]
                    _, (new_c, new_h) = cell(input_vec, prev_tuples[0])
                    next_states.append(new_c)
                    next_states.append(new_h)
                    last_output = tf.matmul(new_h, self.all_to_output_mtrx[0])
                    with tf.name_scope("attention_layer"):
                        temp_attention = tf.matmul(new_h, self.lstm_to_attention_weights) + self.lstm_to_attention_bias
                        new_alpha, new_beta, new_kappa = tf.split(temp_attention, attention_sizes, axis=1)
                        _, _, old_kappa = tf.split(prev_attention_vec, attention_sizes, axis=1)
                        new_alpha = tf.exp(new_alpha)
                        new_beta = tf.exp(new_beta)
                        # kappa only ever moves forward: a positive increment is added each step.
                        new_kappa = tf.exp(new_kappa) + old_kappa
                        new_attention_vec = tf.concat([new_alpha, new_beta, new_kappa], axis=1)
            else:
                with tf.name_scope("layer_" + str(i)):
                    if i == 1:
                        # Upper layers all see the window of the *updated* attention;
                        # it is identical for every such layer, so build it once.
                        w, most_attended = self.gaussian_attention(self.one_hot_vector, new_attention_vec)
                    input_vec = tf.matmul(inputs, self.i_to_h[i]) + tf.matmul(next_states[-1], self.prev_h_to_h[i-1]) + tf.matmul(prev_h, self.h_to_h[i]) + tf.matmul(w, self.w_to_h[i]) + self.lstm_bias[i]
                    _, (new_c, new_h) = cell(input_vec, prev_tuples[i])
                    next_states.append(new_c)
                    next_states.append(new_h)
                    last_output = last_output + tf.matmul(new_h, self.all_to_output_mtrx[i])
        with tf.name_scope("output"):
            last_output = last_output + self.all_to_output_bias
            next_states.append(new_attention_vec)
            state_to_return = tf.concat(next_states, axis=1)
            output_split_param = [1, self.mixture_num, 2*self.mixture_num, 2*self.mixture_num, self.mixture_num]
            binomial_param, pi, mu, sigma, rho = tf.split(last_output, output_split_param, axis=1)
            # sigmoid(-x) == 1 / (1 + exp(x)) but its gradient stays finite when exp(x) overflows.
            binomial_param = tf.sigmoid(tf.negative(binomial_param))
            pi = tf.nn.softmax(tf.multiply(pi, 1.+self.bias), axis=1)
            sigma = tf.exp(sigma-self.bias)
            rho = tf.tanh(rho)
            output_to_return = tf.concat([most_attended, binomial_param, pi, mu, sigma, rho], axis=1)
        return output_to_return, state_to_return

    @property
    def state_size(self):
        """Width of the flat state vector."""
        return self._state_size

    @property
    def output_size(self):
        """Width of the per-step output vector."""
        return self._output_size

    def gaussian_attention(self, sequence, params):
        """Compute the soft attention window over ``sequence``.

        Args:
            sequence: 3-D tensor; axis 1 is the position axis and must have a
                statically known length.
            params: 2-D tensor holding ``[alpha, beta, kappa]`` side by side,
                ``attention_num`` columns each.

        Returns:
            ``(window, most_attended)``: the weighted sum of ``sequence`` over
            its position axis, and the index (as float32, one column) of the
            position with the largest weight.
        """
        with tf.name_scope("attention_calculation"):
            sizes = [self.attention_num, self.attention_num, self.attention_num]
            alpha, beta, kappa = tf.split(params, sizes, axis=1)
            seq_length = int(sequence.shape[1])
            # Positions are spaced 20 apart in kappa units.
            positions = 20.0 * np.arange(seq_length, dtype=np.float32)
            # Broadcast (batch, attention_num, 1) against (seq_length,).
            offsets = tf.expand_dims(kappa, 2) - positions
            components = tf.expand_dims(alpha, 2) * tf.exp(tf.negative(tf.expand_dims(beta, 2)) * tf.square(offsets))
            weights = tf.reduce_sum(components, axis=1)
            most_attended = tf.argmax(weights, axis=1)
            most_attended = tf.reshape(tf.cast(most_attended, dtype=tf.float32), shape=[-1, 1])
            # Broadcasting replaces the explicit tile over the encoding axis.
            to_return = tf.reduce_sum(tf.expand_dims(weights, 2) * sequence, axis=1)
        return to_return, most_attended

And this is the RNN together with the loss network:

`to_write_one_hot = tf.placeholder(dtype=tf.float32,shape=(None,line_length,dict_length))
sequence = tf.placeholder(dtype=tf.float32,shape=(None,None,3))
sequence_shift = tf.placeholder(dtype=tf.float32,shape=(None,None,3))
bias = tf.placeholder(shape=[1],dtype=tf.float32)
sequence_length = tf.placeholder(shape=(None),dtype=tf.int32)
forget_bias_placeholder = tf.placeholder(shape=(None),dtype=tf.float32)

graves_cell = Custom_Cell(forget_bias=1,one_hot_vector=to_write_one_hot,hidden_layer_nums=hidden_layer_nums,mixture_num=mixture_num,bias=bias,attention_num=attention_num)

output, state = tf.nn.dynamic_rnn(graves_cell,sequence,dtype=tf.float32,sequence_length=sequence_length)

with tf.name_scope("loss_layer"):
    mask = tf.sign(tf.reduce_max(tf.abs(output), 2))
    most_attended, binomial_param, pi, mu, sigma, rho = tf.split(output,[1,1,mixture_num,2*mixture_num,2*mixture_num,mixture_num], axis=2)
    pi = tf.split(pi,mixture_num,axis=2)
    mu = tf.split(mu,mixture_num,axis=2)
    sigma = tf.split(sigma,mixture_num,axis=2)
    rho = tf.split(rho,mixture_num,axis=2)
    negative_log_likelihood = 0
    probability = 0
    x1, x2, e = tf.split(sequence_shift,3,axis=2)
    for i in range(mixture_num):
        pi_now = pi[i]
        mu_now = tf.split(mu[i],2,axis=2)
        mu_1 = mu_now[0]
        mu_2 = mu_now[1]
        sigma_now = tf.split(sigma[i],2,axis=2)
        sigma_1 = sigma_now[0] + (1-tf.reshape(mask, [-1,max_len,1]))
        sigma_2 = sigma_now[1] + (1-tf.reshape(mask, [-1,max_len,1]))
        rho_now = rho[i]
        Z = tf.divide(tf.square(x1-mu_1),tf.square(sigma_1)) + tf.divide(tf.square(x2-mu_2),tf.square(sigma_2)) - tf.divide(tf.multiply(tf.multiply(x1-mu_1,x2-mu_2),2*rho_now),tf.multiply(sigma_1,sigma_2))
        prob = tf.exp(tf.div(tf.negative(Z),2*(1-tf.square(rho_now))))
        Normalizing_factor = 2*np.pi*tf.multiply(sigma_1,sigma_2)
        Normalizing_factor = tf.multiply(Normalizing_factor,tf.sqrt(1-tf.square(rho_now)))
        prob = tf.divide(prob,Normalizing_factor)
        prob = tf.multiply(pi_now,prob)
        probability = probability + prob
    binomial_likelihood = tf.multiply(binomial_param,e) + tf.multiply(1-binomial_param,1-e)
    probability = tf.multiply(probability,binomial_likelihood)
    probability = probability + (1-tf.reshape(mask,[-1,max_len,1]))
    temp_tensor = tf.multiply(mask, tf.log(tf.reshape(probability,[-1,max_len]) + mask*0.00001))
    negative_log_likelihood_0 = tf.negative(tf.reduce_sum(temp_tensor,axis=1))
    negative_log_likelihood_1 = tf.divide(negative_log_likelihood_0,tf.reshape(tf.cast(sequence_length, dtype=tf.float32), shape=[-1,1]))
    negative_log_likelihood_1 = tf.reduce_mean(negative_log_likelihood_1)
    tf.summary.scalar("average_per_timestamp_log_likelihood", negative_log_likelihood_1)
    negative_log_likelihood = tf.reduce_mean(negative_log_likelihood_0)
    with tf.name_scope("train_op"):
        optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001,momentum=0.9, decay=0.95,epsilon=0.0001)
        gvs = optimizer.compute_gradients(negative_log_likelihood)
        capped_gvs = []
        for grad, var in gvs:
            if var.name.__contains__("rnn"):
                capped_gvs.append((tf.clip_by_value(grad,-10,10),var))
            else:
                capped_gvs.append((tf.clip_by_value(grad,-100,100),var))
        train_op = optimizer.apply_gradients(capped_gvs)
`

Edit 1. I discovered that I was clipping the gradients the wrong way; the correct way is to introduce a new op, as explained in https://github.com/tensorflow/tensorflow/issues/2793, so that only the gradients flowing into the output of the whole network and into the outputs of the LSTM cells are clipped.

def clip_gradient(x, clip):
    """Return ``x`` unchanged in the forward pass, clipping its gradient by norm.

    Args:
        x: tensor to pass through.
        clip: maximum L2 norm handed to ``tf.clip_by_norm`` for the gradient
            arriving at ``x`` during backpropagation.

    Returns:
        A tensor with the same value as ``x``.
    """
    # ``tf.custom_gradient`` expects one gradient per positional input of the
    # decorated function. Decorating a two-argument function while returning a
    # single gradient mismatches that contract, so ``clip`` is closed over and
    # only ``x`` is an input of the custom-gradient op.
    @tf.custom_gradient
    def _identity_with_clipped_grad(value):
        def grad(dresult):
            return tf.clip_by_norm(dresult, clip)
        return tf.identity(value), grad

    return _identity_with_clipped_grad(x)

Add the lines above to your code and apply the function to any tensor whose gradient you want to clip during backpropagation.

I still need to check what results this gives.

Edit 2. The changed Model code is:

from tensorflow.contrib.rnn import RNNCell
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.contrib.rnn import LSTMStateTuple
import tensorflow as tf
import numpy as np

@tf.custom_gradient
def clip_gradient_lstm(x):
    """Pass ``x`` through unchanged; bound its incoming gradient to [-10, 10]."""
    def clipped_backward(upstream):
        # Element-wise clipping of the gradient arriving from later ops.
        bounded = tf.clip_by_value(upstream, -10, 10)
        return [bounded]

    return x, clipped_backward

@tf.custom_gradient
def clip_gradient_output(x):
    """Pass ``x`` through unchanged; bound its incoming gradient to [-100, 100]."""
    def clipped_backward(upstream):
        # Element-wise clipping of the gradient arriving from later ops.
        bounded = tf.clip_by_value(upstream, -100, 100)
        return [bounded]

    return x, clipped_backward

def length_of(seq):
    """Count, per batch row, the steps of ``seq`` that contain any non-zero value.

    ``seq`` is reduced over axis 2, so it is expected to be 3-D; the result is
    an int32 tensor with one entry per row of axis 0.
    """
    # sign(max |value|) is 1 for a step with any non-zero entry and 0 otherwise.
    step_is_used = tf.sign(tf.reduce_max(tf.abs(seq), axis=2))
    return tf.cast(tf.reduce_sum(step_is_used, 1), tf.int32)

class Custom_Cell(RNNCell):
    """Stacked-LSTM RNN cell with a Gaussian attention window and mixture outputs.

    At every step the cell reads the previous attention parameters from the
    state, computes a soft window over ``one_hot_vector``, runs the LSTM stack
    (every layer sees the raw input, the window and the layer below) and
    emits the parameters of a bivariate Gaussian mixture plus a Bernoulli
    end-of-stroke probability. Gradients flowing into each LSTM output and
    into the projected output are clipped by ``clip_gradient_lstm`` and
    ``clip_gradient_output``.

    State layout (axis 1): ``[c_0, h_0, c_1, h_1, ..., alpha, beta, kappa]``.
    Output layout (axis 1): ``[most_attended, binomial, pi, mu, sigma, rho]``.
    """

    def __init__(self, forget_bias, bias, one_hot_vector, hidden_layer_nums=(700, 700, 700), mixture_num=10, attention_num=4):
        """Create the LSTM layers and all projection variables.

        Args:
            forget_bias: forget-gate bias handed to every ``LSTMCell``.
            bias: sampling bias; scales the mixture logits by ``1 + bias`` and
                shifts the log standard deviations by ``-bias``.
            one_hot_vector: 3-D tensor the attention window is taken over;
                axis 1 is the position axis, axis 2 the per-position encoding.
            hidden_layer_nums: number of units of each LSTM layer.
            mixture_num: number of bivariate Gaussian mixture components.
            attention_num: number of Gaussians in the attention window.
        """
        # Copy so that a caller's sequence (or the default) is never shared.
        hidden_layer_nums = list(hidden_layer_nums)
        self.bias = bias
        # The first positional argument of truncated_normal_initializer is the
        # *mean*; the standard deviation must be passed by keyword, otherwise
        # the LSTM kernels are drawn with mean 0.075 and stddev 1.0.
        self.lstms = [
            LSTMCell(num_units=units, initializer=tf.truncated_normal_initializer(stddev=0.075), dtype=tf.float32, forget_bias=forget_bias)
            for units in hidden_layer_nums
        ]
        self.attention_num = attention_num
        self.mixture_num = mixture_num
        self.attention_var_num = 3*self.attention_num
        # Sizes are kept in private attributes and exposed through the
        # ``state_size`` / ``output_size`` properties below.
        self._state_size = 2*sum(hidden_layer_nums) + self.attention_var_num
        self._output_size = 6*self.mixture_num + 1 + 1
        self.one_hot_vector = one_hot_vector
        self.lstm_num = len(hidden_layer_nums)
        self.hidden_layer_nums = hidden_layer_nums
        self.char_num = self.one_hot_vector.shape[2]

        def weight(name, shape):
            # All hand-made variables share dtype, initializer and trainability.
            return tf.get_variable(name, shape=shape, dtype=tf.float32,
                                   initializer=tf.truncated_normal_initializer(stddev=0.075),
                                   trainable=True)

        self.lstm_to_attention_weights = weight("lstms/first_to_attention_mtrx", [hidden_layer_nums[0], self.attention_var_num])
        self.lstm_to_attention_bias = weight("lstms/first_to_attention_bias", [self.attention_var_num])
        # The projected output excludes ``most_attended``, hence the -1.
        projected_size = self._output_size - 1
        self.all_to_output_mtrx = [
            weight("lstms/to_output_mtrx_" + str(i), [hidden_layer_nums[i], projected_size])
            for i in range(self.lstm_num)
        ]
        self.all_to_output_bias = weight("lstms/output_bias", [projected_size])
        self.i_to_h = []
        self.w_to_h = []
        self.h_to_h = []
        self.prev_h_to_h = []
        self.lstm_bias = []
        for i in range(self.lstm_num):
            units = hidden_layer_nums[i]
            self.i_to_h.append(weight("lstms/i_to_h_" + str(i), [3, units]))
            self.w_to_h.append(weight("lstms/w_to_h_" + str(i), [self.char_num, units]))
            self.h_to_h.append(weight("lstms/h_to_h_" + str(i), [units, units]))
            self.lstm_bias.append(weight("lstms/bias_" + str(i), [units]))
            if i != 0:
                # Connection from the layer below; index i-1 in prev_h_to_h.
                self.prev_h_to_h.append(weight("lstms/prev_h_to_h_" + str(i), [hidden_layer_nums[i-1], units]))

    def __call__(self, inputs, state, scope=None):
        """Run one time step.

        Args:
            inputs: 2-D tensor with 3 columns (it is multiplied by the
                ``[3, units]`` input matrices).
            state: 2-D tensor laid out as described in the class docstring.
            scope: unused; accepted for RNNCell compatibility.

        Returns:
            ``(output, new_state)`` with the layouts from the class docstring.
        """
        # Undo the flat state layout: (c, h) per layer, then the attention vector.
        split_sizes = []
        for units in self.hidden_layer_nums:
            split_sizes.extend([units, units])
        split_sizes.append(self.attention_var_num)
        pieces = tf.split(state, split_sizes, axis=1)
        prev_tuples = [LSTMStateTuple(pieces[2*i], pieces[2*i + 1]) for i in range(self.lstm_num)]
        prev_attention_vec = pieces[2*self.lstm_num]

        attention_sizes = [self.attention_num, self.attention_num, self.attention_num]
        new_attention_vec = 0
        next_states = []
        most_attended = 0
        last_output = 0
        w = None
        for i in range(self.lstm_num):
            prev_h = prev_tuples[i].h
            cell = self.lstms[i]
            if i == 0:
                with tf.name_scope("layer_1"):
                    # The first layer sees the window built from the previous step's attention.
                    w, most_attended = self.gaussian_attention(self.one_hot_vector, prev_attention_vec)
                    input_vec = tf.matmul(inputs, self.i_to_h[0]) + tf.matmul(prev_h, self.h_to_h[0]) + tf.matmul(w, self.w_to_h[0]) + self.lstm_bias[0]
                    _, (new_c, new_h) = cell(input_vec, prev_tuples[0])
                    # Identity in the forward pass; bounds the gradient entering this layer's output.
                    new_h = clip_gradient_lstm(new_h)
                    next_states.append(new_c)
                    next_states.append(new_h)
                    last_output = tf.matmul(new_h, self.all_to_output_mtrx[0])
                    with tf.name_scope("attention_layer"):
                        temp_attention = tf.matmul(new_h, self.lstm_to_attention_weights) + self.lstm_to_attention_bias
                        new_alpha, new_beta, new_kappa = tf.split(temp_attention, attention_sizes, axis=1)
                        _, _, old_kappa = tf.split(prev_attention_vec, attention_sizes, axis=1)
                        new_alpha = tf.exp(new_alpha)
                        new_beta = tf.exp(new_beta)
                        # kappa only ever moves forward: a positive increment is added each step.
                        new_kappa = tf.exp(new_kappa) + old_kappa
                        new_attention_vec = tf.concat([new_alpha, new_beta, new_kappa], axis=1)
            else:
                with tf.name_scope("layer_" + str(i)):
                    if i == 1:
                        # Upper layers all see the window of the *updated* attention;
                        # it is identical for every such layer, so build it once.
                        w, most_attended = self.gaussian_attention(self.one_hot_vector, new_attention_vec)
                    input_vec = tf.matmul(inputs, self.i_to_h[i]) + tf.matmul(next_states[-1], self.prev_h_to_h[i-1]) + tf.matmul(prev_h, self.h_to_h[i]) + tf.matmul(w, self.w_to_h[i]) + self.lstm_bias[i]
                    _, (new_c, new_h) = cell(input_vec, prev_tuples[i])
                    new_h = clip_gradient_lstm(new_h)
                    next_states.append(new_c)
                    next_states.append(new_h)
                    last_output = last_output + tf.matmul(new_h, self.all_to_output_mtrx[i])
        with tf.name_scope("output"):
            last_output = last_output + self.all_to_output_bias
            # Identity in the forward pass; bounds the gradient entering the projected output.
            last_output = clip_gradient_output(last_output)
            next_states.append(new_attention_vec)
            state_to_return = tf.concat(next_states, axis=1)
            output_split_param = [1, self.mixture_num, 2*self.mixture_num, 2*self.mixture_num, self.mixture_num]
            binomial_param, pi, mu, sigma, rho = tf.split(last_output, output_split_param, axis=1)
            # sigmoid(-x) == 1 / (1 + exp(x)) but its gradient stays finite when exp(x) overflows.
            binomial_param = tf.sigmoid(tf.negative(binomial_param))
            pi = tf.nn.softmax(tf.multiply(pi, 1.+self.bias), axis=1)
            sigma = tf.exp(sigma-self.bias)
            rho = tf.tanh(rho)
            output_to_return = tf.concat([most_attended, binomial_param, pi, mu, sigma, rho], axis=1)
        return output_to_return, state_to_return

    @property
    def state_size(self):
        """Width of the flat state vector."""
        return self._state_size

    @property
    def output_size(self):
        """Width of the per-step output vector."""
        return self._output_size

    def gaussian_attention(self, sequence, params):
        """Compute the soft attention window over ``sequence``.

        Args:
            sequence: 3-D tensor; axis 1 is the position axis and must have a
                statically known length.
            params: 2-D tensor holding ``[alpha, beta, kappa]`` side by side,
                ``attention_num`` columns each.

        Returns:
            ``(window, most_attended)``: the weighted sum of ``sequence`` over
            its position axis, and the index (as float32, one column) of the
            position with the largest weight.
        """
        with tf.name_scope("attention_calculation"):
            sizes = [self.attention_num, self.attention_num, self.attention_num]
            alpha, beta, kappa = tf.split(params, sizes, axis=1)
            seq_length = int(sequence.shape[1])
            positions = np.arange(seq_length, dtype=np.float32)
            # Broadcast (batch, attention_num, 1) against (seq_length,).
            offsets = tf.expand_dims(kappa, 2) - positions
            components = tf.expand_dims(alpha, 2) * tf.exp(tf.negative(tf.expand_dims(beta, 2)) * tf.square(offsets))
            weights = tf.reduce_sum(components, axis=1)
            most_attended = tf.argmax(weights, axis=1)
            most_attended = tf.reshape(tf.cast(most_attended, dtype=tf.float32), shape=[-1, 1])
            # Broadcasting replaces the explicit tile over the encoding axis.
            to_return = tf.reduce_sum(tf.expand_dims(weights, 2) * sequence, axis=1)
        return to_return, most_attended

And the training is done by:

     with tf.name_scope("train_op"):
        # Gradient clipping now happens inside the cell, so the optimizer
        # minimizes the loss directly. The call is kept on one logical line:
        # a bare "optimizer =" followed by a line break is a syntax error.
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=0.0001, momentum=0.9, decay=0.95, epsilon=0.0001, centered=True)
        train_op = optimizer.minimize(negative_log_likelihood)

It is still training right now, but the loss is already as low as -10.

  • Had you succesfully use tensorflow for other deep learning tasks? Or you just read about tensorflow, and the first thing you tried to do with it is reproducing this model? Or you successfully reproduced half dozen models from other articles, but cannot reproduce Alex Graves' model? Or something else? – user31264 Sep 15 '18 at 08:52
  • @user31264 Yes, and you can see some of them in my github account at https://github.com/ariahala – aria halavati Sep 15 '18 at 09:12

0 Answers0