I am training an RNN-based language model using TensorFlow. The model is very similar to the PTB model example in the TF tutorials section. However, when I train the model on my own data, the perplexity does not go down; it remains constant across multiple epochs. Could anyone let me know what I might be doing wrong?

I have a feeling that I am not handling the targets properly. The gist of my code for the targets is:

def batcher(batch_size, unroll_steps, data, pad):
    # number of full batches in the data (integer division)
    batches = len(data) // batch_size
    for i in range(batches):
        # one batch of input sentences (each a list of word IDs)
        x = data[i * batch_size:(i + 1) * batch_size]
        # targets: each sentence shifted left by one word, padded at the end
        y = [line[1:] + [pad] for line in x]
        yield (x, y)

That is, I just shift each sentence left by one word and use the shifted sequence as the target, so the model predicts the next word at every position.
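
For example, calling the generator on two toy sentences (the word IDs and the pad ID here are made up) gives:

# toy data: two sentences of word IDs, with <pad> = 0 (made-up IDs)
data = [[5, 12, 7], [3, 9, 4]]
x, y = next(batcher(batch_size=2, unroll_steps=3, data=data, pad=0))
print(x)   # [[5, 12, 7], [3, 9, 4]]
print(y)   # [[12, 7, 0], [9, 4, 0]]  <- each line shifted left by one, then padded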

The training script and model (class) are shown below.

Training script (excerpt):

import numpy as np

def train(session, model, folder, batch_size, unroll_steps, epoch):

    word_to_id, id_to_word, train, val = build_inputs(folder, unroll_steps)
    pad = word_to_id['<pad>']
    costs = 0
    iters = 0
    train_size = len(train)
    batch_size = model.batch_size
    # number of full batches per epoch (integer division)
    batches = train_size // batch_size
    state = session.run(model._initial_state)
    print("Running epoch %d" % epoch)
    for i in range(batches):
        fetches = [model.cost, model._final_state, model.logits]
        # inputs and targets for this batch (targets shifted left by one word)
        x = train[i * batch_size:(i + 1) * batch_size]
        y = [line[1:] + [pad] for line in x]
        feed_dict = {
            model.input: x,
            model.targets: y,
            model._initial_state: state,  # carry the LSTM state across batches
        }
        cost, state, logits = session.run(fetches, feed_dict)
        costs += cost
        iters += model.unroll_steps
    print("\tEpoch %d: Perplexity is %f" % (epoch, np.exp(costs / iters)))

    return np.exp(costs / iters)
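
Since model.cost is the summed token loss divided by batch_size (see the model below), each cost is roughly unroll_steps times the average per-token loss, so exp(costs / iters) is the per-token perplexity. A minimal standalone illustration of that computation, with made-up numbers:

import numpy as np

# hypothetical per-batch costs: each is sum(token losses) / batch_size,
# i.e. about unroll_steps * (average per-token loss)
unroll_steps = 20
batch_costs = [138.0, 135.5, 133.2]   # made-up values

costs = sum(batch_costs)
iters = unroll_steps * len(batch_costs)
print("perplexity: %f" % np.exp(costs / iters))   # exp(mean per-token loss)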

Model:

import tensorflow as tf

class LM(object):

    def __init__(self, train, max_gradient, batch_size, unroll_steps, vocab, size, layers, learning_rate, init, prob):
        self.batch_size = batch_size
        self.max_gradient = max_gradient
        self.layers = layers
        self.learning_rate = learning_rate
        self.unroll_steps = unroll_steps
        self.init = init

        with tf.device('/gpu:0'), tf.name_scope("Input"):
            # word-ID inputs and their shifted-by-one targets
            self.input = tf.placeholder(tf.int64, shape=[batch_size, unroll_steps], name="input")
            self.targets = tf.placeholder(tf.int64, shape=[batch_size, unroll_steps], name="targets")

        with tf.device('/gpu:0'), tf.name_scope("Embedding"):
            embedding = tf.Variable(tf.random_uniform([vocab, size], -self.init, self.init), dtype=tf.float32, name="embedding")
            embedded_input = tf.nn.embedding_lookup(embedding, self.input, name="embedded_input")

        with tf.device('/gpu:0'), tf.name_scope("RNN"), tf.variable_scope(tf.get_variable_scope(), reuse=False) as scope:
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True)
            if train and prob < 1.0:
                lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=prob)
            cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for _ in range(layers)], state_is_tuple=True)

            self._initial_state = cell.zero_state(batch_size, tf.float32)
            # unroll the LSTM manually, reusing the weights after the first step
            outputs = []
            state = self._initial_state
            for step in range(unroll_steps):
                if step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(embedded_input[:, step, :], state)
                outputs.append(cell_output)

        with tf.device('/gpu:0'), tf.name_scope("Cost"), tf.variable_scope(tf.get_variable_scope(), reuse=False) as scope:
            # flatten the per-step outputs to [batch_size * unroll_steps, size]
            output = tf.reshape(tf.concat(outputs, 1), [-1, size])
            softmax_w = tf.get_variable("softmax_w", [size, vocab], dtype=tf.float32)
            softmax_b = tf.get_variable("softmax_b", [vocab], dtype=tf.float32)
            logits = tf.matmul(output, softmax_w) + softmax_b
            # per-token cross-entropy against the flattened targets
            targets = tf.reshape(self.targets, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
            self.cost = tf.reduce_sum(loss) / batch_size
            self._final_state = state
            self.logits = logits
            scope.reuse_variables()

        if not train:
            return

        with tf.device('/gpu:0'), tf.name_scope("Train"), tf.variable_scope(tf.get_variable_scope(), reuse=False):
            train_variables = tf.trainable_variables()
            # clip the gradients to max_gradient before applying them
            gradients, _ = tf.clip_by_global_norm(tf.gradients(self.cost, train_variables), self.max_gradient)
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.training = optimizer.apply_gradients(zip(gradients, train_variables))
            tf.get_variable_scope().reuse_variables()
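
For completeness, this is roughly how I build the graph and call train(); the hyperparameter values and the data folder below are placeholders, not my exact settings:

import tensorflow as tf

with tf.Graph().as_default():
    # placeholder hyperparameters; vocab must match the size of word_to_id
    model = LM(train=True, max_gradient=5.0, batch_size=20, unroll_steps=35,
               vocab=10000, size=650, layers=2, learning_rate=0.001,
               init=0.05, prob=0.5)
    # allow_soft_placement lets ops without GPU kernels fall back to the CPU
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as session:
        session.run(tf.global_variables_initializer())
        for epoch in range(1, 11):   # placeholder epoch count
            train(session, model, "data/", 20, 35, epoch)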