I am trying to build a word-level language model in TensorFlow. My inputs are batches of word IDs of shape (batch_size, seq_length), and my targets are the inputs shifted one time step to the left (so for each word, the target is the next word in the sequence).
The model receives word embeddings as input (the word embeddings were pre-trained using gensim word2vec). I manually checked that the word embeddings are read in correctly and that they correspond to the right word IDs.
Although I have tried out a lot of things, my model is not improving. Even when training for 100 epochs over the full training set, the accuracy remains the same.
What I have tried (without any success):
- Removing dropout. My first goal is to get rid of underfitting
- Different vocabulary size (100, 1000, 10000)
- Using gradient clipping/ not using gradient clipping
- Changing the initialization of the weights
- Data shuffling
- different optimizer (RMSProp, Adam and Gradient Descent)
- larger/smaller model (2-4 hidden layers with 128-256 hidden units)
- different batch size (10, 20, 128)
- different learning rate (0.01, 0.001, 0.1)
- different loss function (sparse_softmax_cross_entropy_with_logits or tf.contrib.seq2seq.sequence_loss)
- refeeding/not refeeding the final state of the LSTM during training
In the beginning, both loss and accuracy are improving. Also, the model is adapting its predictions. But then, after some epochs over the full training set, loss and accuracy stay constant. Also, the model predictions aren't changing anymore and it gets stuck. Here is an example that shows the development of loss and accuracy for the same input sequence. After epoch 30, nothing is changing anymore:
2017-11-08 06:59:24,298 - DEBUG - Targets: [ 91 4 9 116 237 1953 240 3 2 1 0 2 1 9 144 351 29 299 24 453]
2017-11-08 06:59:24,299 - DEBUG - Predicted sequence: [0 0 0 0 0 0 0 0 2 1 0 0 1 0 0 0 0 0 0 0]
2017-11-08 06:59:24,299 - INFO - Current epoch: 1
2017-11-08 06:59:24,299 - INFO - Current training step: 2000
2017-11-08 06:59:24,299 - INFO - Current loss: 107.67147064208984
2017-11-08 06:59:24,299 - INFO - Current accuracy: 0.1599999964237213
2017-11-08 07:04:09,559 - DEBUG - Targets: [ 91 4 9 116 237 1953 240 3 2 1 0 2 1 9 144 351 29 299 24 453]
2017-11-08 07:04:09,560 - DEBUG - Predicted sequence: [ 4 4 6 6 16 0 0 3 2 1 9 2 1 0 0 4 0 0 4 8]
2017-11-08 07:04:09,560 - INFO - Current epoch: 5
2017-11-08 07:04:09,560 - INFO - Current training step: 2000
2017-11-08 07:04:09,560 - INFO - Current loss: 97.8116455078125
2017-11-08 07:04:09,560 - INFO - Current accuracy: 0.2150000035762787
2017-11-08 07:43:03,875 - DEBUG - Targets: [ 91 4 9 116 237 1953 240 3 2 1 0 2 1 9 144 351 29 299 24 453]
2017-11-08 07:43:03,875 - DEBUG - Predicted sequence: [ 6 4 9 55 47 0 5 3 2 1 9 2 1 0 55 24 0 0 3 6]
2017-11-08 07:43:03,876 - INFO - Current epoch: 30
2017-11-08 07:43:03,876 - INFO - Current training step: 2000
2017-11-08 07:43:03,876 - INFO - Current loss: 84.75357055664062
2017-11-08 07:43:03,876 - INFO - Current accuracy: 0.2549999952316284
I have been working on this for a week already and I don't know what I can try out anymore. I would be super grateful for any tips or ideas.
The important parts of the code are here:
def build_graph(self, graph):
    """Build the word-level LSTM language model inside ``graph``.

    Creates, in order: a frozen embedding lookup, a stack of
    ``self.n_layers`` LSTM cells unrolled with ``tf.nn.dynamic_rnn``, a
    softmax projection onto the vocabulary, the sequence loss, the
    prediction/metric ops and a gradient-clipped Adam training op.  The
    tensors that the training loop feeds or fetches are stored on ``self``
    (``input_batch``, ``label_batch``, ``init_state``, ``final_state``,
    ``logits``, ``loss``, ``predictions``, ``perplexity``, ``accuracy``,
    ``global_step``, ``train_step``).

    Args:
        graph: Graph that receives all ops (entered via
            ``graph.as_default()``).

    Returns:
        The same ``graph`` object that was passed in.
    """
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        with tf.variable_scope('embedding'):
            # The embedding matrix is initialised from self.embds and kept
            # frozen (trainable=False), so it receives no gradient updates.
            embedding_matrix = tf.get_variable(
                name='embedding_matrix',
                shape=self.embds.shape,
                initializer=tf.constant_initializer(self.embds),
                trainable=False)

        with tf.name_scope('input'):
            # Word ids in, next-word ids out; both (batch, seq_length).
            self.input_batch = tf.placeholder(
                tf.int64, shape=(None, self.seq_length))
            self.inputs = tf.nn.embedding_lookup(
                embedding_matrix, self.input_batch)
            self.label_batch = tf.placeholder(
                tf.int64, shape=(None, self.seq_length))

        with tf.name_scope('rnn'):
            # One independent LSTM cell per layer, stacked into one cell.
            cells = [
                tf.contrib.rnn.LSTMCell(
                    self.n_hidden,
                    initializer=tf.contrib.layers.xavier_initializer())
                for _ in range(self.n_layers)
            ]
            cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

            # Zero-filled initial state; the training loop may override it
            # through feed_dict to carry state from one batch to the next.
            self.init_state = cell.zero_state(self.batch_size, tf.float32)

            output, self.final_state = tf.nn.dynamic_rnn(
                cell,
                inputs=self.inputs,
                initial_state=self.init_state)

        # Project every time step's output onto the vocabulary.  The RNN
        # output is flattened to 2-D for the matmul and the logits are
        # reshaped back to 3-D because sequence_loss expects
        # (batch, time, vocab).
        self.output_flat = tf.reshape(output, [-1, cell.output_size])
        softmax_w = tf.get_variable(
            "softmax_w", [self.n_hidden, self.vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable(
            "softmax_b", [self.vocab_size], dtype=tf.float32)
        logits = tf.nn.xw_plus_b(self.output_flat, softmax_w, softmax_b)
        self.logits = tf.reshape(
            logits, [self.batch_size, self.seq_length, self.vocab_size])

        # Cross-entropy averaged over the batch but not over time, then
        # summed: the loss is the per-sequence total over all time steps.
        loss = tf.contrib.seq2seq.sequence_loss(
            self.logits,
            self.label_batch,
            tf.ones([self.batch_size, self.seq_length], dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True)
        self.loss = tf.reduce_sum(loss)

        with tf.name_scope('prediction'):
            # Most likely word id at every position.
            self.predictions = tf.argmax(self.logits, axis=2)

        with tf.name_scope("metrics"):
            # Per-token cross-entropy.  This tensor used to be referenced
            # without being defined, which made graph construction fail
            # with a NameError.
            token_ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.label_batch, logits=self.logits)
            correct_predictions = tf.to_float(
                tf.equal(self.label_batch, self.predictions))
            # Perplexity is the exponential of the *mean* cross-entropy;
            # averaging exp(ce) instead would be dominated by outliers.
            self.perplexity = tf.exp(tf.reduce_mean(token_ce))
            self.accuracy = tf.reduce_mean(correct_predictions)

        with tf.name_scope('train'):
            self.global_step = tf.Variable(
                0,
                trainable=False,
                name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP,
                             tf.GraphKeys.GLOBAL_VARIABLES])

            # Only variables created with trainable=True are optimised,
            # which excludes the frozen embedding matrix.
            parameters = tf.trainable_variables()
            gradients = tf.gradients(self.loss, parameters)

            # Rescale all gradients jointly so that their global L2 norm
            # does not exceed self.clip_norm.
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, self.clip_norm)

            # NOTE(review): epsilon=0.1 is far above Adam's 1e-8 default
            # and damps the per-parameter step scaling -- confirm that this
            # value is intentional.
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.lr, epsilon=0.1)
            self.train_step = self.optimizer.apply_gradients(
                zip(clipped_gradients, parameters),
                global_step=self.global_step)

        self._create_summaries()
    return graph
def train(self, save_every=20):
    """Train the model for ``self.n_epochs`` passes over the training set.

    The data in ``self.x_train`` / ``self.y_train`` is consumed in
    consecutive slices of ``self.batch_size`` rows; a trailing partial
    batch is dropped.  The final LSTM state of every step is fed back as
    the initial state of the next one.  On every 2000th batch of an epoch
    (including the first) a summary is written and targets, predictions,
    loss, accuracy and perplexity are logged.

    Args:
        save_every: Not used by the visible code.
            NOTE(review): probably meant to throttle checkpointing --
            confirm the intended behaviour before wiring it in.
    """
    with self.graph.as_default():
        # Zero initial state: two (batch_size, n_hidden) arrays per LSTM
        # layer, mirroring the nested structure of self.init_state.  Built
        # for self.n_layers layers instead of a hard-coded two.
        _current_state = [
            (np.zeros((self.batch_size, self.n_hidden)),
             np.zeros((self.batch_size, self.n_hidden)))
            for _ in range(self.n_layers)
        ]

        # Number of full batches; the remainder rows are never visited.
        self.n_batches = int(self.x_train.shape[0] // self.batch_size)

        training_step = 0
        for epoch_id in range(self.n_epochs):
            for batch_number in range(self.n_batches):
                training_step += 1
                from_index = batch_number * self.batch_size
                to_index = from_index + self.batch_size
                _inputs = self.x_train[from_index:to_index, :]
                _labels = self.y_train[from_index:to_index, :]

                feed = {
                    self.input_batch: _inputs,
                    self.label_batch: _labels,
                }
                # Feed the previous final state into every state
                # placeholder of every layer.
                for layer in range(self.n_layers):
                    for part in (0, 1):
                        feed[self.init_state[layer][part]] = (
                            _current_state[layer][part])

                # The full logits tensor is deliberately not fetched: it
                # was unused and copying it to the host every step is
                # expensive.
                (_predictions, _, _current_state,
                 _loss, _acc, summary) = self.sess.run(
                    [self.predictions,
                     self.train_step,
                     self.final_state,
                     self.loss,
                     self.accuracy,
                     self.merged],
                    feed_dict=feed)

                if batch_number % 2000 == 0:
                    # self.loss sums the batch-averaged cross-entropy over
                    # all time steps (see build_graph), so dividing by
                    # seq_length gives the mean per-token cross-entropy,
                    # whose exponential is the perplexity.  Previously the
                    # logged variable was never assigned (NameError).
                    _perpl = np.exp(_loss / self.seq_length)

                    self.sw.add_summary(summary, training_step)
                    tf.logging.debug("Targets: {}".format(_labels[0]))
                    tf.logging.debug(
                        "Predicted sequence: {}".format(_predictions[0]))
                    tf.logging.info("Current epoch: {}".format(epoch_id))
                    tf.logging.info(
                        "Current training step: {}".format(batch_number))
                    tf.logging.info("Current loss: {}".format(_loss))
                    tf.logging.info("Current accuracy: {}".format(_acc))
                    tf.logging.info(
                        "Current perplexity: {}".format(_perpl))

            # NOTE(review): checkpoint placement (once per epoch) is
            # reconstructed from a paste without indentation -- confirm.
            self.save(epoch_id)