I am working on an encoder-decoder
chat bot that consists of an embedding layer
, two layers of LSTM
and a fully connected layer
on top of the decoder.
After I load the checkpoint file
, the loss
is way higher than it was the last time I saved the model, and the results of the chat bot are, as expected, way worse. And yet, the model is not back at its initial state, which means that if I saved the model when it had a loss of 2.4, it loads with a loss of 4-5 instead of 10 (the loss the model had before it started learning).
Also, the model learns a lot faster after the weights have been loaded which leads me to believe that some of the weights have been loaded successfully while some have not.
I'm using this code to build the model and load the checkpoint
in the __init__
function:
self.__gather_data()
self.__build_model()
tf.global_variables_initializer().run(session=self.sess)
# Saver over every variable in the graph (model weights plus the RMSProp
# slot variables created by the optimizer in __build_model).
self.saver = tf.train.Saver(tf.global_variables())
try:
    self.saver.restore(self.sess, self.checkpoint_path)
except Exception as exc:
    # BUG FIX: the original bare `except:` swallowed EVERY restore failure
    # (renamed/missing variables, shape mismatches, absent checkpoint file),
    # so a partially-incompatible checkpoint looked identical to a clean
    # fresh start. Surface the actual reason before falling back.
    print('Could not restore checkpoint:', exc)
    print('Starting from scratch.')
This is how I build the model in the __build_model
function:
# placeholders
# int32 token-id tensors: `inputs` has a fixed time dimension
# (self.input_length); `outputs` and `targets` are dynamic in batch and time.
with tf.variable_scope(self.scope + '-placeholders'):
self.inputs = tf.placeholder(tf.int32,[None, self.input_length], name='inputs')
self.outputs = tf.placeholder(tf.int32, [None, None], name='outputs')
self.targets = tf.placeholder(tf.int32, [None, None], name='targets')
# embedding
# NOTE(review): this scope name is self.scope + 'embedding' — it is missing
# the '-' separator used by every other scope in this method. Harmless on its
# own, but any change to scope names between save time and restore time makes
# tf.train.Saver unable to match these variables — keep them consistent.
with tf.variable_scope(self.scope + 'embedding'):
# NOTE(review): embeddings are initialized to all ones, so every token starts
# with an identical vector — confirm this is intentional (random init is the
# usual choice).
self.input_embedding = tf.Variable(tf.ones((self.vocab_size, self.embed_size)))
self.output_embedding = tf.Variable(tf.ones((self.vocab_size, self.embed_size)))
input_embed = tf.nn.embedding_lookup(self.input_embedding, self.inputs)
output_embed = tf.nn.embedding_lookup(self.output_embedding, self.outputs)
# encoder
# Two stacked LSTM cells; only the final state is kept, to seed the decoder.
with tf.variable_scope(self.scope + '-encoder'):
lstm_enc_1 = tf.contrib.rnn.LSTMCell(self.hidden_size, reuse=tf.AUTO_REUSE)
lstm_enc_2 = tf.contrib.rnn.LSTMCell(self.hidden_size, reuse=tf.AUTO_REUSE)
_, last_state = tf.nn.dynamic_rnn(tf.contrib.rnn.MultiRNNCell(cells=[lstm_enc_1, lstm_enc_2]), inputs=input_embed, dtype=tf.float32)
# decoder
# Same stacked-LSTM shape as the encoder, started from the encoder's state.
with tf.variable_scope(self.scope + '-decoder'):
lstm_dec_1 = tf.contrib.rnn.LSTMCell(self.hidden_size, reuse=tf.AUTO_REUSE)
lstm_dec_2 = tf.contrib.rnn.LSTMCell(self.hidden_size, reuse=tf.AUTO_REUSE)
dec_outputs, _ = tf.nn.dynamic_rnn(tf.contrib.rnn.MultiRNNCell(cells=[lstm_dec_1, lstm_dec_2]), inputs=output_embed, initial_state=last_state, dtype=tf.float32)
# NOTE(review): scope='fully_connected' is not prefixed with self.scope and
# is created with reuse=tf.AUTO_REUSE. The flattened indentation of this
# paste makes it impossible to tell whether this call sits inside the
# '-decoder' scope; if it does not, any second model instance built in the
# same graph would silently SHARE these projection weights — a plausible way
# to end up with "some weights restored correctly, some not". Prefix it like
# the other scopes (note: doing so renames the checkpoint keys, so old
# checkpoints would no longer match).
self.logits = tf.contrib.layers.fully_connected(dec_outputs, num_outputs=self.vocab_size, activation_fn=None, reuse=tf.AUTO_REUSE, scope='fully_connected')
# loss and optimizer
with tf.variable_scope(self.scope + '-optimizing'):
# NOTE(review): the loss weights are fixed at [batch_size, input_length],
# while `targets` is declared [None, None] — this only works if the decoder
# sequence length always equals self.input_length; confirm against callers.
self.loss = tf.contrib.seq2seq.sequence_loss(self.logits, self.targets, tf.ones([self.batch_size, self.input_length]))
# RMSProp creates per-variable slot variables; they are included in
# tf.global_variables() and therefore end up in the checkpoint as well.
self.optimizer = tf.train.RMSPropOptimizer(0.001).minimize(self.loss)
And I'm using this function to save the weights while training:
self.saver.save(self.sess, self.checkpoint_path)