I'm having trouble training a seq2seq model with TensorFlow on an NVIDIA P100 GPU. Here are the versions I'm using: TensorFlow 1.10.0, Keras 2.2.2, Python 3.6.3, CUDA 9.2.148.1, cuDNN 7.2.1.
I currently get an OOM error well into training (about 18 minutes in).
I've been doing a little digging and tried setting allow_growth = True (the flag is not set in the code below), but the memory never grows gradually; it all gets allocated at the start.
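For reference, setting the flag looks roughly like this (it's not in the code below):

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # let the GPU allocator grow on demand instead of grabbing everything upfront
train_sess = tf.Session(graph=train_graph, config=config)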
I also tried setting the graph to read-only with tf.Graph.finalize(), but the program still runs, which suggests that either no nodes are being added or I haven't placed the call in the right spot in the code.
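In case it matters, this is roughly where I think the call has to go (right after the graph is fully built):

with train_graph.as_default():
    # ... build the whole graph, ending with ...
    init_op = tf.global_variables_initializer()
train_graph.finalize()  # should raise a RuntimeError if any op is added to the graph afterwards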
Since the graph trains and can be saved, it doesn't seem to be too large at the start.
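One sanity check I can run (just a hypothetical check on my side) is to count the ops in the graph; if this number keeps growing between training steps, new nodes are being added at run time:

# Hypothetical sanity check: the op count should stay constant across training steps
print("ops in train_graph:", len(train_graph.get_operations()))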
Here are some of the hyperparameters I'm using:
batch_size = 10
rnn_size = 1024
src/tgt_vocab_size = 8000
epochs = 20
display_step = 10
The dataset consists of docstrings and the associated function code, so the sequences are not incredibly long. One of my original thoughts was that, since the sentence sizes are dynamic, one really long one could be too big. But I shuffled the dataset to see if the crash happened at a different time, and it still crashes at 18 minutes with the same parameters.
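For what it's worth, a quick check like this on the raw sentence lists (assuming train_source_sents / train_target_sents are lists of token-ID lists) shows how long the sequences get:

# Quick look at the sequence length distribution of the raw sentence lists
src_lens = [len(s) for s in train_source_sents]
tgt_lens = [len(t) for t in train_target_sents]
print("max src/tgt length:", max(src_lens), max(tgt_lens))
print("mean src/tgt length:", sum(src_lens) / len(src_lens), sum(tgt_lens) / len(tgt_lens))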
Here is the code including the graphs and the training/testing loop.
def train(...):
    ...
    ...
    def source_generator():
        for el in train_source_sents:
            yield el

    def target_generator():
        for el in train_target_sents:
            yield el
    train_graph = tf.Graph()
    with train_graph.as_default():
        # Dataset and batch preparation
        with tf.name_scope("Dataset-prep"):
            source_dataset = tf.data.Dataset.from_generator(source_generator, output_types=tf.int32, output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
            target_dataset = tf.data.Dataset.from_generator(target_generator, output_types=tf.int32, output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
            target_dataset = target_dataset.map(lambda x: tf.concat([x, [target_tok2ID['<EOS>']]], 0)) # Adding the <EOS> token to the end of all the target sentences
            target_input_dataset = target_dataset.map(lambda x: tf.concat([[target_tok2ID['<GO>']], x], 0)) # Creating training inputs for the decoder; this requires adding <GO> to the start of the sequence
            target_sequence_length = target_dataset.map(lambda x: tf.shape(x)[0]) # Adding the sizes of all the sequences to be paired up with the rest of the dataset
            dataset = tf.data.Dataset.zip((source_dataset, target_dataset, target_input_dataset, target_sequence_length)) # Create the collection of all the individual datasets
            dataset = dataset.shuffle(buffer_size=10000)

        with tf.name_scope("Dataset-batching"):
            dataset = dataset.repeat(epochs)
            pad_id = target_tok2ID['<PAD>']
            batched_dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None], [None], []), padding_values=(pad_id, pad_id, pad_id, pad_id))
            batched_dataset = batched_dataset.prefetch(buffer_size=batch_size) # Could be removed; perhaps yields improvements
            iterator = batched_dataset.make_one_shot_iterator()
            source_batch, target_batch, target_input_batch, batch_target_sequence_length = iterator.get_next()
with tf.name_scope("Encoding-layer"):
source_vocab_size = len(source_tok2ID)
embed = tf.contrib.layers.embed_sequence(source_batch, vocab_size=source_vocab_size, embed_dim=encoding_embedding_size)
stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
outputs, encoder_state = tf.nn.dynamic_rnn(stacked_cells, embed, dtype=tf.float32)
with tf.name_scope("Decoding-layer"):
target_vocab_size = len(target_tok2ID)
dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, target_input_batch)
cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
output_layer = tf.layers.Dense(target_vocab_size)
dec_cell = tf.contrib.rnn.DropoutWrapper(cells, output_keep_prob=keep_prob)
helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, batch_target_sequence_length)
decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, encoder_state, output_layer)
max_target_sequence_length = tf.reduce_max(batch_target_sequence_length, axis=0)
dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
dec_outputs = tf.identity(dec_outputs.rnn_output, name='logits') # This step might seem a little misterious, but it is to take the output from the dynamic decoder and pass it to a tensor from (Dodumentation is scarce here)
        masks = tf.sequence_mask(batch_target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
        cost = tf.contrib.seq2seq.sequence_loss(dec_outputs, target_batch, masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

        tf.summary.scalar('cost', cost)
        merged = tf.summary.merge_all()
        train_saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()
    train_sess = tf.Session(graph=train_graph)
    if model_path:
        print(model_path)
        train_saver.restore(train_sess, model_path)
    else:
        train_sess.run(init_op)

    store_path = "./visualisations/"
    writer = tf.summary.FileWriter(store_path, train_sess.graph)
    step = 0
    start_time = datetime.now()
    while True:
        try:
            step += 1
            model_cost, _, summary = train_sess.run((cost, train_op, merged))
            writer.add_summary(summary, step)

            if step % display_step == 0:
                save_path = train_saver.save(train_sess, "./checkpoints/NMT")
                print("Model saved in path: %s" % save_path)

                test_sess = tf.Session(graph=test_graph)
                test_saver.restore(test_sess, save_path)
                # print(test_sess.run(foo))

                scores = []
                while True:
                    try:
                        predictions, reference = test_sess.run([dec_predictions, test_target_batch])
                        for i in range(len(reference)):
                            BLEU_score = nltk.translate.bleu_score.sentence_bleu([np.trim_zeros(reference[i])], np.trim_zeros(predictions[i]), weights=(1, 0))
                            scores.append(BLEU_score)
                            print("########################################")
                            print("ref:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(reference[i]))))
                            print("")
                            print("pred:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(predictions[i]))))
                    except tf.errors.OutOfRangeError:
                        print("Exhausted test data")
                        break

                delta_time = datetime.now() - start_time
                total_exp_time = delta_time * (total_steps / step)
                remaining_time = total_exp_time - delta_time

                print("")
                print("Test set BLEU Score:", np.mean(scores))
                print("Model cost:", model_cost)
                print("Step {} from {}".format(step, total_steps))
                print("Current time:", datetime.now())
                print("Total Experiment time (Hours:Minutes:Seconds):", str(total_exp_time))
                print("Time Elapsed (Hours:Minutes:Seconds):", str(delta_time))
                print("Time remaining (Hours:Minutes:Seconds):", str(remaining_time))
                print("")
        except tf.errors.OutOfRangeError:
            print("Model finished training")
            break

    return save_path
Here is an output of a training run: command line output
Is there something wrong with the way I'm executing the graph, or am I repeating some step that leads to the memory filling up?
Thanks for all your help!