I'm having trouble training a seq2seq model with TensorFlow on an NVIDIA P100 GPU. Here are the versions I'm using: TensorFlow 1.10.0, Keras 2.2.2, Python 3.6.3, CUDA 9.2.148.1, cuDNN 7.2.1.
I currently get an OOM error well into training (about 18 minutes in).
I've been doing a little digging and tried setting allow_growth = True (the flag is not set in the code below), but the memory never grows gradually; it all gets allocated at the start.
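For reference, setting the flag looks roughly like this (it's not in the code below):

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # let the GPU allocator grow on demand instead of grabbing everything upfront
train_sess = tf.Session(graph=train_graph, config=config)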
I also tried setting the graph to read-only with tf.Graph.finalize(), but the program still runs, which suggests that either no nodes are being added or I haven't placed the call in the right spot in the code.
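In case it matters, this is roughly where I think the call has to go (right after the graph is fully built):

with train_graph.as_default():
    # ... build the whole graph, ending with ...
    init_op = tf.global_variables_initializer()
train_graph.finalize()  # should raise a RuntimeError if any op is added to the graph afterwards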
Since the graph trains and can be saved, it doesn't seem to be too large at the start.
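One sanity check I can run (just a hypothetical check on my side) is to count the ops in the graph; if this number keeps growing between training steps, new nodes are being added at run time:

# Hypothetical sanity check: the op count should stay constant across training steps
print("ops in train_graph:", len(train_graph.get_operations()))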
Here are some of the hyperparameters I'm using:
batch_size = 10
rnn_size = 1024
src/tgt_vocab_size = 8000
epochs = 20
display_step = 10
The dataset consists of docstrings and the associated function code, so the sequences are not incredibly long. One of my original thoughts was that, since the sentence sizes are dynamic, one really long one could be too big. But I shuffled the dataset to see if the crash happened at a different time, and it still crashes at 18 minutes with the same parameters.
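For what it's worth, a quick check like this on the raw sentence lists (assuming train_source_sents / train_target_sents are lists of token-ID lists) shows how long the sequences get:

# Quick look at the sequence length distribution of the raw sentence lists
src_lens = [len(s) for s in train_source_sents]
tgt_lens = [len(t) for t in train_target_sents]
print("max src/tgt length:", max(src_lens), max(tgt_lens))
print("mean src/tgt length:", sum(src_lens) / len(src_lens), sum(tgt_lens) / len(tgt_lens))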
Here is the code including the graphs and the training/testing loop.
def train(...):
    ...
    ...
    def source_generator():
        for el in train_source_sents:
            yield el

    def target_generator():
        for el in train_target_sents:
            yield el
    train_graph = tf.Graph()
    with train_graph.as_default():
        # Dataset and batch preparation
        with tf.name_scope("Dataset-prep"):
            source_dataset = tf.data.Dataset.from_generator(source_generator, output_types=tf.int32, output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
            target_dataset = tf.data.Dataset.from_generator(target_generator, output_types=tf.int32, output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
            target_dataset = target_dataset.map(lambda x: tf.concat([x, [target_tok2ID['<EOS>']]], 0)) # Adding the <EOS> token to the end of all the target sentences
            target_input_dataset = target_dataset.map(lambda x: tf.concat([[target_tok2ID['<GO>']], x], 0)) # Creating training inputs for the decoder; this requires adding <GO> to the start of the sequence
            target_sequence_length = target_dataset.map(lambda x: tf.shape(x)[0]) # Adding the sizes of all the sequences to be paired up with the rest of the dataset
            dataset = tf.data.Dataset.zip((source_dataset, target_dataset, target_input_dataset, target_sequence_length)) # Create the collection of all the individual datasets
            dataset = dataset.shuffle(buffer_size=10000)

        with tf.name_scope("Dataset-batching"):
            dataset = dataset.repeat(epochs)
            pad_id = target_tok2ID['<PAD>']
            batched_dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None], [None], []), padding_values=(pad_id, pad_id, pad_id, pad_id))
            batched_dataset = batched_dataset.prefetch(buffer_size=batch_size) # Could be removed; perhaps yields improvements
            iterator = batched_dataset.make_one_shot_iterator()
            source_batch, target_batch, target_input_batch, batch_target_sequence_length = iterator.get_next()
with tf.name_scope("Encoding-layer"):
source_vocab_size = len(source_tok2ID)
embed = tf.contrib.layers.embed_sequence(source_batch, vocab_size=source_vocab_size, embed_dim=encoding_embedding_size)
stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
outputs, encoder_state = tf.nn.dynamic_rnn(stacked_cells, embed, dtype=tf.float32)
with tf.name_scope("Decoding-layer"):
target_vocab_size = len(target_tok2ID)
dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, target_input_batch)
cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
output_layer = tf.layers.Dense(target_vocab_size)
dec_cell = tf.contrib.rnn.DropoutWrapper(cells, output_keep_prob=keep_prob)
helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, batch_target_sequence_length)
decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, encoder_state, output_layer)
max_target_sequence_length = tf.reduce_max(batch_target_sequence_length, axis=0)
dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
dec_outputs = tf.identity(dec_outputs.rnn_output, name='logits') # This step might seem a little misterious, but it is to take the output from the dynamic decoder and pass it to a tensor from (Dodumentation is scarce here)
        masks = tf.sequence_mask(batch_target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
        cost = tf.contrib.seq2seq.sequence_loss(dec_outputs, target_batch, masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

        tf.summary.scalar('cost', cost)
        merged = tf.summary.merge_all()
        train_saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()
    train_sess = tf.Session(graph=train_graph)
    if model_path:
        print(model_path)
        train_saver.restore(train_sess, model_path)
    else:
        train_sess.run(init_op)

    store_path = "./visualisations/"
    writer = tf.summary.FileWriter(store_path, train_sess.graph)
    step = 0
    start_time = datetime.now()
    while True:
        try:
            step += 1
            model_cost, _, summary = train_sess.run((cost, train_op, merged))
            writer.add_summary(summary, step)

            if step % display_step == 0:
                save_path = train_saver.save(train_sess, "./checkpoints/NMT")
                print("Model saved in path: %s" % save_path)

                test_sess = tf.Session(graph=test_graph)
                test_saver.restore(test_sess, save_path)
                # print(test_sess.run(foo))

                scores = []
                while True:
                    try:
                        predictions, reference = test_sess.run([dec_predictions, test_target_batch])
                        for i in range(len(reference)):
                            BLEU_score = nltk.translate.bleu_score.sentence_bleu([np.trim_zeros(reference[i])], np.trim_zeros(predictions[i]), weights=(1, 0))
                            scores.append(BLEU_score)
                            print("########################################")
                            print("ref:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(reference[i]))))
                            print("")
                            print("pred:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(predictions[i]))))
                    except tf.errors.OutOfRangeError:
                        print("Exhausted test data")
                        break

                delta_time = datetime.now() - start_time
                total_exp_time = delta_time * (total_steps / step)
                remaining_time = total_exp_time - delta_time

                print("")
                print("Test set BLEU Score:", np.mean(scores))
                print("Model cost:", model_cost)
                print("Step {} from {}".format(step, total_steps))
                print("Current time:", datetime.now())
                print("Total Experiment time (Hours:Minutes:Seconds):", str(total_exp_time))
                print("Time Elapsed (Hours:Minutes:Seconds):", str(delta_time))
                print("Time remaining (Hours:Minutes:Seconds):", str(remaining_time))
                print("")
        except tf.errors.OutOfRangeError:
            print("Model finished training")
            break

    return save_path
Here is an output of a training run: command line output
Is there something wrong with the way I'm executing the graph, or am I repeating some step that leads to the memory filling up?
Thanks for all your help!