Dear Stack Overflow members,

I am currently trying to implement my own Keras Tuner training loop. In this loop I want to pass the input variable through the model multiple times, for example:

Y = start_value
for i in range(k):
    Y = model(Y)
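
In other words, the unrolled pass would sit under one GradientTape, roughly like this (start_value, target, and k are placeholder names, not part of my code below):

with tf.GradientTape() as tape:
    Y = start_value
    for i in range(k):
        Y = model(Y)  # feed each output back in as the next input
    loss = tf.keras.losses.MeanSquaredError()(target, Y)
gradients = tape.gradient(loss, model.trainable_variables)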

I want to see whether this method produces more stable simulations for my self-feedback problem. When I implement it, I get an OOM error even when I do not loop; the error does not occur when I just run the model normally. My class example is below (the OOM error occurs when I switch logits for logits2):

import tensorflow as tf
import keras_tuner as kt  # on older versions: import kerastuner as kt

class MyTuner(kt.Tuner):
    def run_trial(self, trial, train_ds, validation_data):

        model = self.hypermodel.build(trial.hyperparameters)

        optimizer = tf.keras.optimizers.Adam()
        epoch_loss_metric = tf.keras.metrics.MeanSquaredError()

        def microbatch(T_IN, A_IN, D_IN):
            # Run the model on each sample of the batch individually,
            # then stack the per-sample outputs back into batch tensors.
            OUT_T = []
            OUT_A = []
            for i in range(len(T_IN)):
                A_IN_R = tf.expand_dims(tf.squeeze(A_IN[i]), 0)
                T_IN_R = tf.expand_dims(tf.squeeze(T_IN[i]), 0)
                D_IN_R = tf.expand_dims(tf.squeeze(D_IN[i]), 0)
                (OUT_T_R, OUT_A_R) = model((A_IN_R, T_IN_R, D_IN_R))
                OUT_T.append(tf.squeeze(OUT_T_R))
                OUT_A.append(tf.squeeze(OUT_A_R))
            return (tf.squeeze(tf.stack(OUT_T)), tf.squeeze(tf.stack(OUT_A)))

        def run_train_step(data):
            T_IN = tf.dtypes.cast(data[0][0], 'float32')
            A_IN = tf.dtypes.cast(data[0][1], 'float32')
            D_IN = tf.dtypes.cast(data[0][2], 'float32')
            A_Ta = tf.dtypes.cast(data[1][0], 'float32')
            T_Ta = tf.dtypes.cast(data[1][1], 'float32')
            mse = tf.keras.losses.MeanSquaredError()

            with tf.GradientTape() as tape:
                # Per-sample pass; using this output (logits2) in the
                # loss is what triggers the OOM.
                logits2 = microbatch(T_IN, A_IN, D_IN)

                # Plain batched pass; using this output works fine.
                logits = model([A_IN, T_IN, D_IN])
                loss   = mse((T_Ta, A_Ta), logits2)
                # Add any regularization losses.
                if model.losses:
                    loss += tf.math.add_n(model.losses)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            epoch_loss_metric.update_state((T_Ta, A_Ta), logits2)
            return loss

        for epoch in range(1000):
            print('Epoch: {}'.format(epoch))

            self.on_epoch_begin(trial, model, epoch, logs={})
            for batch, data in enumerate(train_ds):
                self.on_batch_begin(trial, model, batch, logs={})
                batch_loss = float(run_train_step(data))
                self.on_batch_end(trial, model, batch, logs={'loss': batch_loss})

                if batch % 100 == 0:
                    loss = epoch_loss_metric.result().numpy()
                    print('Batch: {}, Average Loss: {}'.format(batch, loss))

            epoch_loss = epoch_loss_metric.result().numpy()
            self.on_epoch_end(trial, model, epoch, logs={'loss': epoch_loss})
            epoch_loss_metric.reset_states()

1 Answer

In my understanding, the microbatch function does not implement a self-feedback loop (though that does not affect the OOM).

My guess is that because you compute the output of the network k times under one GradientTape, memory consumption grows roughly k-fold, since the tape has to store the intermediate tensors of every pass for backprop.

What you can do is backpropagate the gradients at each self-feedback step, so that the intermediate tensors are released and memory does not grow beyond the limit.
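
A minimal sketch of that idea, assuming a single-input model and the hypothetical names start_value, target, and num_passes (the detach via tf.stop_gradient is my addition, to keep each pass's graph separate):

import tensorflow as tf

mse = tf.keras.losses.MeanSquaredError()

Y = start_value
for i in range(num_passes):
    with tf.GradientTape() as tape:  # fresh tape per pass
        Y_next = model(Y)
        loss = mse(target, Y_next)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # Detach so the next pass does not extend this pass's graph.
    Y = tf.stop_gradient(Y_next)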

Let me know if you have any doubts.

– Karan Dhingra
Thank you for your answer. Yes, currently there is no loop implementation, but the current implementation is the basis for one. I will try your approach. – Christian Pommer Feb 18 '21 at 14:34