Tensorflow 2 Metrics produce wrong results with 2 GPUs

Question

I took this piece of code from the TensorFlow documentation on distributed training with a custom training loop (https://www.tensorflow.org/tutorials/distribute/custom_training), adapted it to work with tf.keras.metrics.AUC, and ran it on 2 GPUs (two Nvidia V100s in a DGX machine).

# Import TensorFlow
import tensorflow as tf

# Helper libraries
import numpy as np


# Report the TensorFlow version in use.
print(tf.__version__)


# Load the Fashion-MNIST train and test splits as (images, labels) pairs.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = (
    fashion_mnist.load_data())

# Append a trailing channel axis so each image has shape (28, 28, 1): the
# first layer of the model is convolutional and requires 4D input
# (batch_size, height, width, channels). The batch_size axis is added later.
train_images = np.expand_dims(train_images, axis=-1)
test_images = np.expand_dims(test_images, axis=-1)

# One-hot encode the labels over 10 classes.
train_labels = tf.keras.utils.to_categorical(train_labels, num_classes=10)
test_labels = tf.keras.utils.to_categorical(test_labels, num_classes=10)

# Rescale the images into the [0, 1] range.
train_images = np.true_divide(train_images, np.float32(255))
test_images = np.true_divide(test_images, np.float32(255))

# The GPUs to mirror across are listed explicitly here. If `devices` is not
# passed to the `tf.distribute.MirroredStrategy` constructor, the devices
# are auto-detected instead.
GPUS = [0, 1]
devices = ["/gpu:{}".format(gpu_id) for gpu_id in GPUS]
strategy = tf.distribute.MirroredStrategy(devices=devices)

print('Number of devices:', strategy.num_replicas_in_sync)


# The shuffle buffer is as large as the training set.
BUFFER_SIZE = len(train_images)

# Global batch size = per-replica batch size x number of replicas in sync.
BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA

EPOCHS = 10


# Build each input pipeline one stage at a time. Only the training data is
# shuffled; both datasets are batched by the global batch size.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_images, train_labels))
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(GLOBAL_BATCH_SIZE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
test_dataset = test_dataset.batch(GLOBAL_BATCH_SIZE)

# Hand both datasets to the strategy for distribution.
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)


def create_model():
  """Build the image classifier used by the training script.

  The network is two Conv2D + MaxPooling2D stages, a Flatten layer, a
  64-unit ReLU Dense layer and a 10-unit softmax Dense output layer.

  Returns:
    A new, uncompiled `tf.keras.Sequential` model.
  """
  layers = tf.keras.layers
  return tf.keras.Sequential([
      layers.Conv2D(filters=32, kernel_size=3, activation='relu'),
      layers.MaxPooling2D(),
      layers.Conv2D(filters=64, kernel_size=3, activation='relu'),
      layers.MaxPooling2D(),
      layers.Flatten(),
      layers.Dense(units=64, activation='relu'),
      layers.Dense(units=10, activation='softmax'),
  ])


with strategy.scope():
  # Set reduction to `none` so we can do the reduction afterwards and divide by
  # global batch size.
  #
  # `from_logits` must be False: the last layer built by `create_model`
  # already applies softmax, so the predictions are probabilities. Passing
  # `from_logits=True` would apply softmax a second time inside the loss,
  # which flattens the predicted distribution and keeps the loss from ever
  # approaching zero.
  loss_object = tf.keras.losses.CategoricalCrossentropy(
      from_logits=False,
      reduction=tf.keras.losses.Reduction.NONE)

  def compute_loss(labels, predictions):
    """Return the per-example losses scaled by the global batch size.

    Args:
      labels: One-hot targets for the examples in `predictions`.
      predictions: Model outputs (softmax probabilities) for the same
        examples.

    Returns:
      The result of `tf.nn.compute_average_loss`: the per-example losses
      summed and divided by `GLOBAL_BATCH_SIZE`, so that summing this value
      over all replicas yields the mean loss of the global batch.
    """
    per_example_loss = loss_object(labels, predictions)
    return tf.nn.compute_average_loss(
        per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)


# Create the metrics, the model and the optimizer inside one
# `strategy.scope()` block, in the same order as before.
with strategy.scope():
  # Running mean of the test loss.
  test_loss = tf.keras.metrics.Mean(name='test_loss')

  # Accuracy on one-hot labels, tracked separately for train and test.
  train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
  test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

  # AUC, tracked separately for train and test.
  train_auc = tf.keras.metrics.AUC(name='train_auc')
  test_auc = tf.keras.metrics.AUC(name='test_auc')

  # model, optimizer, and checkpoint must be created under `strategy.scope`.
  model = create_model()
  optimizer = tf.keras.optimizers.Adam()


def train_step(inputs):
  """Run one optimisation step on a batch and update the training metrics.

  Args:
    inputs: An `(images, labels)` pair.

  Returns:
    The loss returned by `compute_loss` for this batch.
  """
  images, labels = inputs

  # Record the forward pass so the loss can be differentiated with respect
  # to the model's trainable variables.
  with tf.GradientTape() as tape:
    predictions = model(images, training=True)
    loss = compute_loss(labels, predictions)

  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  # Only accumulate metric state here. Calling the metric object directly
  # would additionally compute `result()` on every step, and that value was
  # never used; results are read once per epoch by the caller.
  train_accuracy.update_state(labels, predictions)
  train_auc.update_state(labels, predictions)
  return loss

def test_step(inputs):
  """Evaluate the model on a batch and update the test metrics.

  Args:
    inputs: An `(images, labels)` pair.

  Returns:
    None. The outcome is recorded in `test_loss`, `test_accuracy` and
    `test_auc`.
  """
  images, labels = inputs

  predictions = model(images, training=False)
  # `loss_object` uses `Reduction.NONE`, so this is one loss per example;
  # the `test_loss` Mean metric does the averaging.
  t_loss = loss_object(labels, predictions)

  # Only accumulate metric state here. Calling the metric object directly
  # would additionally compute `result()` on every step, and that value was
  # never used; results are read once per epoch by the caller.
  test_loss.update_state(t_loss)
  test_accuracy.update_state(labels, predictions)
  test_auc.update_state(labels, predictions)


# `run` replicates the provided computation and runs it
# with the distributed input.
@tf.function
def distributed_train_step(dataset_inputs):
  """Run `train_step` through the strategy and sum the per-replica losses.

  Args:
    dataset_inputs: One element taken from the distributed training dataset.

  Returns:
    The per-replica losses reduced with `ReduceOp.SUM` (`axis=None`).
  """
  replica_losses = strategy.run(train_step, args=(dataset_inputs,))
  summed_loss = strategy.reduce(
      tf.distribute.ReduceOp.SUM, replica_losses, axis=None)
  return summed_loss

@tf.function
def distributed_test_step(dataset_inputs):
  """Run `test_step` through the strategy on one distributed batch.

  Args:
    dataset_inputs: One element taken from the distributed test dataset.

  Returns:
    Whatever `strategy.run` returns for `test_step`.
  """
  step_outputs = strategy.run(test_step, args=(dataset_inputs,))
  return step_outputs


# Placeholders, in order: epoch, train loss / accuracy / AUC, then
# test loss / accuracy / AUC.
template = ("Epoch {}, Loss: {}, Accuracy: {}, AUC: {},"
            "Test Loss: {}, Test Accuracy: {}, Test AUC: {}")

# Metrics whose state is cleared at the end of every epoch, in reset order.
epoch_metrics = (test_loss, train_accuracy, test_accuracy,
                 train_auc, test_auc)

for epoch in range(EPOCHS):
  # TRAIN LOOP: add up the reduced loss of every step, then average it over
  # the number of batches seen.
  total_loss = 0.0
  num_batches = 0
  for num_batches, batch in enumerate(train_dist_dataset, start=1):
    total_loss += distributed_train_step(batch)
  train_loss = total_loss / num_batches

  # TEST LOOP: results are accumulated inside the test metrics.
  for batch in test_dist_dataset:
    distributed_test_step(batch)

  # Accuracy and AUC are scaled by 100 for display; losses are shown as-is.
  print(template.format(
      epoch + 1,
      train_loss,
      train_accuracy.result() * 100,
      train_auc.result() * 100,
      test_loss.result(),
      test_accuracy.result() * 100,
      test_auc.result() * 100))

  # Clear the accumulated state before the next epoch.
  for metric in epoch_metrics:
    metric.reset_states()

The problem is that the AUC evaluation is definitely wrong, because it exceeds its range (it should be between 0 and 100). I get these results from a single run of the above code:

Epoch 1, Loss: 1.8061423301696777, Accuracy: 66.00833892822266, AUC: 321.8688659667969,Test Loss: 1.742477536201477, Test Accuracy: 72.0999984741211, Test AUC: 331.33709716796875
Epoch 2, Loss: 1.7129968404769897, Accuracy: 74.9816665649414, AUC: 337.37017822265625,Test Loss: 1.7084736824035645, Test Accuracy: 75.52999877929688, Test AUC: 337.1878967285156
Epoch 3, Loss: 1.643971562385559, Accuracy: 81.83333587646484, AUC: 355.96209716796875,Test Loss: 1.6072628498077393, Test Accuracy: 85.3499984741211, Test AUC: 370.603759765625
Epoch 4, Loss: 1.5887378454208374, Accuracy: 87.27833557128906, AUC: 373.6204528808594,Test Loss: 1.5906082391738892, Test Accuracy: 87.13999938964844, Test AUC: 371.9998474121094
Epoch 5, Loss: 1.581775426864624, Accuracy: 88.0, AUC: 373.9468994140625,Test Loss: 1.5964380502700806, Test Accuracy: 86.68000030517578, Test AUC: 371.0227355957031
Epoch 6, Loss: 1.5764907598495483, Accuracy: 88.49166870117188, AUC: 375.2404479980469,Test Loss: 1.5832056999206543, Test Accuracy: 87.94000244140625, Test AUC: 373.41998291015625
Epoch 7, Loss: 1.5698528289794922, Accuracy: 89.19166564941406, AUC: 376.473876953125,Test Loss: 1.5770654678344727, Test Accuracy: 88.58000183105469, Test AUC: 375.5516662597656
Epoch 8, Loss: 1.564456820487976, Accuracy: 89.71833801269531, AUC: 377.8564758300781,Test Loss: 1.5792100429534912, Test Accuracy: 88.27000427246094, Test AUC: 373.1791687011719
Epoch 9, Loss: 1.5612279176712036, Accuracy: 90.02000427246094, AUC: 377.9949645996094,Test Loss: 1.5729509592056274, Test Accuracy: 88.9800033569336, Test AUC: 375.5257263183594
Epoch 10, Loss: 1.5562015771865845, Accuracy: 90.54000091552734, AUC: 378.9789123535156,Test Loss: 1.56815767288208, Test Accuracy: 89.3499984741211, Test AUC: 375.8636474609375

Accuracy is OK, but it seems to be the only metric that behaves correctly. I tried other metrics too, and they are not evaluated correctly either. The problem seems to appear only when using more than one GPU, because when I run this code with a single GPU it produces the right results.

score 0 · Answer 1 · edited Mar 01 '22 at 00:28

0

When you use a distribution strategy, the metric must be both constructed and used inside the strategy.scope() block. So when you call the metric.result() method, remember to do it inside a with strategy.scope() block:

# `metric` is a placeholder for any metric object that was itself created
# under `strategy.scope()`; its result is read inside the same scope.
with strategy.scope():
  print(metric.result())

edited Mar 01 '22 at 00:28

Jeremy Caney

7,102
69
48
77

answered Feb 28 '22 at 02:44

cui

1
2

Tensorflow 2 Metrics produce wrong results with 2 GPUs

1 Answers1