I am trying to convert a working CycleGAN from single-GPU training to tf.distribute.MirroredStrategy. I have tried several approaches: custom training loops, the suggestion from jongsung park's question, adjustments following the TensorFlow tutorial, and placing strategy.scope() in several different locations. Yet I still get the following error.
Exception: RuntimeError
in user code:
File "C:\Users\Einka\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1160, in train_function *
return step_function(self, iterator)
File "w:\300_Neural_Network\320_Unsupervised_GAN_CycleConsistency\CycleGAN_Custom_Trainingloop", line 490, in train_step *
G_loss, F_loss, F_X_loss, D_Y_loss = strategy.run(self.train_step_single, args=(self, batch_data))
RuntimeError: Method requires being in cross-replica context, use get_replica_context().merge_call()
File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
(G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)
During handling of the above exception, another exception occurred:
File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
(G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)
During handling of the above exception, another exception occurred:
File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_filejk9kpr6g.py", line 15, in tf__train_function
retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
(G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)
During handling of the above exception, another exception occurred:
File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
(G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)
File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_filejk9kpr6g.py", line 15, in tf__train_function
retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
File "W:\300_Neural_Network\320_Unsupervised_GAN_CycleConsistency\CycleGAN_Custom_Trainingloop", line 575, in <module>
cycle_gan_model.fit(
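For reference, this is the cross-replica pattern from the TensorFlow custom-training tutorial that I tried to follow. A minimal, self-contained sketch (replica_step and distributed_step are placeholder names, not from my code below): strategy.run has to be entered from a cross-replica context such as a manually driven loop, whereas Model.fit already wraps train_step in strategy.run.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

def replica_step(batch):
    # Per-replica computation; runs once on each device.
    return tf.reduce_mean(batch)

@tf.function
def distributed_step(batch):
    # Called from the loop below, i.e. from cross-replica context: OK.
    per_replica = strategy.run(replica_step, args=(batch,))
    # Combine the per-replica results into a single value.
    return strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica, axis=None)

dist_ds = strategy.experimental_distribute_dataset(
    tf.data.Dataset.from_tensor_slices(tf.ones((8, 4))).batch(2)
)
for batch in dist_ds:
    print(distributed_step(batch))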
The original model is the CycleGAN example from keras.io. The code posted below is also available on Colab.
import random
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
# CONFIGURATION
EPOCHS = 1
BATCH_SIZE_PER_REPLICA = 1
BUFFER_SIZE = 256
# TensorBoard
logdir = "W:/300_Neural_Network/320_Unsupervised_GAN_CycleConsistency/logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()
# ----
# Distribution strategy
# Note: the TF1-style ConfigProto/Session calls are no-ops under TF2 eager
# execution; memory growth is configured per GPU instead, and it must be set
# before the GPUs are initialized (i.e. before creating the strategy).
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)
strategy = tf.distribute.MirroredStrategy()
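# Optional sanity check from the TF tutorial: confirm how many replicas the
# MirroredStrategy actually picked up.
print("Number of replicas:", strategy.num_replicas_in_sync)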
# Visual parameters
tfds.disable_progress_bar()
# autotune = tf.data.AUTOTUNE
# Load the horse-zebra dataset using tensorflow-datasets.
dataset, _ = tfds.load("cycle_gan/horse2zebra", with_info=True, as_supervised=True)
train_horses, train_zebras = dataset["trainA"], dataset["trainB"]
test_horses, test_zebras = dataset["testA"], dataset["testB"]
# Define the standard image size.
orig_img_size = (286, 286)
# Size of the random crops to be used during training.
input_img_size = (256, 256, 3)
# Weights initializer for the layers.
# (random.seed() returns None, so the original seed=random.seed(random.random())
# silently passed seed=None; use an explicit integer seed instead.)
kernel_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02, seed=random.randint(0, 2**31 - 1))
# Gamma initializer for instance normalization.
gamma_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02, seed=random.randint(0, 2**31 - 1))
batch_size = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
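# Worked example: on a 2-GPU machine num_replicas_in_sync == 2, so the global
# batch_size == 1 * 2 == 2. Each replica still sees BATCH_SIZE_PER_REPLICA
# images per step; the datasets below must be batched with the GLOBAL size.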
def normalize_img(img):
    img = tf.cast(img, dtype=tf.float32)
    # Map values into the range [-1, 1]
    return (img / 127.5) - 1.0
def preprocess_train_image(img, label):
    # Random flip
    img = tf.image.random_flip_left_right(img)
    # Resize to the original size first
    img = tf.image.resize(img, [*orig_img_size])
    # Random crop to 256x256
    img = tf.image.random_crop(img, size=[*input_img_size])
    # Normalize the pixel values into the range [-1, 1]
    img = normalize_img(img)
    return img

def preprocess_test_image(img, label):
    # Only resizing and normalization for the test images.
    img = tf.image.resize(img, [input_img_size[0], input_img_size[1]])
    img = normalize_img(img)
    return img
def distribute_datasets(strategy, train_batches, test_batches):
    train_dist_dataset = strategy.experimental_distribute_dataset(train_batches)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_batches)
    return train_dist_dataset, test_dist_dataset
# Apply the preprocessing operations to the training data
train_horses = (
    train_horses.map(preprocess_train_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
    .prefetch(1)
)
train_zebras = (
    train_zebras.map(preprocess_train_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
    .prefetch(1)
)
# Apply the preprocessing operations to the test data
test_horses = (
    test_horses.map(preprocess_test_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
)
test_zebras = (
    test_zebras.map(preprocess_test_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
)
train_set, test_set = distribute_datasets(
    strategy,
    tf.data.Dataset.zip((train_horses, train_zebras)),
    tf.data.Dataset.zip((test_horses, test_zebras)),
)
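# Optional sanity check (sketch): each element of train_set is a
# (horses, zebras) pair; with more than one replica each component is a
# PerReplica value holding one per-GPU batch.
# for real_x, real_y in train_set:
#     print(real_x)  # PerReplica(...) on a multi-GPU machine
#     break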
class ReflectionPadding2D(layers.Layer):
    """Implements Reflection Padding as a layer.

    Args:
        padding(tuple): Amount of padding for the
            spatial dimensions.

    Returns:
        A padded tensor with the same type as the input tensor.
    """

    def __init__(self, padding=(1, 1), **kwargs):
        self.padding = tuple(padding)
        super(ReflectionPadding2D, self).__init__(**kwargs)

    def call(self, input_tensor, mask=None):
        padding_width, padding_height = self.padding
        padding_tensor = [
            [0, 0],
            [padding_height, padding_height],
            [padding_width, padding_width],
            [0, 0],
        ]
        return tf.pad(input_tensor, padding_tensor, mode="REFLECT")
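# Illustrative shape check: padding=(3, 3) grows each spatial dimension by
# 2 * 3, e.g. ReflectionPadding2D(padding=(3, 3))(tf.zeros((1, 256, 256, 3)))
# has shape (1, 262, 262, 3).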
def residual_block(
    x,
    activation,
    kernel_initializer=kernel_init,
    kernel_size=(3, 3),
    strides=(1, 1),
    padding="valid",
    gamma_initializer=gamma_init,
    use_bias=False,
):
    dim = x.shape[-1]
    input_tensor = x

    x = ReflectionPadding2D()(input_tensor)
    x = layers.Conv2D(
        dim,
        kernel_size,
        strides=strides,
        kernel_initializer=kernel_initializer,
        padding=padding,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    x = activation(x)

    x = ReflectionPadding2D()(x)
    x = layers.Conv2D(
        dim,
        kernel_size,
        strides=strides,
        kernel_initializer=kernel_initializer,
        padding=padding,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    x = layers.add([input_tensor, x])
    return x
def downsample(
    x,
    filters,
    activation,
    kernel_initializer=kernel_init,
    kernel_size=(3, 3),
    strides=(2, 2),
    padding="same",
    gamma_initializer=gamma_init,
    use_bias=False,
):
    x = layers.Conv2D(
        filters,
        kernel_size,
        strides=strides,
        kernel_initializer=kernel_initializer,
        padding=padding,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    if activation:
        x = activation(x)
    return x

def upsample(
    x,
    filters,
    activation,
    kernel_size=(3, 3),
    strides=(2, 2),
    padding="same",
    kernel_initializer=kernel_init,
    gamma_initializer=gamma_init,
    use_bias=False,
):
    x = layers.Conv2DTranspose(
        filters,
        kernel_size,
        strides=strides,
        padding=padding,
        kernel_initializer=kernel_initializer,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    if activation:
        x = activation(x)
    return x
def get_resnet_generator(
    filters=64,
    num_downsampling_blocks=2,
    num_residual_blocks=9,
    num_upsample_blocks=2,
    gamma_initializer=gamma_init,
    name=None,
):
    img_input = layers.Input(shape=input_img_size, name=name + "_img_input")
    x = ReflectionPadding2D(padding=(3, 3))(img_input)
    x = layers.Conv2D(filters, (7, 7), kernel_initializer=kernel_init, use_bias=False)(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    x = layers.Activation("relu")(x)

    # Downsampling
    for _ in range(num_downsampling_blocks):
        filters *= 2
        x = downsample(x, filters=filters, activation=layers.Activation("relu"))

    # Residual blocks
    for _ in range(num_residual_blocks):
        x = residual_block(x, activation=layers.Activation("relu"))

    # Upsampling
    for _ in range(num_upsample_blocks):
        filters //= 2
        x = upsample(x, filters, activation=layers.Activation("relu"))

    # Final block
    x = ReflectionPadding2D(padding=(3, 3))(x)
    x = layers.Conv2D(3, (7, 7), padding="valid")(x)
    x = layers.Activation("tanh")(x)

    model = keras.models.Model(img_input, x, name=name)
    return model
def get_discriminator(
    filters=64, kernel_initializer=kernel_init, num_downsampling=3, name=None
):
    img_input = layers.Input(shape=input_img_size, name=name + "_img_input")
    x = layers.Conv2D(
        filters,
        (4, 4),
        strides=(2, 2),
        padding="same",
        kernel_initializer=kernel_initializer,
    )(img_input)
    x = layers.LeakyReLU(0.2)(x)

    num_filters = filters
    # Use the num_downsampling parameter (the original hard-coded range(3)).
    for num_downsample_block in range(num_downsampling):
        num_filters *= 2
        if num_downsample_block < 2:
            x = downsample(
                x,
                filters=num_filters,
                activation=layers.LeakyReLU(0.2),
                kernel_size=(4, 4),
                strides=(2, 2),
            )
        else:
            x = downsample(
                x,
                filters=num_filters,
                activation=layers.LeakyReLU(0.2),
                kernel_size=(4, 4),
                strides=(1, 1),
            )

    x = layers.Conv2D(
        1, (4, 4), strides=(1, 1), padding="same", kernel_initializer=kernel_initializer
    )(x)

    model = keras.models.Model(inputs=img_input, outputs=x, name=name)
    return model
# Get the generators
gen_G = get_resnet_generator(name="generator_G")
gen_F = get_resnet_generator(name="generator_F")
# Get the discriminators
disc_X = get_discriminator(name="discriminator_X")
disc_Y = get_discriminator(name="discriminator_Y")
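# Illustrative shape check: both generators map (None, 256, 256, 3) images to
# (None, 256, 256, 3) images, while each discriminator is a PatchGAN whose
# output should be a (None, 32, 32, 1) map of per-patch real/fake scores
# (256 -> 128 -> 64 -> 32 through the stride-2 convolutions above).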
class CycleGan(keras.Model):
    def __init__(
        self,
        generator_G,
        generator_F,
        discriminator_X,
        discriminator_Y,
        lambda_cycle=10.0,
        lambda_identity=0.5,
    ):
        super(CycleGan, self).__init__()
        self.gen_G = generator_G
        self.gen_F = generator_F
        self.disc_X = discriminator_X
        self.disc_Y = discriminator_Y
        self.lambda_cycle = lambda_cycle
        self.lambda_identity = lambda_identity

    def compile(
        self,
        gen_G_optimizer,
        gen_F_optimizer,
        disc_X_optimizer,
        disc_Y_optimizer,
        gen_loss_fn,
        disc_loss_fn,
        cycle_loss_fn,
        identity_loss_fn,
    ):
        super(CycleGan, self).compile()
        self.gen_G_optimizer = gen_G_optimizer
        self.gen_F_optimizer = gen_F_optimizer
        self.disc_X_optimizer = disc_X_optimizer
        self.disc_Y_optimizer = disc_Y_optimizer
        self.generator_loss_fn = gen_loss_fn
        self.discriminator_loss_fn = disc_loss_fn
        self.cycle_loss_fn = cycle_loss_fn
        self.identity_loss_fn = identity_loss_fn

    # Note: the original also defined an identical __call__ override;
    # overriding __call__ bypasses Keras' own call machinery, so defining
    # call() alone is sufficient.
    def call(self, batch_data):
        real_x, real_y = batch_data
        return self.gen_G(real_x), self.gen_F(real_y)

    # The original was missing the self parameter here.
    def compute_output_shape(self, input_shape=(None, 256, 256, 3)):
        return input_shape
    def train_step_single(self, batch_data):
        # x is Horse and y is zebra
        real_x, real_y = batch_data
        with tf.GradientTape(persistent=True) as tape:
            # Horse to fake zebra
            fake_y = self.gen_G(real_x, training=True)
            # Zebra to fake horse -> y2x
            fake_x = self.gen_F(real_y, training=True)
            # Cycle (Horse to fake zebra to fake horse): x -> y -> x
            cycled_x = self.gen_F(fake_y, training=True)
            # Cycle (Zebra to fake horse to fake zebra): y -> x -> y
            cycled_y = self.gen_G(fake_x, training=True)
            # Identity mapping
            same_x = self.gen_F(real_x, training=True)
            same_y = self.gen_G(real_y, training=True)
            # Discriminator output
            disc_real_x = self.disc_X(real_x, training=True)
            disc_fake_x = self.disc_X(fake_x, training=True)
            disc_real_y = self.disc_Y(real_y, training=True)
            disc_fake_y = self.disc_Y(fake_y, training=True)
            # Generator adversarial loss
            gen_G_loss = self.generator_loss_fn(disc_fake_y)
            gen_F_loss = self.generator_loss_fn(disc_fake_x)
            # Generator cycle loss
            cycle_loss_G = self.cycle_loss_fn(real_y, cycled_y) * self.lambda_cycle
            cycle_loss_F = self.cycle_loss_fn(real_x, cycled_x) * self.lambda_cycle
            # Generator identity loss
            id_loss_G = (
                self.identity_loss_fn(real_y, same_y)
                * self.lambda_cycle
                * self.lambda_identity
            )
            id_loss_F = (
                self.identity_loss_fn(real_x, same_x)
                * self.lambda_cycle
                * self.lambda_identity
            )
            # Total generator loss
            total_loss_G = gen_G_loss + cycle_loss_G + id_loss_G
            total_loss_F = gen_F_loss + cycle_loss_F + id_loss_F
            # Discriminator loss
            disc_X_loss = self.discriminator_loss_fn(disc_real_x, disc_fake_x)
            disc_Y_loss = self.discriminator_loss_fn(disc_real_y, disc_fake_y)
        # Get the gradients for the generators
        grads_G = tape.gradient(total_loss_G, self.gen_G.trainable_variables)
        grads_F = tape.gradient(total_loss_F, self.gen_F.trainable_variables)
        # Get the gradients for the discriminators
        disc_X_grads = tape.gradient(disc_X_loss, self.disc_X.trainable_variables)
        disc_Y_grads = tape.gradient(disc_Y_loss, self.disc_Y.trainable_variables)
        # Update the weights of the generators
        self.gen_G_optimizer.apply_gradients(zip(grads_G, self.gen_G.trainable_variables))
        self.gen_F_optimizer.apply_gradients(zip(grads_F, self.gen_F.trainable_variables))
        # Update the weights of the discriminators
        self.disc_X_optimizer.apply_gradients(zip(disc_X_grads, self.disc_X.trainable_variables))
        self.disc_Y_optimizer.apply_gradients(zip(disc_Y_grads, self.disc_Y.trainable_variables))
        return {
            "G_loss": total_loss_G,
            "F_loss": total_loss_F,
            "D_X_loss": disc_X_loss,
            "D_Y_loss": disc_Y_loss,
        }

    @tf.function
    def train_step(self, batch_data):
        # This strategy.run call is what raises the RuntimeError above:
        # Model.fit already executes train_step in replica context, so
        # strategy.run cannot be entered again here. (Also note that
        # self.train_step_single is a bound method, so the original
        # args=(self, batch_data) passed self twice.)
        return strategy.run(self.train_step_single, args=(batch_data,))
# Outside strategy.run / merge_call this returns the default replica context,
# so this assert passes trivially at module level.
assert tf.distribute.get_replica_context() is not None
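# For comparison, a minimal sketch (not my current code) of what the error
# message points at: with compile()/fit(), Keras itself wraps train_step in
# strategy.run, so train_step should contain only the per-replica work and
# must not call strategy.run again:
#
# class CycleGan(keras.Model):
#     ...
#     def train_step(self, batch_data):
#         return self.train_step_single(batch_data)  # per-replica body only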
class GANMonitor(keras.callbacks.Callback):
    """A callback to generate and save images after each epoch."""

    def __init__(self, num_img=4):
        self.num_img = num_img

    def on_epoch_end(self, epoch, logs=None):
        _, ax = plt.subplots(4, 2, figsize=(12, 12))
        for i, img in enumerate(test_horses.take(self.num_img)):
            prediction = self.model.gen_G(img)[0].numpy()
            prediction = (prediction * 127.5 + 127.5).astype(np.uint8)
            img = (img[0] * 127.5 + 127.5).numpy().astype(np.uint8)
            ax[i, 0].imshow(img)
            ax[i, 1].imshow(prediction)
            ax[i, 0].set_title("Input image")
            ax[i, 1].set_title("Translated image")
            ax[i, 0].axis("off")
            ax[i, 1].axis("off")
            prediction = keras.preprocessing.image.array_to_img(prediction)
            prediction.save(
                "W:/300_Neural_Network/320_Unsupervised_GAN_CycleConsistency/plots/generated_img_{i}_{epoch}.png".format(i=i, epoch=epoch + 1)
            )
        plt.show()
        plt.close()
# Loss function for evaluating adversarial loss
with strategy.scope():
    adv_loss_fn = keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM)

    # Define the loss function for the generators
    def generator_loss_fn(fake):
        fake_loss = adv_loss_fn(tf.ones_like(fake), fake)
        return fake_loss

    # Define the loss function for the discriminators
    def discriminator_loss_fn(real, fake):
        real_loss = adv_loss_fn(tf.ones_like(real), real)
        fake_loss = adv_loss_fn(tf.zeros_like(fake), fake)
        return (real_loss + fake_loss) * 0.5
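# Side note (sketch, not my current code): the TF distributed-training guide
# recommends reducing per-example losses by the GLOBAL batch size, so that
# summing the resulting gradients across replicas yields the true mean, e.g.:
#
# mse_none = keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
# def generator_loss_fn(fake):
#     per_example = mse_none(tf.ones_like(fake), fake)
#     return tf.nn.compute_average_loss(per_example, global_batch_size=batch_size)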
# Create the CycleGAN model
with strategy.scope():
    cycle_gan_model = CycleGan(
        generator_G=gen_G, generator_F=gen_F, discriminator_X=disc_X, discriminator_Y=disc_Y
    )

# Compile the model
with strategy.scope():
    cycle_gan_model.compile(
        gen_G_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        gen_F_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        disc_X_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        disc_Y_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        gen_loss_fn=generator_loss_fn,
        disc_loss_fn=discriminator_loss_fn,
        cycle_loss_fn=keras.losses.MeanAbsoluteError(reduction=tf.keras.losses.Reduction.SUM),
        identity_loss_fn=keras.losses.MeanAbsoluteError(reduction=tf.keras.losses.Reduction.SUM),
    )
# Callbacks
plotter = GANMonitor()
checkpoint_filepath = "W:/300_Neural_Network/320_Unsupervised_GAN_CycleConsistency/checkpoints/cyclegan_checkpoints.{epoch:03d}"
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
)
# Train for just one epoch here, as each epoch takes around 7 minutes on a
# single P100-backed machine. Note: steps_per_epoch=1067 assumes a global
# batch size of 1 (trainA contains 1067 images); with multiple replicas this
# should be reduced to roughly 1067 // batch_size.
cycle_gan_model.fit(
    train_set,
    epochs=1,
    steps_per_epoch=1067,
    callbacks=[tensorboard_callback, plotter, model_checkpoint_callback],
)