0

I added a print to the "discriminator_loss" function to see what was going on. at first it will tell me the shape of both are 16. later it tells me the shape of "real_loss" is only 15 while the other stays 16. So far I have only tried lowering the batchsize's and increasing them by 1 ect. I have provided the most relevant parts of the code. I can provide the rest of the code if needed. I have no clue why this is happening and it breaks the code.

with strategy.scope():
  BATCH_SIZE = 16
  GLOBAL_BATCH_SIZE = 32#batchsize*# of gpus
  im_size = 256
  latent_size = 512
with strategy.scope():
  cross_entropy = tf.keras.losses.BinaryCrossentropy(
    from_logits=True,\
    reduction = tf.keras.losses.Reduction.NONE)

  #this is used to evaluate discriminators ability to discriminate
  def discriminator_loss(real_output, fake_output):
      real_loss = cross_entropy(tf.ones_like(real_output), real_output)#compares prediction to actual value of 1
      fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)#compares rediction to actual value of 0
      print(real_loss)
      print(fake_loss)
      total_loss = real_loss + fake_loss
      total_loss = total_loss/GLOBAL_BATCH_SIZE
      return total_loss


  #how well was generator able to trick discriminator
  def generator_loss(fake_output):
      gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)#compares predictions to the expected value 1 of a real image
      gen_loss = gen_loss / GLOBAL_BATCH_SIZE
      return gen_loss
with strategy.scope():
  EPOCHS = 80
  noise_dim = 512
  num_examples_to_generate = 32



# We will reuse this seed overtime (so it's easier)
# to visualize progress in the animated GIF)
with strategy.scope():
  def noise(n):
    return tf.random.normal([n, latent_size])

  def noiseImage(n):
    return tf.random.uniform([n, im_size, im_size, 1])
  #seed = tf.random.normal([num_examples_to_generate, noise_dim])



#seed used to generate image>the discriminator than classifies real images from training set and a set of generated images>loss is calculated and gradients are used to update the model
# Notice the use of `tf.function`
# This annotation causes the function to be "compiled".
with strategy.scope():
  #@tf.function
  def train_step(images):
      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        g_loss = generator_loss(fake_output)#runs generator loss
        d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
            
      G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
      D_grads = disc_tape.gradient(d_loss, discriminator.trainable_variables)

      generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
      discriminator_optimizer.apply_gradients(zip(D_grads, discriminator.trainable_variables))

      #run g_optim twice to make sure d_loss doesn't go to zero
      with tf.GradientTape() as gen_tape:
        generated_imgs = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
        fake_output = discriminator(generated_imgs, training=True)
        g_loss = generator_loss(fake_output)

      G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
      generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))

      return g_loss, d_loss


  @tf.function
  def distributed_train_step(dist_dataset):
      per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
      total_g_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_g_losses,axis=0)
      total_d_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_d_losses,axis=0)
      return total_g_loss, total_d_loss


with strategy.scope():
  def train(dist_dataset, epochs):
    for epoch in range(epochs):
      start = time.time()
      for image_batch in dist_dataset:
        total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function


with strategy.scope():
  train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well

error and traceback

Traceback (most recent call last):
  File "C:\image generator\pixiv\#image generator.py", line 507, in <module>
    train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
  File "C:\image generator\pixiv\#image generator.py", line 441, in train
    total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
    result = self._call(*args, **kwds)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2419, in __call__
    graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2777, in _maybe_define_function
    graph_function = self._create_graph_function(args, kwargs)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2667, in _create_graph_function
    capture_by_value=self._capture_by_value),
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 981, in func_graph_from_py_func
    func_outputs = python_func(*func_args, **func_kwargs)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 441, in wrapped_fn
    return weak_wrapped_fn().__wrapped__(*args, **kwds)
  File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 968, in wrapper
    raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:

    C:\image generator\pixiv\#image generator.py:419 distributed_train_step  *
        per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
    C:\image generator\pixiv\#image generator.py:393 train_step  *
        d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
    C:\image generator\pixiv\#image generator.py:328 discriminator_loss  *
        total_loss = real_loss + fake_loss
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:984 binary_op_wrapper
        return func(x, y, name=name)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1276 _add_dispatch
        return gen_math_ops.add_v2(x, y, name=name)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py:483 add_v2
        "AddV2", x=x, y=y, name=name)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:595 _create_op_internal
        compute_device)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:3327 _create_op_internal
        op_def=op_def)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1817 __init__
        control_input_ops, op_def)
    C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1657 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 0 and 2 for '{{node replica_1/add}} = AddV2[T=DT_FLOAT](replica_1/binary_crossentropy_1/weighted_loss/Mul, replica_1/binary_crossentropy_2/weighted_loss/Mul)' with input shapes: [0], [2].
Will Mulcahey
  • 89
  • 1
  • 7
  • 1
    Please edit the question and add the error and traceback. – Aaron Keesing Sep 13 '20 at 05:26
  • When does it occur, and how are you batching the data? Because if it happens on the last batches, then it might be something to do with not dropping the last uneven batch from the dataset if the batch size doesn't evenly divide the number of instances. – Aaron Keesing Sep 13 '20 at 06:06
  • train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH_SIZE) dist_dataset = strategy.experimental_distribute_dataset(train_dataset) – Will Mulcahey Sep 13 '20 at 07:04
  • 1
    Try adding `drop_remainder=True` to `batch()` and see if that helps. If it does help then it indicates the issue is with batch sizes but I think a better method would be to change the loss to account for these. – Aaron Keesing Sep 13 '20 at 07:11
  • That was it, which is really weird because it worked before. Pretty much the only thing i didn't change was the dataset and the loss functions. – Will Mulcahey Sep 13 '20 at 07:31

1 Answers1

1

So according to comments the problem lies in unequal batch sizes, due to the final batch being smaller than the specified batch size. I believe this is due to this line:

generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)

where the constant size BATCH_SIZE is used, instead of the actual input shape of the batch, so that generated_images is of a different shape than images.

So one solution as mentioned is simply to use drop_remainder=True in batch(). However it might be better to get the generator to output images of the same shape as the input, so instead of passing BATCH_SIZE as argument to your noise generation functions, you should use the actual size of the input batch. So maybe using tf.shape(images)[0] would help. Alternatively, you could generate a fixed batch of images with BATCH_SIZE, and then simply discard any extra images, like

num_images = tf.shape(images)[0]
generated_images = generated_images[:num_images]
Aaron Keesing
  • 1,277
  • 10
  • 18
  • it needs to be an integer value num_imgs = images.shape.as_list() num_imgs = num_imgs[0] like so but this helped a lot thanks – Will Mulcahey Sep 13 '20 at 09:52