I am training a U-Net on a Google Cloud TPU. It works but the utilization is very low.
Since I apparently cannot upload the traced profile here, here is a screenshot of the slowest part:
The output fusion is the most costly part, taking 58% of the time with only 12% utilization. The next most time-consuming part (9%) is "convolution", with a utilization of 74%. I am not sure which operations I need to tweak to get better utilization out of the output fusion.
Below is my code for creating the U-Net; maybe there is a slow layer inside:
import tensorflow as tf


class UNet:
    def create(self, input, start_ch, depth, inc_rate,
               dropout, batchnorm, maxpool, upconv, residual, leaky_relu_alpha):
        with tf.variable_scope('Generator', reuse=tf.AUTO_REUSE):
            o = self._level_block(input, start_ch, depth, inc_rate, dropout, batchnorm,
                                  maxpool, upconv, residual, leaky_relu_alpha)
            # Final 1x1 convolution back to the input channel count, followed by tanh
            out_ch = input.shape[3]
            o = tf.layers.conv2d(o, out_ch, 1)
            o = tf.tanh(o)
            return o

    def _conv_block(self, m, dim, bn, res, leaky_relu_alpha, do=0):
        # Two 3x3 convolutions with Leaky ReLU, optional batch norm and dropout
        n = tf.layers.conv2d(m, dim, 3, padding='same')
        n = tf.nn.leaky_relu(n, alpha=leaky_relu_alpha)
        n = tf.layers.batch_normalization(n) if bn else n
        n = tf.layers.dropout(n, do) if do else n
        n = tf.layers.conv2d(n, dim, 3, padding='same')
        n = tf.nn.leaky_relu(n, alpha=leaky_relu_alpha)
        n = tf.layers.batch_normalization(n) if bn else n
        # Residual variant concatenates the block input with its output
        return tf.concat([m, n], axis=-1) if res else n

    def _level_block(self, m, dim, depth, inc, do, bn, mp, up, res, leaky_relu_alpha):
        if depth > 0:
            # Encoder: conv block, then downsample via max pooling or strided conv
            n = self._conv_block(m, dim, bn, res, leaky_relu_alpha)
            m = tf.layers.max_pooling2d(n, [2, 2], [2, 2]) if mp else tf.layers.conv2d(n, dim, 3, strides=2, padding='same')
            m = self._level_block(m, int(inc * dim), depth - 1, inc, do, bn, mp, up, res, leaky_relu_alpha)
            if up:
                # Decoder: nearest-neighbor upsampling followed by a 2x2 convolution
                m = tf.image.resize_nearest_neighbor(m, (2 * m.shape[1], 2 * m.shape[2]))
                m = tf.layers.conv2d(m, dim, 2, padding='same')
                m = tf.nn.leaky_relu(m, alpha=leaky_relu_alpha)
            else:
                # Decoder: transposed convolution for upsampling
                m = tf.layers.conv2d_transpose(m, dim, 3, strides=2, padding='same')
                m = tf.nn.leaky_relu(m, alpha=leaky_relu_alpha)
            # Skip connection: concatenate encoder features with the upsampled features
            n = tf.concat([n, m], axis=-1)
            m = self._conv_block(n, dim, bn, res, leaky_relu_alpha)
        else:
            # Bottleneck: single conv block with dropout
            m = self._conv_block(m, dim, bn, res, leaky_relu_alpha, do)
        return m
My input batch size is 128 and the U-Net depth is 4. I use no BatchNorm layers (batchnorm=False), transposed convolutions for upsampling (upconv=False), residual=False and maxpool=True. So the U-Net consists only of Conv2D, Conv2DTranspose, Dropout, Leaky ReLU, Max Pooling and Concatenation layers.
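For reference, here is a minimal sketch of how the network gets instantiated with that configuration. The values of start_ch, inc_rate, dropout, leaky_relu_alpha and the 256x256x3 input shape below are placeholders for illustration, not my exact settings:

    import tensorflow as tf

    # Placeholder input: batch of 128 images (256x256x3 is only an example shape)
    images = tf.placeholder(tf.float32, [128, 256, 256, 3])

    unet = UNet()
    output = unet.create(images,
                         start_ch=64,            # placeholder value
                         depth=4,
                         inc_rate=2,             # placeholder value
                         dropout=0.5,            # placeholder value
                         batchnorm=False,
                         maxpool=True,
                         upconv=False,           # i.e. conv2d_transpose upsampling
                         residual=False,
                         leaky_relu_alpha=0.2)   # placeholder value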
Any idea what I need to tweak to get better "output fusion" utilization? Or at least, what affects the output fusion?