I'm specifying a network regularized with dropout, but I'm having trouble understanding how the dropout is actually applied here. Specifically, why isn't the difference between the proportion of zeros before and after applying dropout exactly equal to the dropout rate of 0.2?
import tensorflow as tf
from tensorflow.keras.layers import Dense

class DropoutDenseNetwork(tf.Module):
    def __init__(self, name=None):
        super(DropoutDenseNetwork, self).__init__(name=name)
        self.dense_layer1 = Dense(32)
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.dense_layer2 = Dense(10, activation=tf.identity)

    @tf.function
    def __call__(self, x, is_training):
        embed = self.dense_layer1(x)
        # Proportion of activations that are exactly zero before dropout
        propn_zero_before = tf.reduce_mean(tf.cast(tf.equal(embed, 0.), tf.float32))
        embed = self.dropout(embed, training=is_training)
        # Proportion of activations that are exactly zero after dropout
        propn_zero_after = tf.reduce_mean(tf.cast(tf.equal(embed, 0.), tf.float32))
        tf.print('Zeros before and after:', propn_zero_before, 'and', propn_zero_after)
        output = self.dense_layer2(embed)
        return output

if 'drop_dense_net' not in locals():
    drop_dense_net = DropoutDenseNetwork()

drop_dense_net(tf.ones([1, 100]), tf.constant(True))
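
For reference, here is a standalone check I ran to confirm that the printed proportion really does move around between calls rather than landing on 0.2 exactly (a minimal sketch using the same tf.keras.layers.Dropout layer; the 32-unit width matches dense_layer1 above, and the all-ones input is just so that every zero must come from dropout):

# Standalone check: the realized fraction of zeros produced by a
# rate-0.2 dropout over 32 units varies from call to call, because
# the mask is sampled independently on every forward pass.
import tensorflow as tf

dropout = tf.keras.layers.Dropout(0.2)
x = tf.ones([1, 32])  # all-ones input, so any zero must come from dropout
for _ in range(5):
    y = dropout(x, training=True)
    frac = tf.reduce_mean(tf.cast(tf.equal(y, 0.0), tf.float32))
    tf.print('realized zero fraction:', frac)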