
For pedagogic purposes, I would like to implement a custom dense ReLU layer (without bias) using TensorFlow. The idea is to see the backpropagation explicitly, both for the inputs and for the synaptic weights, so the custom gradient is not attached to an activation function but to the layer's call function. Below is a first attempt on a regression problem, which I would like to have verified/confirmed. In this problem there are n_output output neurons and I split a random dataset into n_output classes. For an input of class j, output neuron i should respond with the value 1 - |i - j|/n_output (for example, with n_output = 100, neuron 5 has target 1 - 2/100 = 0.98 for a class-3 input). I compared a standard ReLU dense layer with my custom layer and both show the same training behavior, so I am fairly confident the formulas are correct, but I am not entirely sure.
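
For reference, the backprop relations I believe the custom gradient has to implement are the following (E is the loss, X the input batch of shape [Batch, n_X], W the weights of shape [n_X, n_Y], S = X·W the pre-activation and Y = relu(S) the output):

dY_dS = 1 where S > 0, else 0      (elementwise ReLU derivative, shape [Batch, n_Y])
dE_dS = dE_dY * dY_dS              (elementwise, shape [Batch, n_Y])
dE_dX = dE_dS · Wᵀ                 (shape [Batch, n_X])
dE_dW = Xᵀ · dE_dS                 (summed over the batch, shape [n_X, n_Y])

This is what the code below is meant to reproduce.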

import tensorflow as tf
import os
import numpy as np
import matplotlib.pyplot as plt

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

class CustomReLU(tf.keras.layers.Layer):
    
    def __init__(self, units):
        super(CustomReLU, self).__init__()
        self.units = units
        self.W = None

    def build(self, input_shape):
        self.W = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer=kernel_initializer, # module-level kernel_initializer defined further below
            trainable=True,
        )
        
    @tf.custom_gradient
    def call(self, inputs): #shape=[Batch, n_X]
        
        self.S = tf.matmul(inputs, self.W) #shape=[Batch, n_Y]               
        self.Y = tf.nn.relu(self.S) #shape=[Batch, n_Y]  

        #print("self.Y.shape", self.Y.shape)                
        
        def custom_grad(dE_dY, variables):
            
            dY_dS = tf.where(self.S > 0, tf.ones_like(self.S), tf.zeros_like(self.S)) #shape=[Batch, n_Y]
            dE_dS  = dE_dY*dY_dS #shape=[Batch, n_Y]
            dS_dX = tf.transpose(self.W) #shape=[n_Y, n_X]
            dE_dX = tf.matmul(dE_dS, dS_dX) #shape=[Batch, n_X]
                        
            modif_dY_dS = tf.expand_dims(dY_dS, axis=1) #shape=[Batch, 1, n_Y]
            modif_inputs = tf.expand_dims(inputs, axis=2) #shape=[Batch, n_X, 1]
            modif_dY_dW = modif_dY_dS*modif_inputs #shape=[Batch, n_X, n_Y]
            modif_dE_dY = tf.expand_dims(dE_dY, axis=1) #shape=[Batch, 1, n_Y]
            dE_dW = modif_dE_dY*modif_dY_dW #shape=[Batch, n_X, n_Y]
            dE_dW = tf.reduce_sum(dE_dW, 0, keepdims=False) #shape=[n_X, n_Y]
                       
            return dE_dX, [dE_dW] #shape=[Batch, n_X], shape=[n_X, n_Y]
            
        return self.Y, custom_grad
    
def create_layer(units, layer_type="custom"):    
    if layer_type=="custom":
        return CustomReLU(units)
    else:            
        return tf.keras.layers.Dense(units, activation="relu", use_bias=False, kernel_initializer=kernel_initializer)

nb_hidden_layer = 1

n_input = 100
n_output = 100
size_dataset = n_output*10

nb_hidden_unit = 100

mean_input = 0.1
var_input = 1.0

learning_rate = 0.1

kernel_initializer = "glorot_normal"

nb_epoch = 10000


input_vals = tf.random.normal( [size_dataset, n_input], mean=mean_input, stddev=tf.math.sqrt(var_input), dtype=tf.dtypes.float32)
target_vals = np.zeros( shape=(size_dataset, n_output), dtype='float32')
nb_sample_per_class = int(size_dataset/n_output)

for id_output_neuron in range(n_output):    
    for id_class in range(n_output):        
        diff_neuron_class = np.abs(id_output_neuron-id_class)            
        target_vals[id_class*nb_sample_per_class:(id_class+1)*nb_sample_per_class, id_output_neuron] = 1.-diff_neuron_class*(1./n_output)          
    
#print("target_vals", target_vals)
    
target_vals = tf.convert_to_tensor(target_vals)
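
# Aside: I believe the double loop above could equivalently be written with
# NumPy broadcasting (untested sketch, same 1 - |i - j|/n_output formula):
#   class_ids = np.repeat(np.arange(n_output), nb_sample_per_class)   # shape [size_dataset]
#   target_vals = 1. - np.abs(class_ids[:, None] - np.arange(n_output)[None, :]) / n_output
# I kept the explicit loop for readability.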


model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(n_input,)))
layer_type="custom"
for id_hidden_layer in range(nb_hidden_layer):
    model.add(create_layer(nb_hidden_unit, layer_type=layer_type))
model.add(create_layer(n_output, layer_type=layer_type))


loss_obj = tf.keras.losses.MeanSquaredError()
opt_obj = tf.keras.optimizers.SGD(learning_rate)
model.compile(loss=loss_obj, optimizer=opt_obj, run_eagerly=True)

l_loss = []

epoch_range = np.arange(0, nb_epoch)

for i in epoch_range:
    print(i, "/",nb_epoch)
    
    outputs= model(input_vals)
    loss = loss_obj(target_vals, outputs)
    l_loss.append(loss)

    model.fit(input_vals, target_vals, epochs=1, batch_size=size_dataset, shuffle=True)

plt.plot(epoch_range, l_loss, label="loss")
plt.ylim(bottom=0)
plt.legend()
plt.grid()
plt.show()
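
For the check mentioned above I mostly compared the loss curves of the two layer types. A more direct comparison could look like the sketch below (the weight copying and the tolerance are my own assumptions, I have not polished this part):

# Sketch: compare the custom gradients against a standard Dense(relu) layer with the same weights
x_check = tf.random.normal([8, n_input])
t_check = tf.random.normal([8, n_output])

custom_layer = CustomReLU(n_output)
ref_layer = tf.keras.layers.Dense(n_output, activation="relu", use_bias=False,
                                  kernel_initializer=kernel_initializer)
custom_layer(x_check) # build the custom layer
ref_layer(x_check) # build the reference layer
ref_layer.set_weights(custom_layer.get_weights()) # copy W so both layers are identical

with tf.GradientTape(persistent=True) as tape:
    tape.watch(x_check)
    loss_custom = loss_obj(t_check, custom_layer(x_check))
    loss_ref = loss_obj(t_check, ref_layer(x_check))

# input gradients and weight gradients should coincide up to float tolerance
print(np.allclose(tape.gradient(loss_custom, x_check), tape.gradient(loss_ref, x_check), atol=1e-5))
print(np.allclose(tape.gradient(loss_custom, custom_layer.W),
                  tape.gradient(loss_ref, ref_layer.kernel), atol=1e-5))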

The part "modif..." that computes the gradient of the synaptic weights dE_dW in the custom_grad function does not satisfy me. I think it is inelegant and potentially inefficient. Is there someone who can provide a better solution ?
