I was trying to find a way to compare the test accuracy and test loss of different activation functions (such as tanh, sigmoid, and relu), so I came up with this script:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras import backend as K
from tqdm import tqdm
import matplotlib.pyplot as plt

# Load the Iris dataset and hold out a stratified 30% test split.
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)
train_labels = tf.keras.utils.to_categorical(y_train)
test_labels = tf.keras.utils.to_categorical(y_test)

def test_activation_functions(act):
    """Train a fresh one-hidden-layer model with the given activation
    and return its test loss and test accuracy."""
    K.clear_session()
    model = models.Sequential()
    model.add(layers.Dense(512, activation=act, input_shape=(4,)))
    model.add(layers.Dense(3, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(X_train, train_labels, epochs=20, batch_size=40, verbose=0)
    test_loss, test_acc = model.evaluate(X_test, test_labels, verbose=0)
    del model
    return test_loss, test_acc

def loop(act, iters):
    """Repeat the experiment `iters` times and collect the metrics."""
    test_losses = []
    test_accs = []
    for _ in tqdm(range(iters)):
        test_loss, test_acc = test_activation_functions(act)
        test_losses.append(test_loss)
        test_accs.append(test_acc)
    return test_losses, test_accs

def plot_histograms(test_losses, test_accs):
    """Plot side-by-side histograms of the loss and accuracy distributions."""
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.hist(test_losses, bins=20)
    plt.title(f"Average Test Loss = {round(np.average(test_losses), 4)},\n"
              f"std = {round(np.std(test_losses), 4)}")
    plt.subplot(1, 2, 2)
    plt.hist(test_accs, bins=20)
    plt.title(f"Average Test Accuracy = {round(np.average(test_accs), 4)},\n"
              f"std = {round(np.std(test_accs), 4)}")
    plt.show()

def main():
    test_losses_relu, test_accs_relu = loop(tf.keras.activations.relu, 1000)
    test_losses_sigmoid, test_accs_sigmoid = loop(tf.keras.activations.sigmoid, 1000)
    test_losses_tanh, test_accs_tanh = loop(tf.keras.activations.tanh, 1000)
    plot_histograms(test_losses_relu, test_accs_relu)
    plot_histograms(test_losses_sigmoid, test_accs_sigmoid)
    plot_histograms(test_losses_tanh, test_accs_tanh)

if __name__ == "__main__":
    main()
Before I get to my main question: am I clearing the TensorFlow session correctly, or can my code be optimized further?
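One optimization I considered: build each model once per activation and re-randomize its weights in place between runs, instead of rebuilding the graph and clearing the session every iteration. A rough sketch, assuming tf.keras 2.x; reinitialize_weights and run_once are my own helpers, not Keras APIs, and they reuse the training data from the script above:

def reinitialize_weights(model):
    # Re-draw every Dense layer's kernel and bias from its own initializer,
    # so each run starts from fresh random weights without rebuilding the graph.
    for layer in model.layers:
        if hasattr(layer, "kernel_initializer"):
            layer.kernel.assign(layer.kernel_initializer(layer.kernel.shape))
        if getattr(layer, "bias", None) is not None:
            layer.bias.assign(layer.bias_initializer(layer.bias.shape))

def run_once(model):
    reinitialize_weights(model)
    # Recompiling with the string name creates a fresh Adam optimizer,
    # so no momentum/variance state leaks between runs.
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    model.fit(X_train, train_labels, epochs=20, batch_size=40, verbose=0)
    return model.evaluate(X_test, test_labels, verbose=0)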
Now to the main question: is this implementation reasonable? For example, here are the results of the 1000-iteration loops:
ReLU: [histograms of test loss and test accuracy]
Sigmoid: [histograms of test loss and test accuracy]
Tanh: [histograms of test loss and test accuracy]
However, I don't understand why tanh performs better than relu. (Maybe on a deeper model relu would beat tanh, since tanh suffers more from vanishing gradients?)
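One way I could probe that hunch is to parameterize the depth, along these lines (a sketch; build_model is my own helper, not part of the script above):

def build_model(act, depth):
    # Stack `depth` hidden layers so a vanishing-gradient effect,
    # if there is one, has room to show up.
    model = models.Sequential()
    model.add(layers.Dense(512, activation=act, input_shape=(4,)))
    for _ in range(depth - 1):
        model.add(layers.Dense(512, activation=act))
    model.add(layers.Dense(3, activation='softmax'))
    return model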
So, is there a more systematic way to "benchmark" these activation functions?
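One idea I had is to pair the runs: seed each run identically for every activation so the weight initializations and batch shuffling match, then run a significance test on the paired accuracies. A rough sketch, assuming TF >= 2.7 (for tf.keras.utils.set_random_seed) and SciPy; paired_benchmark is my own helper and reuses test_activation_functions from above:

from scipy import stats

def paired_benchmark(acts, n_runs=100):
    # Use the same seed per run for every activation, so the comparison
    # is paired and initialization noise cancels out of the differences.
    results = {act: [] for act in acts}
    for seed in range(n_runs):
        for act in acts:
            tf.keras.utils.set_random_seed(seed)
            _, acc = test_activation_functions(act)
            results[act].append(acc)
    return results

results = paired_benchmark(["relu", "tanh"])
# Wilcoxon signed-rank test on the paired accuracy differences
stat, p = stats.wilcoxon(results["relu"], results["tanh"])
print(f"Wilcoxon signed-rank p-value: {p:.4f}")

The pairing should separate the activation effect from run-to-run initialization noise, but I'm not sure whether this is the standard methodology.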