This is my code. I tried to build a VGG 11 layers network, with a mix of ReLu and ELu activation and many regularizations on kernels and activities. The result is really confusing: The code is at 10th epoch. My loss on both train and val have decreased from 2000 to 1.5, but my acc on both train and val remained the same at 50%. Can somebody explain to me?
# VGG 11
from keras.regularizers import l2
from keras.layers.advanced_activations import ELU
from keras.optimizers import Adam
model = Sequential()
model.add(Conv2D(64, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
input_shape=(1, 96, 96), activation='relu'))
model.add(Conv2D(64, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(128, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001),activity_regularizer=l2(0.0001),
activation='relu'))
model.add(Conv2D(128, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(256, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
activation='relu'))
model.add(Conv2D(256, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(512, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
activation='relu'))
model.add(Conv2D(512, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
activation='relu'))
model.add(Conv2D(512, (3, 3), kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.0001),
activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
# convert convolutional filters to flat so they can be feed to fully connected layers
model.add(Flatten())
model.add(Dense(2048, kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.01)))
model.add(ELU(alpha=1.0))
model.add(Dropout(0.5))
model.add(Dense(1024, kernel_initializer='he_normal',
kernel_regularizer=l2(0.0001), activity_regularizer=l2(0.01)))
model.add(ELU(alpha=1.0))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('softmax'))
adammo = Adam(lr=0.0008, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adammo, metrics=['accuracy'])
hist = model.fit(X_train, y_train, batch_size=48, epochs=20, verbose=1, validation_data=(X_val, y_val))