I'm trying to build a Visual Question Answering model on the CLEVR dataset (https://cs.stanford.edu/people/jcjohns/clevr/) for a deep learning homework assignment (I'm not allowed to use the functional program representations, though).
However, I'm struggling: the network doesn't learn properly and keeps oscillating around 0.2 accuracy on both the training and the validation set. Moreover, the first time I ran it, it went up to 0.4 accuracy with a similar architecture, but I suspect that was due to something left over in memory from earlier trials with the embedding matrix construction and the tokenizer.
I have already tried changing the embedding (I'm now using GloVe), changing dimensions, and changing the network in multiple ways (I still have to try attention and more advanced ideas, but first I'd like to see this work decently). I'm sure there is some fatal error somewhere (and the architecture is naive, too), but I can't seem to spot it. Could you please help me understand what isn't working?
I'll leave the code for the network and the data input pipeline below; please comment even just to point out bad practices. Sorry for posting so much code, but I really can't figure out where I'm going wrong.
Thank you in advance.
Here's the code for the network:
import tensorflow as tf
batch_size = 8
epochs = 100
#arch = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))
arch = tf.keras.applications.densenet.DenseNet201(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))
freeze_until = 650
for layer in arch.layers[:freeze_until]:
    layer.trainable = False
branch1 = arch.output
branch1 = tf.keras.layers.GlobalAveragePooling2D()(branch1)
text_inputs = tf.keras.Input(shape=[max_words])
emb = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                input_length=max_words,
                                weights=[embedding_matrix],
                                trainable=False)(text_inputs)
#branch2 = tf.keras.layers.GlobalMaxPool1D()(emb)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)
#branch2 = tf.keras.layers.Conv1D(128, 5, activation='relu')(emb)
branch2 = tf.keras.layers.LSTM(128)(emb)
#branch2 = tf.keras.layers.GlobalMaxPool1D()(branch2)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)
joint = tf.keras.layers.concatenate([branch1, branch2])
joint = tf.keras.layers.Dense(512, activation='relu')(joint)
joint = tf.keras.layers.Dropout(0.2)(joint)
predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint)
model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions])
model.summary()
loss = tf.keras.losses.CategoricalCrossentropy()
lr = 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss=loss,
              optimizer=optimizer,
              metrics=['accuracy'])
callbacks = []
callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True))
callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0))
model.fit_generator(data_generator('train'),
                    validation_data=data_generator('validation'),
                    steps_per_epoch=240,
                    validation_steps=120,
                    epochs=epochs,
                    callbacks=callbacks,
                    verbose=1)
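Side note: I know that on recent TF 2.x releases fit_generator is deprecated and model.fit accepts Python generators directly, so the call above could equivalently be written as:

# Equivalent call on TF >= 2.1, where model.fit accepts generators directly.
model.fit(data_generator('train'),
          validation_data=data_generator('validation'),
          steps_per_epoch=240,
          validation_steps=120,
          epochs=epochs,
          callbacks=callbacks,
          verbose=1)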
Here's the code for the generator and the embedding:
import json
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
img_h = 320
img_w = 480
max_words = 100     # used both as the tokenizer's vocabulary cap and as the text input length
embedding_dim = 40
num_classes = 13
val_split = 0.8     # first 80% of the questions -> train, the rest -> validation
max_len = 25        # padded question length
classes = ['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'no', 'yes']
# Fit the label -> integer and integer -> one-hot encoders on the 13 possible answers
label_encoder = LabelEncoder()
integer_encoder_ = label_encoder.fit(classes)
integer_encoded = integer_encoder_.transform(classes)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder_ = onehot_encoder.fit(integer_encoded)
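Just to illustrate how the two encoders compose (this snippet is only a sanity check, not part of the pipeline):

# Round-trip one answer: label -> integer -> one-hot -> label.
y = integer_encoder_.transform(['yes']).reshape(-1, 1)            # [[12]]
onehot = onehot_encoder_.transform(y)                             # shape (1, 13)
print(integer_encoder_.inverse_transform(onehot.argmax(axis=1)))  # ['yes']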
def data_generator(mode, batch_size=8):
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        data_raw = json.load(f)
    n_questions = len(data_raw['questions'])
    while True:
        # Select files (paths/indices) for the batch
        if mode == 'validation':
            batch_addresses = random.sample(range(int(n_questions * val_split), n_questions), batch_size)
        elif mode == 'train':
            batch_addresses = random.sample(range(0, int(n_questions * val_split)), batch_size)
        else:
            batch_addresses = random.sample(range(0, n_questions), batch_size)
        batch_input_img = []
        batch_input_txt = []
        batch_output = []
        for i in batch_addresses:
            image_name = data_raw['questions'][i]['image_filename']
            img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/' + image_name).convert('RGB')
            img_array = np.array(img)
            img_array = np.expand_dims(img_array, 0)
            input_img = np.true_divide(img_array, 255)
            input_txt = data_raw['questions'][i]['question']
            output = data_raw['questions'][i]['answer']
            batch_input_img += [input_img]
            batch_input_txt += [input_txt]
        # Return a tuple of (input, output) to feed the network
        batch_x_img = np.array(batch_input_img)
        batch_x_txt = np.array(batch_input_txt)
        batch_x_img = batch_x_img[:, -1]
        tokenized = tokenizer.texts_to_sequences(batch_x_txt)
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)
        batch_output += [output]
        batch_y = np.array(batch_output)
        y_c = integer_encoder_.transform(batch_y)
        y_c = y_c.reshape(len(y_c), 1)
        onehot_encoded = onehot_encoder_.transform(y_c)
        batch_y = onehot_encoded
        yield ([batch_x_img, batch_x_txt], batch_y)
def test_generator():
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f:
        data_raw = json.load(f)
    i = 0
    while i < len(data_raw['questions']):
        batch_input_img = []
        batch_input_txt = []
        image_name = data_raw['questions'][i]['image_filename']
        img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/' + image_name).convert('RGB')
        img_array = np.array(img)
        img_array = np.expand_dims(img_array, 0)
        input_img = np.true_divide(img_array, 255)
        input_txt = data_raw['questions'][i]['question']
        output = data_raw['questions'][i]['question_id']
        batch_input_img += [input_img]
        batch_input_txt += [input_txt]
        # Return a tuple of (input, question_id) so predictions can be matched to questions
        batch_x_img = np.array(batch_input_img)
        batch_x_txt = np.array(batch_input_txt)
        batch_x_img = batch_x_img[:, -1]
        tokenized = tokenizer.texts_to_sequences(batch_x_txt)
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)
        batch_y = output
        i += 1
        yield ([batch_x_img, batch_x_txt], batch_y)
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # add 1 for the reserved index 0 (padding)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath) as f:
        count = 0
        for line in f:
            word, *vector = line.split()
            if word in word_index and count < (len(word_index) - 1):
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
                count = count + 1
        # this can error because split() breaks entries like 'to name.domain' apart
    return embedding_matrix
def create_tokens(tokenizer):
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        data_raw = json.load(f)
    tot_txt = []
    for i in range(len(data_raw['questions'])):
        input_txt = data_raw['questions'][i]['question']
        tot_txt += [input_txt]
    tokenizer.fit_on_texts(tot_txt)
    return tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')
tokenizer = create_tokens(tokenizer)
#embedding_matrix = create_embedding_matrix('/kaggle/input/embedding/embedding.txt', tokenizer.word_index, embedding_dim)
import os
filepath = "../input/glove840b300dtxt/" + os.listdir("../input/glove840b300dtxt/")[0]
embedding_matrix = create_embedding_matrix(filepath, tokenizer.word_index, embedding_dim)
vocab_size = len(tokenizer.word_index) + 1
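To see how much of the question vocabulary is actually covered by GloVe (rows left all-zero received no pretrained vector), I use this quick diagnostic:

# Count vocabulary rows that received a non-zero pretrained vector.
covered = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
print(covered, '/', vocab_size, 'words have a GloVe vector')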
reader = data_generator('train')
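And this is the quick check I use to eyeball one batch from the generator (shapes of the images, the tokenized questions, and the one-hot labels):

# Pull a single batch and print shapes; useful to verify that the three
# arrays have consistent batch dimensions.
(x_img, x_txt), y = next(reader)
print(x_img.shape, x_txt.shape, y.shape)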
P.S. I thought changing the GlobalAveragePooling layer to a Flatten layer might solve it, but it didn't.