I'm trying to build a Visual Question Answering model on the CLEVR dataset (https://cs.stanford.edu/people/jcjohns/clevr/) for a deep learning homework assignment (I'm not allowed to use the functional program representations, though).
However, I'm struggling: the network doesn't learn properly and keeps oscillating around 0.2 accuracy on both the training and the validation set. Moreover, the first time I ran it, it went up to 0.4 accuracy with a similar architecture, but I suspect that was due to something left over in memory from earlier trials with the embedding matrix construction and the tokenizer.
I have already tried changing the embedding (I'm now using GloVe), changing dimensions, and changing the network in multiple ways (I still have to try attention and more advanced ideas, but first I'd like to see this work decently). I'm sure there is some fatal error somewhere (and the architecture is naive, too), but I can't seem to spot it. Could you please help me understand what isn't working?
I'll leave the code for the network and the data input pipeline below; please comment even just to point out bad practices. Sorry for posting so much code, but I really can't figure out where I'm going wrong.
Thank you in advance.
Here's the code for the network:
import tensorflow as tf
batch_size = 8
epochs = 100
#arch = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))
arch = tf.keras.applications.densenet.DenseNet201(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))
freeze_until = 650
for layer in arch.layers[:freeze_until]:
    layer.trainable = False
branch1 = arch.output
branch1 = tf.keras.layers.GlobalAveragePooling2D()(branch1)
text_inputs = tf.keras.Input(shape=[max_words])
emb = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                input_length=max_words,
                                weights=[embedding_matrix],
                                trainable=False)(text_inputs)
#branch2 = tf.keras.layers.GlobalMaxPool1D()(emb)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)
#branch2 = tf.keras.layers.Conv1D(128, 5, activation='relu')(emb)
branch2 = tf.keras.layers.LSTM(128)(emb)
#branch2 = tf.keras.layers.GlobalMaxPool1D()(branch2)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)
joint = tf.keras.layers.concatenate([branch1, branch2])
joint = tf.keras.layers.Dense(512, activation='relu')(joint)
joint = tf.keras.layers.Dropout(0.2)(joint)
predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint)
model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions])
model.summary()
loss = tf.keras.losses.CategoricalCrossentropy()
lr = 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss=loss,
              optimizer=optimizer,
              metrics=['accuracy'])
callbacks = []
callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True))
callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0))
model.fit_generator(data_generator('train'),
                    validation_data=data_generator('validation'),
                    steps_per_epoch=240,
                    validation_steps=120,
                    epochs=epochs,
                    callbacks=callbacks,
                    verbose=1)
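Side note: I know that on recent TF 2.x releases fit_generator is deprecated and model.fit accepts Python generators directly, so the call above could equivalently be written as:

# Equivalent call on TF >= 2.1, where model.fit accepts generators directly.
model.fit(data_generator('train'),
          validation_data=data_generator('validation'),
          steps_per_epoch=240,
          validation_steps=120,
          epochs=epochs,
          callbacks=callbacks,
          verbose=1)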
Here's the code for the generator and the embedding:
import json
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
img_h = 320
img_w = 480
max_words = 100     # used both as the tokenizer's vocabulary cap and as the text input length
embedding_dim = 40
num_classes = 13
val_split = 0.8     # first 80% of the questions -> train, the rest -> validation
max_len = 25        # padded question length
classes = ['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'no', 'yes']
# Fit the label -> integer and integer -> one-hot encoders on the 13 possible answers
label_encoder = LabelEncoder()
integer_encoder_ = label_encoder.fit(classes)
integer_encoded = integer_encoder_.transform(classes)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder_ = onehot_encoder.fit(integer_encoded)
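Just to illustrate how the two encoders compose (this snippet is only a sanity check, not part of the pipeline):

# Round-trip one answer: label -> integer -> one-hot -> label.
y = integer_encoder_.transform(['yes']).reshape(-1, 1)            # [[12]]
onehot = onehot_encoder_.transform(y)                             # shape (1, 13)
print(integer_encoder_.inverse_transform(onehot.argmax(axis=1)))  # ['yes']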
def data_generator(mode, batch_size=8):
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        data_raw = json.load(f)
    n_questions = len(data_raw['questions'])
    while True:
        # Select files (paths/indices) for the batch
        if mode == 'validation':
            batch_addresses = random.sample(range(int(n_questions * val_split), n_questions), batch_size)
        elif mode == 'train':
            batch_addresses = random.sample(range(0, int(n_questions * val_split)), batch_size)
        else:
            batch_addresses = random.sample(range(0, n_questions), batch_size)
        batch_input_img = []
        batch_input_txt = []
        batch_output = []
        for i in batch_addresses:
            image_name = data_raw['questions'][i]['image_filename']
            img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/' + image_name).convert('RGB')
            img_array = np.array(img)
            img_array = np.expand_dims(img_array, 0)
            input_img = np.true_divide(img_array, 255)
            input_txt = data_raw['questions'][i]['question']
            output = data_raw['questions'][i]['answer']
            batch_input_img += [input_img]
            batch_input_txt += [input_txt]
        # Return a tuple of (input, output) to feed the network
        batch_x_img = np.array(batch_input_img)
        batch_x_txt = np.array(batch_input_txt)
        batch_x_img = batch_x_img[:, -1]
        tokenized = tokenizer.texts_to_sequences(batch_x_txt)
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)
        batch_output += [output]
        batch_y = np.array(batch_output)
        y_c = integer_encoder_.transform(batch_y)
        y_c = y_c.reshape(len(y_c), 1)
        onehot_encoded = onehot_encoder_.transform(y_c)
        batch_y = onehot_encoded
        yield ([batch_x_img, batch_x_txt], batch_y)
def test_generator():
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f:
        data_raw = json.load(f)
    i = 0
    while i < len(data_raw['questions']):
        batch_input_img = []
        batch_input_txt = []
        image_name = data_raw['questions'][i]['image_filename']
        img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/' + image_name).convert('RGB')
        img_array = np.array(img)
        img_array = np.expand_dims(img_array, 0)
        input_img = np.true_divide(img_array, 255)
        input_txt = data_raw['questions'][i]['question']
        output = data_raw['questions'][i]['question_id']
        batch_input_img += [input_img]
        batch_input_txt += [input_txt]
        # Return a tuple of (input, question_id) so predictions can be matched to questions
        batch_x_img = np.array(batch_input_img)
        batch_x_txt = np.array(batch_input_txt)
        batch_x_img = batch_x_img[:, -1]
        tokenized = tokenizer.texts_to_sequences(batch_x_txt)
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)
        batch_y = output
        i += 1
        yield ([batch_x_img, batch_x_txt], batch_y)
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # add 1 for the reserved index 0 (padding)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath) as f:
        count = 0
        for line in f:
            word, *vector = line.split()
            if word in word_index and count < (len(word_index) - 1):
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
                count = count + 1
        # this can error because split() breaks entries like 'to name.domain' apart
    return embedding_matrix
def create_tokens(tokenizer):
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        data_raw = json.load(f)
    tot_txt = []
    for i in range(len(data_raw['questions'])):
        input_txt = data_raw['questions'][i]['question']
        tot_txt += [input_txt]
    tokenizer.fit_on_texts(tot_txt)
    return tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')
tokenizer = create_tokens(tokenizer)
#embedding_matrix = create_embedding_matrix('/kaggle/input/embedding/embedding.txt', tokenizer.word_index, embedding_dim)
import os
filepath = "../input/glove840b300dtxt/" + os.listdir("../input/glove840b300dtxt/")[0]
embedding_matrix = create_embedding_matrix(filepath, tokenizer.word_index, embedding_dim)
vocab_size = len(tokenizer.word_index) + 1
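To see how much of the question vocabulary is actually covered by GloVe (rows left all-zero received no pretrained vector), I use this quick diagnostic:

# Count vocabulary rows that received a non-zero pretrained vector.
covered = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
print(covered, '/', vocab_size, 'words have a GloVe vector')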
reader = data_generator('train')
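And this is the quick check I use to eyeball one batch from the generator (shapes of the images, the tokenized questions, and the one-hot labels):

# Pull a single batch and print shapes; useful to verify that the three
# arrays have consistent batch dimensions.
(x_img, x_txt), y = next(reader)
print(x_img.shape, x_txt.shape, y.shape)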
P.S. I thought changing the GlobalAveragePooling layer to a Flatten layer might solve it, but it didn't.