I'm trying to fine-tune CamemBERT (the French version of RoBERTa) for question answering.

I use the CamemBERT model to encode the concatenated question and context, and a linear output layer on top of the token representations to produce the start and end logits that correspond to the start and end of the answer span.

The official results in the paper report (88%, 77%) for (F1 score, EM) on question answering, but the results I get are (71%, 46%).
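
For context on how those two numbers are computed: in SQuAD-style evaluation, EM is the fraction of predictions whose normalized text exactly matches a gold answer, and F1 is the token-level overlap between prediction and gold answer. Here is a minimal sketch of the usual metric functions (the normalization details, e.g. which French articles to strip, are my assumption, not the official FQuAD evaluator):

import re
import string
from collections import Counter

def normalize(text):
    # Lowercase, drop punctuation and (French) articles, collapse whitespace
    text = "".join(c for c in text.lower() if c not in string.punctuation)
    text = re.sub(r"\b(le|la|les|l|un|une|des)\b", " ", text)
    return " ".join(text.split())

def exact_match(prediction, gold):
    return float(normalize(prediction) == normalize(gold))

def f1_score(prediction, gold):
    pred_tokens = normalize(prediction).split()
    gold_tokens = normalize(gold).split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)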

My question is: why are my results so far from the official ones?

This is the part of the script that I use to train and evaluate the model on the FQuAD dataset, with the same hyper-parameters as the official model:

MAX_SEQ_LENGTH = 384
TRAIN_BATCH_SIZE = 12
n_epochs = 3
learning_rate = 3e-5
EVAL_BATCH_SIZE = 12
dropout = 0
BERT_TYPE = "fmikaelian/camembert-base-fquad"
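
The snippet does not show the optimizer or scheduler. For reference, the official SQuAD-style fine-tuning recipes in transformers use AdamW with a linear warmup/decay schedule; a sketch of that assumed setup (model and train_dataloader come from the rest of the script, which isn't shown here):

from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * n_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)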

import torch
import torch.nn as nn
from transformers import AutoModel

class CamemBERTQA(nn.Module):
    def __init__(self, bert_type, hidden_size, num_labels):
        super(CamemBERTQA, self).__init__()
        self.bert_type = bert_type
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.camembert = AutoModel.from_pretrained(self.bert_type)
        # One score per token for the start position and one for the end
        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids):
        # Last hidden states of the encoder: (batch, seq_len, hidden_size)
        output = self.camembert(input_ids=input_ids)[0]
        logits = self.qa_outputs(output)          # (batch, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)   # (batch, seq_len)
        end_logits = end_logits.squeeze(-1)
        return (start_logits, end_logits)
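
Note that loss_func is called below but not defined in the snippet. For reference, the standard span-extraction loss (what the built-in *ForQuestionAnswering heads compute) is the mean of the start and end cross-entropies; a minimal sketch, with the signature matching the call below:

def loss_func(start_and_end_scores, start_pos, end_pos):
    # Average the cross-entropy of the start and end logits
    start_logits, end_logits = start_and_end_scores
    ce = nn.CrossEntropyLoss()
    return (ce(start_logits, start_pos) + ce(end_logits, end_pos)) / 2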

import numpy as np
from tqdm import trange, tqdm_notebook

def train_eval_model(model, n_epochs, scheduler=None):
    avg_train_losses = []
    avg_valid_losses = []
    res = []
    for epoch in trange(n_epochs):
        ### train the model ###
        train_losses = []
        valid_losses = []
        model.train()
        for batch, d in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
            ids = d['ids'].to(device, dtype=torch.long)
            start_pos = d['start_pos'].to(device, dtype=torch.long)
            end_pos = d['end_pos'].to(device, dtype=torch.long)
            optimizer.zero_grad()
            start_and_end_scores = model(ids)  # forward pass returns start and end logits
            loss = loss_func(start_and_end_scores, start_pos, end_pos)
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            train_losses.append(loss.item())

        ### validate the model ###
        model.eval()
        pred_s = None
        pred_e = None
        for batch, d in enumerate(eval_dataloader):
            ids = d['ids'].to(device, dtype=torch.long)
            start_pos = d['start_pos'].to(device, dtype=torch.long)
            end_pos = d['end_pos'].to(device, dtype=torch.long)
            with torch.no_grad():
                start_and_end_scores = model(ids)
                loss = loss_func(start_and_end_scores, start_pos, end_pos)
                valid_losses.append(loss.item())
            # Accumulate the logits of every eval batch
            if pred_s is None:
                pred_s = start_and_end_scores[0].detach().cpu().numpy()
                pred_e = start_and_end_scores[1].detach().cpu().numpy()
            else:
                pred_s = np.append(pred_s, start_and_end_scores[0].detach().cpu().numpy(), axis=0)
                pred_e = np.append(pred_e, start_and_end_scores[1].detach().cpu().numpy(), axis=0)

        # Predicted start/end token indices for every eval example
        pred_start = np.argmax(pred_s, axis=1)
        pred_end = np.argmax(pred_e, axis=1)
        res.append([pred_start, pred_end])

        train_loss = np.average(train_losses)
        valid_loss = np.average(valid_losses)
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)
        epoch_len = len(str(n_epochs))
        print(f'[{epoch:>{epoch_len}}/{n_epochs:>{epoch_len}}] '
              f'train_loss: {train_loss:.5f} '
              f'valid_loss: {valid_loss:.5f}')
    return model, avg_train_losses, avg_valid_losses, res
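
Note that res only holds predicted start/end token indices; to compare against the paper's F1/EM you still have to map those indices back to answer strings. A hypothetical helper (the name is mine), assuming tokenizer is the CamemBERT tokenizer that produced ids:

def decode_answer(tokenizer, input_ids, start_idx, end_idx):
    if end_idx < start_idx:  # invalid span, a common failure mode
        return ""
    return tokenizer.decode(input_ids[start_idx:end_idx + 1], skip_special_tokens=True)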

1 Answer

I'm not sure this question will ever be fully answered here. Have a look at the official question-answering examples to learn from: https://github.com/huggingface/transformers/tree/master/examples/question-answering
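
In particular, rather than hand-rolling the QA head and loss, you can reuse the library's built-in question-answering head, which also computes the span loss for you when given gold positions. A minimal sketch with a recent version of transformers (the start/end positions here are dummy values for illustration):

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModelForQuestionAnswering.from_pretrained("camembert-base")

inputs = tokenizer("Qui a écrit ce livre ?",
                   "Ce livre a été écrit par Victor Hugo.",
                   return_tensors="pt")
outputs = model(**inputs,
                start_positions=torch.tensor([8]),
                end_positions=torch.tensor([10]))
print(outputs[0])  # combined start/end cross-entropy loss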
