Why loss is not decreasing in a Siamese BERT-Network training (Entity matching task)

Question

I'm trying to finetune a model for an entity matching task (kind of a sentence similarity task).

The idea is that if I give as input two sentences the model should output if they represent the same entity or not. I'm interested in the products' domain.

So for example:

sentences_left = ('logitech harmony 890 advanced universal remote control h890', 'sony silver digital voice recorder icdb600')
sentences_right = ('logitech harmony 890 advanced universal remote hdtv , tv , dvd player ( s ) , lighting , audio system 100 ft universal remote 966193-0403', 'canon black ef 70-300mm f/4 -5.6 is usm telephoto zoom lens 0345b002')

The output should be 1 for the first left-right pair of sentences and 0 for the second.

I want to test two approaches. The first is a sequence classification setup. So I take a pair of sentences, concat them with a [SEP] token in-between, encode it and feed it to BERT.

This approach kind of works, but I wanted to explore a second one that, in theory, should work too.

In a few words, using MPNet as the pre-trained language model, I'm trying to implement this setup:

This is taken from the paper Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. The idea is to compute not a single embedding for the concatenated pair as before, but two separate embeddings, one for each sentence. The two embeddings are then concatenated and fed to a softmax classifier.

After a lot of struggling I'm still unable to make it work: the loss simply does not decrease. It starts at 0.25 and goes neither up nor down.

I'm using the Abt-Buy, Amazon-Google and Walmart-Amazon datasets.

This is my model:


class FinalClassifier(nn.Module):
    """Siamese sentence-pair classifier on top of a shared pre-trained encoder.

    Both sentences are passed through the same encoder, pooled with
    ``_mean_pooling`` and combined as ``(u, v, |u - v|)`` before a single
    linear layer produces one logit per pair.
    """

    def __init__(self, pos_neg=None, frozen=False):
        """Create the encoder, tokenizer, classification head and loss.

        Args:
            pos_neg: Optional weight for positive examples, forwarded to
                ``BCEWithLogitsLoss(pos_weight=...)``. When ``None`` the
                loss is unweighted.
            frozen: When true, gradients are disabled for every encoder
                parameter so that only the linear head is trained.
        """
        super().__init__()

        use_cuda = torch.cuda.is_available()
        # Kept as an attribute for existing callers; forward() uses the
        # device the weights actually live on instead.
        self.device = torch.device("cuda" if use_cuda else "cpu")

        # NOTE(review): this identifier is resolved by `from_pretrained`;
        # confirm it points at the intended checkpoint (local path or hub id).
        self.encoder = AutoModel.from_pretrained('all-mpnet-base-v2')

        if frozen:
            for param in self.encoder.parameters():
                param.requires_grad = False

        self.tokenizer = AutoTokenizer.from_pretrained('all-mpnet-base-v2')

        # Always create a criterion: forward() needs one whenever labels are
        # given, regardless of whether a positive-class weight was supplied.
        if pos_neg is not None:
            self.criterion = BCEWithLogitsLoss(
                pos_weight=torch.tensor([float(pos_neg)]))
        else:
            self.criterion = BCEWithLogitsLoss()

        # Input is the concatenation (u, v, |u - v|) of 768-dim embeddings.
        self.linear = nn.Linear(3 * 768, 1)

    def _embed(self, texts):
        """Tokenize ``texts`` and return one pooled embedding per text."""
        if not isinstance(texts, str):
            texts = list(texts)
        # Pad only to the longest text of the batch; the attention mask is
        # passed to the pooling step so padded positions can be ignored.
        encoded = self.tokenizer(texts, padding=True, truncation=True,
                                 return_tensors='pt')
        # Follow the module's weights so inputs and parameters always share
        # a device, wherever the model has been moved.
        encoded = encoded.to(self.linear.weight.device)
        output = self.encoder(**encoded)
        return _mean_pooling(output, encoded['attention_mask'])

    def forward(self, texts_left, texts_right, labels=None):
        """Score sentence pairs and optionally compute the training loss.

        Args:
            texts_left: Batch of left-hand sentences.
            texts_right: Batch of right-hand sentences, aligned with
                ``texts_left``.
            labels: Optional tensor of 0/1 targets, one per pair.

        Returns:
            ``(loss, probabilities)`` where ``probabilities`` is the sigmoid
            of the logits (one column per pair) and ``loss`` is ``None``
            when no labels were given.
        """
        output_left = self._embed(texts_left)
        output_right = self._embed(texts_right)

        # Sentence-BERT classification features: (u, v, |u - v|).
        pooled_output = torch.cat(
            (output_left, output_right, torch.abs(output_left - output_right)),
            -1)

        linear_output = self.linear(pooled_output)
        # Sigmoid turns the logit into a probability, so thresholding the
        # returned score at 0.5 corresponds to a logit of 0.
        probabilities = torch.sigmoid(linear_output)

        if labels is None:
            return (None, probabilities)

        labels = labels.to(linear_output.device)
        # The criterion works on raw logits, not on the probabilities.
        loss = self.criterion(linear_output.view(-1), labels.float().view(-1))

        return (loss, probabilities)

Here's the Dataset


class FinalDataset(torch.utils.data.Dataset):
    """Dataset of sentence pairs backed by a DataFrame.

    The frame is read through its ``text_left``, ``text_right`` and
    ``label`` columns.
    """

    def __init__(self, df):
        """Keep the frame and cache its labels converted to ``int``."""
        self.examples = df
        self.labels = list(map(int, df['label']))

    def classes(self):
        """Return the list of integer labels, in row order."""
        return self.labels

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(text_left, text_right, label)`` for position ``idx``.

        The label is wrapped in a NumPy array.
        """
        row = self.examples.iloc[idx]
        return row['text_left'], row['text_right'], np.array(self.labels[idx])

and finally the training loop


def train(model, train, val, learning_rate=1e-6, epochs=5, batch_size=8):
    """Train ``model`` on ``train`` and evaluate it on ``val`` after each epoch.

    Args:
        model: Module called as ``model(left_batch, right_batch, labels)``
            and returning ``(loss, scores)``. Scores above 0.5 are counted
            as positive predictions.
        train: Dataset whose items unpack to ``(left, right, label)``.
        val: Validation dataset with the same item layout.
        learning_rate: Learning rate given to Adam.
        epochs: Number of passes over ``train``.
        batch_size: Mini-batch size used by both data loaders.

    Running losses and F1 are sent to the module-level ``writer`` every 100
    mini-batches; a summary line and a classification report are printed at
    the end of each epoch.
    """
    train_dataloader = torch.utils.data.DataLoader(
        train, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

    # Move the model before handing its parameters to the optimizer.
    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):
        total_loss_train = 0.0
        tmp_loss = 0.0

        model.train()
        for i, data in enumerate(tqdm(train_dataloader)):
            left_batch, right_batch, labels = data

            (batch_loss, _) = model(left_batch, right_batch, labels)

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Accumulate plain floats: summing the loss tensors themselves
            # would keep every batch's autograd graph alive.
            loss_value = batch_loss.item()
            total_loss_train += loss_value
            tmp_loss += loss_value

            # every 100 mini-batches
            if i % 100 == 99:
                print(f' Loss/train at epoch {epoch_num+1} (batch {i}): {tmp_loss/100}')
                writer.add_scalar('Loss/train',
                                  tmp_loss / 100,
                                  epoch_num * len(train_dataloader) + i)
                tmp_loss = 0.0

        total_loss_val = 0.0
        # Start the validation window from zero so leftover training loss
        # does not leak into the first 'Loss/val' point.
        tmp_loss = 0.0
        pred_batches = []
        label_batches = []
        window_start = 0  # index of the first batch in the current window

        model.eval()
        with torch.no_grad():
            for i, data in enumerate(val_dataloader):
                left_batch, right_batch, labels = data
                (batch_loss, linear_output) = model(left_batch, right_batch, labels)

                scores = linear_output.detach().cpu().numpy().reshape(-1)
                pred_batches.append(np.where(scores > 0.5, 1, 0))
                label_batches.append(labels.detach().cpu().numpy().reshape(-1))

                loss_value = batch_loss.item()
                total_loss_val += loss_value
                tmp_loss += loss_value

                # every 100 mini-batches
                if i % 100 == 99:
                    global_step = epoch_num * len(val_dataloader) + i
                    print(f' Loss/val at epoch {epoch_num+1} (batch {i}): {tmp_loss/100}')
                    writer.add_scalar('Loss/val', tmp_loss / 100, global_step)
                    # F1 over the samples of the last 100 batches; the window
                    # is tracked in batches, not sliced by sample position.
                    writer.add_scalar(
                        'F1/val',
                        f1_score(
                            y_true=np.concatenate(label_batches[window_start:]),
                            y_pred=np.concatenate(pred_batches[window_start:])),
                        global_step)
                    tmp_loss = 0.0
                    window_start = len(pred_batches)

        predictions = np.concatenate(pred_batches)
        total_labels = np.concatenate(label_batches)

        f1 = f1_score(y_true=total_labels, y_pred=predictions)
        report = classification_report(total_labels, predictions, zero_division=0)

        # plot all the pr curves
        for class_index in range(2):
            add_pr_curve_tensorboard(class_index, predictions, total_labels)

        for name, p in model.named_parameters():
            writer.add_histogram(name, p, bins='auto')

        # Average per batch: assumes the model returns a batch-mean loss, so
        # the sum of batch losses is divided by the number of batches.
        avg_train_loss = total_loss_train / max(len(train_dataloader), 1)
        avg_val_loss = total_loss_val / max(len(val_dataloader), 1)

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {avg_train_loss: .3f} \
        | Val Loss: {avg_val_loss: .3f} \
        | Val F1: {f1: .3f}')

        tqdm.write(report)


# TensorBoard writer; `train` logs through this module-level name.
writer = SummaryWriter(log_dir=tensorboard_path)

# Hyper-parameters of the run.
EPOCHS = 5
LR = 1e-6
# Passed to FinalClassifier as its `pos_neg` argument.
train_pos_neg_ratio = 9

model = FinalClassifier(pos_neg=train_pos_neg_ratio, frozen=False)

train_data = FinalDataset(df_train)
val_data = FinalDataset(df_dev)

train(model=model, train=train_data, val=val_data,
      learning_rate=LR, epochs=EPOCHS)

# Push pending events to disk, then release the writer.
writer.flush()
writer.close()

The issue is that the loss does NOT decrease, and as a result neither does the F1 score improve. I tried normalizing the outputs, adding a dropout layer, and analyzing the dataset to be sure the problem wasn't there, but now I have run out of ideas. Any help would be extremely valuable.

Why loss is not decreasing in a Siamese BERT-Network training (Entity matching task)

0 Answers0