I'm trying to finetune a model for an entity matching task (kind of a sentence similarity task).
The idea is that if I give as input two sentences the model should output if they represent the same entity or not. I'm interested in the products' domain.
So for example:
sentences_left = ('logitech harmony 890 advanced universal remote control h890', 'sony silver digital voice recorder icdb600')
sentences_right = ('logitech harmony 890 advanced universal remote hdtv , tv , dvd player ( s ) , lighting , audio system 100 ft universal remote 966193-0403', 'canon black ef 70-300mm f/4 -5.6 is usm telephoto zoom lens 0345b002')
The output should be 1 for the first left-right pair of sentences and 0 for the second.
I want to test two approaches. The first is a sequence classification setup. So I take a pair of sentences, concat them with a [SEP] token in-between, encode it and feed it to BERT.
This approach kind of works, but I wanted to explore a second one that, in theory, should work too.
In a few words, using mpnet as the pre-trained language model, I'm trying to implement this setup:
This is taken from the paper Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. The idea is to compute not a single embedding for the concatenated pair as before, but a separate embedding for each of the two sentences, then concatenate the embeddings and feed them to a softmax classifier.
After lots of struggles I'm still unable to make it work, since the loss has no intention of decreasing. It starts at 0.25 and never goes up or down.
I'm using the Abt-Buy, Amazon-Google and Walmart-Amazon datasets.
This is my model:
class FinalClassifier(nn.Module):
    """Siamese sentence-pair classifier following the SBERT classification setup.

    Both texts go through the same pre-trained encoder and are pooled into
    one vector each (``u`` and ``v``) by ``_mean_pooling``. The concatenation
    ``(u, v, |u - v|)`` is passed to a single linear layer that produces one
    logit per pair.
    """

    def __init__(self, pos_neg=None, frozen=False):
        """Create the encoder, tokenizer, loss and classification head.

        Args:
            pos_neg: Optional weight for positive examples, forwarded to
                ``BCEWithLogitsLoss`` as ``pos_weight``. When omitted (or
                falsy) the loss is unweighted.
            frozen: If true, gradients are disabled for every encoder
                parameter so that only the linear head is trained.
        """
        super().__init__()
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.encoder = AutoModel.from_pretrained('all-mpnet-base-v2')
        if frozen:
            for param in self.encoder.parameters():
                param.requires_grad = False
        self.tokenizer = AutoTokenizer.from_pretrained('all-mpnet-base-v2')
        # The criterion must exist even without a class weight; otherwise
        # forward() fails with AttributeError as soon as labels are given.
        pos_weight = torch.Tensor([pos_neg]) if pos_neg else None
        self.criterion = BCEWithLogitsLoss(pos_weight=pos_weight)
        self.linear = nn.Linear(3 * 768, 1)

    def _embed(self, texts):
        """Tokenize ``texts`` and return one pooled encoder vector per text."""
        encoded = self.tokenizer(texts, padding='max_length',
                                 truncation=True, return_tensors='pt')
        # Follow the device the weights actually live on rather than the one
        # guessed at construction time, so the module also works when it was
        # never moved to the GPU.
        encoded = encoded.to(self.linear.weight.device)
        output = self.encoder(**encoded)
        return _mean_pooling(output, encoded['attention_mask'])

    def forward(self, texts_left, texts_right, labels=None):
        """Score a batch of text pairs.

        Args:
            texts_left: Batch of left-hand texts.
            texts_right: Batch of right-hand texts, aligned with ``texts_left``.
            labels: Optional tensor of 0/1 targets, one per pair.

        Returns:
            A ``(loss, probabilities)`` tuple. ``loss`` is the
            ``BCEWithLogitsLoss`` of the raw logits against ``labels``, or
            ``None`` when no labels are given. ``probabilities`` is the
            sigmoid of the logits, one column per pair, so thresholding it
            at 0.5 is equivalent to thresholding the logit at 0.
        """
        output_left = self._embed(texts_left)
        output_right = self._embed(texts_right)

        # Look at sBERT paper (u, v, |u-v|)
        pooled_output = torch.cat(
            (output_left, output_right, torch.abs(output_left - output_right)), -1)
        logits = self.linear(pooled_output)

        # A ReLU would clamp every negative logit to 0 and is not a
        # probability; the sigmoid is the output that matches the BCE loss.
        probabilities = torch.sigmoid(logits)

        if labels is None:
            return (None, probabilities)

        labels = labels.to(logits.device)
        # The loss is computed on the raw logits: BCEWithLogitsLoss applies
        # the sigmoid internally.
        loss = self.criterion(logits.view(-1), labels.float())
        return (loss, probabilities)
Here's the Dataset
class FinalDataset(torch.utils.data.Dataset):
    """Dataset of labelled text pairs backed by a DataFrame.

    The frame is expected to provide the columns ``text_left``,
    ``text_right`` and ``label``.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` and cache its labels as integers."""
        self.examples = df
        self.labels = list(map(int, df['label']))

    def classes(self):
        """Return the cached list of integer labels."""
        return self.labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(text_left, text_right, label)`` for position ``idx``.

        The label is returned as a zero-dimensional NumPy array.
        """
        row = self.examples.iloc[idx]
        target = np.array(self.labels[idx])
        return row['text_left'], row['text_right'], target
and finally the training loop
def train(model, train, val, learning_rate=1e-6, epochs=5, batch_size=8):
    """Train ``model`` on ``train`` and evaluate it on ``val`` after every epoch.

    The model is called as ``model(left_batch, right_batch, labels)`` and must
    return a ``(loss, scores)`` tuple. Scores above 0.5 are counted as
    positive predictions. Running losses and F1 are logged through the
    module-level ``writer``.

    Args:
        model: Module to optimise; moved to the GPU when one is available.
        train: Training dataset yielding ``(text_left, text_right, label)``.
        val: Validation dataset with the same item layout.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training data.
        batch_size: Batch size used by both data loaders.
    """
    # Honour the batch_size argument instead of a hard-coded 8.
    train_dataloader = torch.utils.data.DataLoader(
        train, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

    optimizer = Adam(model.parameters(), lr=learning_rate)

    if torch.cuda.is_available():
        model = model.cuda()

    for epoch_num in range(epochs):
        total_loss_train = 0.0
        window_loss = 0.0

        model.train()
        for i, (left_batch, right_batch, labels) in enumerate(tqdm(train_dataloader)):
            batch_loss, _ = model(left_batch, right_batch, labels)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Accumulate plain floats: summing the loss tensors would keep
            # every batch's autograd graph alive for the whole epoch.
            loss_value = batch_loss.item()
            total_loss_train += loss_value
            window_loss += loss_value

            # every 100 mini-batches
            if i % 100 == 99:
                window_mean = window_loss / 100
                print(f' Loss/train at epoch {epoch_num+1} (batch {i}): {window_mean}')
                writer.add_scalar('Loss/train',
                                  window_mean,
                                  epoch_num * len(train_dataloader) + i)
                window_loss = 0.0

        total_loss_val = 0.0
        # Start from zero so the unreported training remainder does not leak
        # into the first validation window.
        window_loss = 0.0
        prediction_batches = []
        label_batches = []
        window_start = 0  # index of the first batch in the current window

        model.eval()
        with torch.no_grad():
            for i, (left_batch, right_batch, labels) in enumerate(val_dataloader):
                batch_loss, scores = model(left_batch, right_batch, labels)

                scores = scores.detach().cpu().numpy().reshape(-1)
                prediction_batches.append(np.where(scores > 0.5, 1, 0))
                label_batches.append(labels.detach().cpu().numpy().reshape(-1))

                loss_value = batch_loss.item()
                total_loss_val += loss_value
                window_loss += loss_value

                # every 100 mini-batches
                if i % 100 == 99:
                    global_step = epoch_num * len(val_dataloader) + i
                    window_mean = window_loss / 100
                    print(f' Loss/val at epoch {epoch_num+1} (batch {i}): {window_mean}')
                    writer.add_scalar('Loss/val', window_mean, global_step)
                    # Slice by batch, then concatenate: batch indices cannot
                    # be used directly to slice the per-example arrays.
                    writer.add_scalar(
                        'F1/val',
                        f1_score(
                            y_true=np.concatenate(label_batches[window_start:]),
                            y_pred=np.concatenate(prediction_batches[window_start:])),
                        global_step)
                    window_loss = 0.0
                    window_start = i + 1

        total_labels = np.concatenate(label_batches)
        predictions = np.concatenate(prediction_batches)

        f1 = f1_score(y_true=total_labels, y_pred=predictions)
        report = classification_report(total_labels, predictions, zero_division=0)

        # plot all the pr curves
        for class_index in (0, 1):
            add_pr_curve_tensorboard(class_index, predictions, total_labels)

        for name, p in model.named_parameters():
            writer.add_histogram(name, p, bins='auto')

        # Each accumulated value is one loss per batch, so the epoch average
        # is taken over the number of batches, not the number of examples.
        mean_loss_train = total_loss_train / len(train_dataloader)
        mean_loss_val = total_loss_val / len(val_dataloader)
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {mean_loss_train: .3f} '
            f'| Val Loss: {mean_loss_val: .3f} '
            f'| Val F1: {f1: .3f}')

        tqdm.write(report)
# train() logs through the module-level name ``writer``, so it has to be
# bound before training starts.
writer = SummaryWriter(log_dir=tensorboard_path)

EPOCHS = 5
LR = 1e-6
train_pos_neg_ratio = 9  # forwarded to FinalClassifier as ``pos_neg``

try:
    model = FinalClassifier(train_pos_neg_ratio, frozen=False)
    train_data, val_data = FinalDataset(df_train), FinalDataset(df_dev)

    train(model, train_data, val_data, LR, EPOCHS)
finally:
    # Flush and close even when construction or training raises, so events
    # written so far are not lost.
    writer.flush()
    writer.close()
The issue is that the loss does NOT decrease and, as a result, the F1 score does not improve either. I tried normalizing the outputs, adding a dropout layer, and analyzing the dataset to be sure the problem wasn't there, but now I have run out of ideas. Any help would be extremely valuable.