I am trying to create a smaller, less redundant subset of the original data by measuring the cosine similarity between the feature vectors of all rows, per query ID. Once I have these similarities, I want to select the rows whose similarity exceeds a given threshold and apply the following criteria: if, for a given relevance label, there are more than 10 similar rows -> keep the first 10 in the dataset and remove the rest; if there are fewer than 10 rows -> keep all of them. Finally, train a model on each dataset and compare their NDCG@10.
For example, with similarity matrix = tensor([[1.0000, 0.5000, 0.9996], [0.5000, 1.0000, 0.9998], [0.9996, 0.9998, 1.0000]]) and threshold 0.85: row one is similar to row three, and row two is similar to row three, so rows one, two, and three are all considered similar.
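To make that concrete, here is a minimal standalone sketch of the selection rule I have in mind, using the example matrix above (the names threshold and max_rows are just for illustration; they correspond to redundancy_threshold and max_redundant_rows in my code below):

import torch

# Standalone illustration only, not part of the training code below
similarity_matrix = torch.tensor([[1.0000, 0.5000, 0.9996],
                                  [0.5000, 1.0000, 0.9998],
                                  [0.9996, 0.9998, 1.0000]])
threshold = 0.85
max_rows = 10  # keep at most this many similar rows per relevance label

for i in range(similarity_matrix.shape[0]):
    # indices of rows whose similarity to row i exceeds the threshold
    similar = torch.nonzero(similarity_matrix[i] > threshold).squeeze(dim=1)
    # keep at most max_rows of them, drop the rest
    kept = similar[:max_rows]
    print("row %d: similar rows %s, kept %s" % (i, similar.tolist(), kept.tolist()))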
This is my code:
from pytorchltr.datasets import MSLR10K
train = MSLR10K(split="train", fold=1)
test = MSLR10K(split="test", fold=1)
import torch
class Model(torch.nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.l1 = torch.nn.Linear(in_features, 50)
        self.l2 = torch.nn.Linear(50, 10)
        self.l3 = torch.nn.Linear(10, 1)

    def forward(self, x):
        o1 = torch.nn.functional.relu(self.l1(x))
        o2 = torch.nn.functional.relu(self.l2(o1))
        return self.l3(o2)
torch.manual_seed(42)
dimensionality = train[0].features.shape[1]
model = Model(dimensionality)
from pytorchltr.loss import PairwiseHingeLoss
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.1)
loss_fn = PairwiseHingeLoss()
from pytorchltr.datasets.list_sampler import UniformSampler
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
# Step 1-6: Create a new train dataset with less redundancy
redundancy_threshold = 0.95
max_redundant_rows = 10
query_relevance_dict = {}
for epoch in range(1, 21):
    loader = torch.utils.data.DataLoader(train, batch_size=16, shuffle=True,
                                         collate_fn=train.collate_fn(UniformSampler(max_list_size=20)))
    for batch in loader:
        xs, ys, n = batch.features, batch.relevance, batch.n
        loss = loss_fn(model(xs), ys, n).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Step 1: Calculate the pairwise cosine similarity matrix
        similarity_matrix = F.cosine_similarity(xs.unsqueeze(1), xs.unsqueeze(0), dim=2)
        for i in range(len(n)):
            query_id = n[i].item()
            relevance_label = tuple(ys[i].tolist())  # Convert tensor to tuple
            relevance_label_rows = query_relevance_dict.setdefault(query_id, {}).setdefault(relevance_label, [])
            similarity_row = similarity_matrix[i]
            # Step 2: Find similar rows based on pairwise similarity
            similar_rows = torch.nonzero(similarity_row > redundancy_threshold).squeeze(dim=1)
            # Step 3: Apply redundancy threshold and add similar rows
            if len(similar_rows) > max_redundant_rows:
                similar_rows = similar_rows[:max_redundant_rows]
            relevance_label_rows.extend(similar_rows.tolist())
    # Step 4-5: Remove excess redundant rows based on the criteria
    num_rows_new_dataset = 0
    for query_id, relevance_dict in query_relevance_dict.items():
        for relevance_label, rows in relevance_dict.items():
            if len(rows) > max_redundant_rows:
                rows = rows[:max_redundant_rows]  # Update the rows list directly
                query_relevance_dict[query_id][relevance_label] = rows
            num_rows_new_dataset += len(rows)
    print("Number of rows in the new dataset:", num_rows_new_dataset)
    # Clear the query_relevance_dict after each epoch
    query_relevance_dict.clear()
    print("Finished epoch %d" % epoch)
# Step 7: Evaluate both the original and new models using NDCG
from pytorchltr.evaluation import ndcg
loader = torch.utils.data.DataLoader(
    test, batch_size=16, collate_fn=test.collate_fn())
original_model_scores = 0.0
new_model_scores = 0.0
for batch in loader:
    xs, ys, n = batch.features, batch.relevance, batch.n
    original_model_ndcg_score = ndcg(model(xs), ys, n, k=10)
    new_model = Model(dimensionality)  # Create a new instance of the model
    new_model_ndcg_score = ndcg(new_model(xs), ys, n, k=10)
    original_model_scores += float(torch.sum(original_model_ndcg_score))
    new_model_scores += float(torch.sum(new_model_ndcg_score))
original_model_ndcg = original_model_scores / len(test)
new_model_ndcg = new_model_scores / len(test)
print("ndcg@10 on original model: %f" % original_model_ndcg)
print("ndcg@10 on new model: %f" % new_model_ndcg)
My code gives the same NDCG@10 for a threshold of 0.5 and of 0.95, and it also removes the same number of rows for either threshold, so I know something must be wrong or missing. Can anyone help me?