I am trying to create a smaller, less redundant subset of the original data by measuring the cosine similarity between the feature vectors of all rows, per query ID. Once I have these similarities, I want to select the rows whose similarity exceeds a given threshold and apply the following criteria: if, for a given relevance label, there are more than 10 similar rows -> keep the first 10 in the dataset and remove the rest; if there are fewer than 10 rows -> keep all of them. Finally, train a model on each dataset and compare their NDCG@10.
For example, with similarity matrix = tensor([[1.0000, 0.5000, 0.9996], [0.5000, 1.0000, 0.9998], [0.9996, 0.9998, 1.0000]]) and threshold 0.85: row one is similar to row three, and row two is similar to row three, so rows one, two, and three are all considered similar.
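To make that concrete, here is a minimal standalone sketch of the selection rule I have in mind, using the example matrix above (the names threshold and max_rows are just for illustration; they correspond to redundancy_threshold and max_redundant_rows in my code below):

import torch

# Standalone illustration only, not part of the training code below
similarity_matrix = torch.tensor([[1.0000, 0.5000, 0.9996],
                                  [0.5000, 1.0000, 0.9998],
                                  [0.9996, 0.9998, 1.0000]])
threshold = 0.85
max_rows = 10  # keep at most this many similar rows per relevance label

for i in range(similarity_matrix.shape[0]):
    # indices of rows whose similarity to row i exceeds the threshold
    similar = torch.nonzero(similarity_matrix[i] > threshold).squeeze(dim=1)
    # keep at most max_rows of them, drop the rest
    kept = similar[:max_rows]
    print("row %d: similar rows %s, kept %s" % (i, similar.tolist(), kept.tolist()))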
This is my code:
from pytorchltr.datasets import MSLR10K
train = MSLR10K(split="train", fold=1)
test = MSLR10K(split="test", fold=1)
import torch
class Model(torch.nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.l1 = torch.nn.Linear(in_features, 50)
        self.l2 = torch.nn.Linear(50, 10)
        self.l3 = torch.nn.Linear(10, 1)

    def forward(self, x):
        o1 = torch.nn.functional.relu(self.l1(x))
        o2 = torch.nn.functional.relu(self.l2(o1))
        return self.l3(o2)
torch.manual_seed(42)
dimensionality = train[0].features.shape[1]
model = Model(dimensionality)
from pytorchltr.loss import PairwiseHingeLoss
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.1)
loss_fn = PairwiseHingeLoss()
from pytorchltr.datasets.list_sampler import UniformSampler
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
# Step 1-6: Create a new train dataset with less redundancy
redundancy_threshold = 0.95
max_redundant_rows = 10
query_relevance_dict = {}
for epoch in range(1, 21):
    loader = torch.utils.data.DataLoader(train, batch_size=16, shuffle=True,
                                         collate_fn=train.collate_fn(UniformSampler(max_list_size=20)))
    for batch in loader:
        xs, ys, n = batch.features, batch.relevance, batch.n
        loss = loss_fn(model(xs), ys, n).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Step 1: Calculate the pairwise cosine similarity matrix
        similarity_matrix = F.cosine_similarity(xs.unsqueeze(1), xs.unsqueeze(0), dim=2)
        for i in range(len(n)):
            query_id = n[i].item()
            relevance_label = tuple(ys[i].tolist())  # Convert tensor to tuple
            relevance_label_rows = query_relevance_dict.setdefault(query_id, {}).setdefault(relevance_label, [])
            similarity_row = similarity_matrix[i]
            # Step 2: Find similar rows based on pairwise similarity
            similar_rows = torch.nonzero(similarity_row > redundancy_threshold).squeeze(dim=1)
            # Step 3: Apply redundancy threshold and add similar rows
            if len(similar_rows) > max_redundant_rows:
                similar_rows = similar_rows[:max_redundant_rows]
            relevance_label_rows.extend(similar_rows.tolist())
    # Step 4-5: Remove excess redundant rows based on the criteria
    num_rows_new_dataset = 0
    for query_id, relevance_dict in query_relevance_dict.items():
        for relevance_label, rows in relevance_dict.items():
            if len(rows) > max_redundant_rows:
                rows = rows[:max_redundant_rows]  # Update the rows list directly
                query_relevance_dict[query_id][relevance_label] = rows
            num_rows_new_dataset += len(rows)
    print("Number of rows in the new dataset:", num_rows_new_dataset)
    # Clear the query_relevance_dict after each epoch
    query_relevance_dict.clear()
    print("Finished epoch %d" % epoch)
# Step 7: Evaluate both the original and new models using NDCG
from pytorchltr.evaluation import ndcg
loader = torch.utils.data.DataLoader(
    test, batch_size=16, collate_fn=test.collate_fn())
original_model_scores = 0.0
new_model_scores = 0.0
for batch in loader:
    xs, ys, n = batch.features, batch.relevance, batch.n
    original_model_ndcg_score = ndcg(model(xs), ys, n, k=10)
    new_model = Model(dimensionality)  # Create a new instance of the model
    new_model_ndcg_score = ndcg(new_model(xs), ys, n, k=10)
    original_model_scores += float(torch.sum(original_model_ndcg_score))
    new_model_scores += float(torch.sum(new_model_ndcg_score))
original_model_ndcg = original_model_scores / len(test)
new_model_ndcg = new_model_scores / len(test)
print("ndcg@10 on original model: %f" % original_model_ndcg)
print("ndcg@10 on new model: %f" % new_model_ndcg)
My code gives the same NDCG@10 for a threshold of 0.5 and of 0.95, and it also removes the same number of rows for either threshold, so I know something must be wrong or missing. Can anyone help me?