I am trying to find the most similar match in a column of a pandas dataframe to an input string that is not in English (Swedish). This is what I have tried. I have encoded both my input string and the texts in the pandas' column and then I tried to calculate cosine similairy between the two:
# Load the pre-trained BERT model for Swedish language
tokenizer = AutoTokenizer.from_pretrained("KBLab/sentence-bert-swedish-cased")
model = AutoModel.from_pretrained("KBLab/sentence-bert-swedish-cased")
def find_similar(input_text, column_name, df):
# Encode the input text and the text in the column
encoded_input = tokenizer.encode(input_text, return_tensors='pt', truncation=True)
input_ids = encoded_input[0]
column_text = df[column_name].tolist()
column_ids = [tokenizer.encode(text, return_tensors='pt') for text in column_text]
# Calculate the similarity score between the input text and each text in the column
with torch.no_grad():
input_embeddings = model(encoded_input).last_hidden_state
column_embeddings = [model(column_id).last_hidden_state for column_id in column_ids]
similarity_scores = [cosine_similarity(input_embeddings, embedding).item() for embedding in column_embeddings]
# Find the index of the text with the highest similarity score
max_index = similarity_scores.index(max(similarity_scores))
# Return the most similar text
return column_text[max_index]
input_text = test
column_name = "DESC"
most_similar_text = find_similar(input_text, column_name, df)
print(most_similar_text)
But this throws the RuntimeError
:
RuntimeError Traceback (most recent call last)
Cell In [49], line 3
1 input_text = test
2 column_name = "DESC"
----> 3 most_similar_text = find_similar(input_text, column_name, df)
4 print(most_similar_text)
Cell In [48], line 12, in find_similar(input_text, column_name, df)
10 input_embeddings = model(encoded_input).last_hidden_state
11 column_embeddings = [model(column_id).last_hidden_state for column_id in column_ids]
---> 12 similarity_scores = [cosine_similarity(input_embeddings, embedding).item() for embedding in column_embeddings]
14 # Find the index of the text with the highest similarity score
15 max_index = similarity_scores.index(max(similarity_scores))
Cell In [48], line 12, in <listcomp>(.0)
10 input_embeddings = model(encoded_input).last_hidden_state
11 column_embeddings = [model(column_id).last_hidden_state for column_id in column_ids]
---> 12 similarity_scores = [cosine_similarity(input_embeddings, embedding).item() for embedding in column_embeddings]
14 # Find the index of the text with the highest similarity score
15 max_index = similarity_scores.index(max(similarity_scores))
RuntimeError: The size of tensor a (19) must match the size of tensor b (10) at non-singleton dimension 1
Is there a way to resolve this?