I'm currently learning the TF-IDF vectorizer and was trying to implement TF-IDF from scratch for better understanding. I created the class below to implement a TF-IDF vectorizer.
The class has two major methods. The fit() method finds the unique words in the document corpus and calculates the associated IDF value for each of them. The transform() method generates the TF-IDF vectors for the documents in the corpus.
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np
class TFIDFVectoriser_:
    """From-scratch TF-IDF vectoriser.

    fit() learns the vocabulary (token -> column index) and the smoothed IDF
    value of every token; transform() builds the L2-normalised sparse TF-IDF
    matrix.  The IDF formula used, idf(t) = 1 + ln((1 + N) / (1 + df(t))),
    matches sklearn's TfidfVectorizer with smooth_idf=True, and the final
    L2 row normalisation matches sklearn's output as well.
    """

    def __init__(self):
        """Initialise an empty vocabulary and the sparse-matrix buffers."""
        self.__uniqueSet = set()    # unique tokens collected during fit()
        self.__uniqueVocab = {}     # token -> column number
        self.__rows = []            # row (document) indices of sparse entries
        self.__columns = []         # column (token) indices of sparse entries
        self.__values = []          # tf-idf value of each sparse entry

    def fit(self, dataset):
        """Learn the vocabulary and IDF values from a corpus.

        Parameters
        ----------
        dataset : list of str
            The document corpus, one string per document.

        Returns
        -------
        tuple (dict, dict) or None
            (token -> column index, token -> IDF value); prints an error and
            returns None when the input is not a list.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return
        for data in tqdm(dataset):
            # split() (any whitespace) is used everywhere — fit, IDF and
            # transform — so the three stages agree on tokenisation; the
            # original mixed split(" ") and split(), which disagreed on
            # multi-space or tab-separated text.
            for token in data.split():
                if len(token) < 2:  # drop 1-char tokens (crude stop-word filter)
                    continue
                self.__uniqueSet.add(token)
        # Sort for a deterministic column ordering.
        self.__uniqueSet = sorted(self.__uniqueSet)
        self.__uniqueVocab = {w: i for i, w in enumerate(self.__uniqueSet)}
        idf_values = self.__IDF(dataset, self.__uniqueVocab)
        return self.__uniqueVocab, idf_values

    def __IDF(self, dataset, vocab):
        """Return the smoothed IDF value of every token in ``vocab``.

        Document frequencies are computed in one pass over the corpus
        (O(total tokens)) instead of re-scanning every document once per
        vocabulary word (O(|vocab| * corpus)) as the naive version did.
        """
        doc_freq = Counter()
        for data in dataset:
            # set() so each document contributes at most 1 to a token's df
            doc_freq.update(set(data.split()))
        n_docs = len(dataset)
        return {w: 1 + math.log((1 + n_docs) / (1 + doc_freq[w])) for w in vocab}

    def transform(self, dataset, vocab, idfValues):
        """Build the L2-normalised TF-IDF matrix for a corpus.

        Parameters
        ----------
        dataset : list of str
            The documents to vectorise.
        vocab : dict
            token -> column index, as returned by fit().
        idfValues : dict
            token -> IDF value, as returned by fit().

        Returns
        -------
        scipy.sparse.csr_matrix of shape (len(dataset), len(vocab)), or None
        (with an error message) when the input is not a list.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return
        # Bug fix: reset the buffers on every call.  Without this, repeated
        # transform() calls accumulate stale (row, col, value) triplets and
        # csr_matrix silently SUMS duplicate coordinates, corrupting results.
        self.__rows, self.__columns, self.__values = [], [], []
        for doc_idx, row in enumerate(tqdm(dataset)):
            tokens = row.split()
            for word, freq in Counter(tokens).items():
                if len(word) < 2:  # mirror the fit()-time filter
                    continue
                col_idx = vocab.get(word, -1)  # -1 marks out-of-vocabulary words
                if col_idx != -1:
                    self.__rows.append(doc_idx)
                    self.__columns.append(col_idx)
                    # tf = term frequency / document length; tf-idf = tf * idf
                    self.__values.append((freq / len(tokens)) * idfValues[word])
        sparse_matrix = csr_matrix(
            (self.__values, (self.__rows, self.__columns)),
            shape=(len(dataset), len(vocab)),
        )
        # L2-normalise each row to match sklearn's TfidfVectorizer output.
        return normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
Now I'm trying to enhance this class further so that the fit() method returns only the top 50 features, ranked by IDF score. I tried updating the fit() method with the lines below to get the top-50 IDF scores and their associated features.
# Rank tokens by IDF score (descending); break ties alphabetically so the
# selection is fully deterministic even when many tokens share a score.
sortedIDF = sorted(__idfValues.items(), key=lambda item: (-item[1], item[0]))
# Keep the 50 best (token -> IDF); dict() consumes the (token, score) pairs
# directly — no manual loop needed.
__top50idf = dict(sortedIDF[:50])
__top50Set = set(__top50idf)
# Bug fix: the original enumerated a set to assign column numbers, but set
# iteration order is arbitrary, so the vocabulary columns were
# nondeterministic.  Enumerate the tokens in sorted order instead.
__top50Vocab = {w: i for i, w in enumerate(sorted(__top50idf))}
['angelina', 'angela', 'angel', 'anatomist', 'amust', 'amusing', 'amazingly', 'amazed', 'amaze', 'amateurish', 'alongside', 'allowing', 'allow', 'allison', 'alike', 'alert', 'akin', 'akasha', 'aired', 'aimless', 'agreed', 'agree', 'ages', 'aged', 'afternoon', 'affleck', 'affected', 'aesthetically', 'adventure', 'adrift', 'admitted', 'admiration', 'admins', 'added', 'add', 'adams', 'actions', 'ackerman', 'achille', 'accurately', 'accurate', 'accolades', 'acclaimed', 'accessible', 'accents', 'academy', 'abstruse', 'abroad', 'abandoned', 'aailiyah']
50
[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]
I was able to get the top 50 features with respect to the top 50 IDF values, which include duplicated (tied) IDF scores. I wanted to understand whether my approach is correct and whether there is a better way to obtain the top-n features from the TF-IDF vectorizer.