I'm currently learning the TF-IDF vectorizer and was trying to implement TF-IDF from scratch for better understanding. I created the class below to implement a TF-IDF vectorizer.
The class has two major methods. The fit() method finds the unique words in the document corpus and calculates the associated IDF value for each of them. The transform() method generates the TF-IDF vectors for the documents in the corpus.
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np
class TFIDFVectoriser_:
    """From-scratch TF-IDF vectoriser.

    fit() learns the vocabulary (token -> column index) and the smoothed IDF
    value of every token; transform() builds the L2-normalised sparse TF-IDF
    matrix.  The IDF formula used, idf(t) = 1 + ln((1 + N) / (1 + df(t))),
    matches sklearn's TfidfVectorizer with smooth_idf=True, and the final
    L2 row normalisation matches sklearn's output as well.
    """

    def __init__(self):
        """Initialise an empty vocabulary and the sparse-matrix buffers."""
        self.__uniqueSet = set()    # unique tokens collected during fit()
        self.__uniqueVocab = {}     # token -> column number
        self.__rows = []            # row (document) indices of sparse entries
        self.__columns = []         # column (token) indices of sparse entries
        self.__values = []          # tf-idf value of each sparse entry

    def fit(self, dataset):
        """Learn the vocabulary and IDF values from a corpus.

        Parameters
        ----------
        dataset : list of str
            The document corpus, one string per document.

        Returns
        -------
        tuple (dict, dict) or None
            (token -> column index, token -> IDF value); prints an error and
            returns None when the input is not a list.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return
        for data in tqdm(dataset):
            # split() (any whitespace) is used everywhere — fit, IDF and
            # transform — so the three stages agree on tokenisation; the
            # original mixed split(" ") and split(), which disagreed on
            # multi-space or tab-separated text.
            for token in data.split():
                if len(token) < 2:  # drop 1-char tokens (crude stop-word filter)
                    continue
                self.__uniqueSet.add(token)
        # Sort for a deterministic column ordering.
        self.__uniqueSet = sorted(self.__uniqueSet)
        self.__uniqueVocab = {w: i for i, w in enumerate(self.__uniqueSet)}
        idf_values = self.__IDF(dataset, self.__uniqueVocab)
        return self.__uniqueVocab, idf_values

    def __IDF(self, dataset, vocab):
        """Return the smoothed IDF value of every token in ``vocab``.

        Document frequencies are computed in one pass over the corpus
        (O(total tokens)) instead of re-scanning every document once per
        vocabulary word (O(|vocab| * corpus)) as the naive version did.
        """
        doc_freq = Counter()
        for data in dataset:
            # set() so each document contributes at most 1 to a token's df
            doc_freq.update(set(data.split()))
        n_docs = len(dataset)
        return {w: 1 + math.log((1 + n_docs) / (1 + doc_freq[w])) for w in vocab}

    def transform(self, dataset, vocab, idfValues):
        """Build the L2-normalised TF-IDF matrix for a corpus.

        Parameters
        ----------
        dataset : list of str
            The documents to vectorise.
        vocab : dict
            token -> column index, as returned by fit().
        idfValues : dict
            token -> IDF value, as returned by fit().

        Returns
        -------
        scipy.sparse.csr_matrix of shape (len(dataset), len(vocab)), or None
        (with an error message) when the input is not a list.
        """
        if not isinstance(dataset, list):
            print("Input Error: This function only accepts list as input. Pass a list of words to continue...")
            return
        # Bug fix: reset the buffers on every call.  Without this, repeated
        # transform() calls accumulate stale (row, col, value) triplets and
        # csr_matrix silently SUMS duplicate coordinates, corrupting results.
        self.__rows, self.__columns, self.__values = [], [], []
        for doc_idx, row in enumerate(tqdm(dataset)):
            tokens = row.split()
            for word, freq in Counter(tokens).items():
                if len(word) < 2:  # mirror the fit()-time filter
                    continue
                col_idx = vocab.get(word, -1)  # -1 marks out-of-vocabulary words
                if col_idx != -1:
                    self.__rows.append(doc_idx)
                    self.__columns.append(col_idx)
                    # tf = term frequency / document length; tf-idf = tf * idf
                    self.__values.append((freq / len(tokens)) * idfValues[word])
        sparse_matrix = csr_matrix(
            (self.__values, (self.__rows, self.__columns)),
            shape=(len(dataset), len(vocab)),
        )
        # L2-normalise each row to match sklearn's TfidfVectorizer output.
        return normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
Now I'm trying to enhance this class further so that the fit() method returns only the top 50 features, ranked by IDF score. I tried updating the fit() method with the lines below to get the top-50 IDF scores and their associated features.
# Rank tokens by IDF score (descending); break ties alphabetically so the
# selection is fully deterministic even when many tokens share a score.
sortedIDF = sorted(__idfValues.items(), key=lambda item: (-item[1], item[0]))
# Keep the 50 best (token -> IDF); dict() consumes the (token, score) pairs
# directly — no manual loop needed.
__top50idf = dict(sortedIDF[:50])
__top50Set = set(__top50idf)
# Bug fix: the original enumerated a set to assign column numbers, but set
# iteration order is arbitrary, so the vocabulary columns were
# nondeterministic.  Enumerate the tokens in sorted order instead.
__top50Vocab = {w: i for i, w in enumerate(sorted(__top50idf))}
['angelina', 'angela', 'angel', 'anatomist', 'amust', 'amusing', 'amazingly', 'amazed', 'amaze', 'amateurish', 'alongside', 'allowing', 'allow', 'allison', 'alike', 'alert', 'akin', 'akasha', 'aired', 'aimless', 'agreed', 'agree', 'ages', 'aged', 'afternoon', 'affleck', 'affected', 'aesthetically', 'adventure', 'adrift', 'admitted', 'admiration', 'admins', 'added', 'add', 'adams', 'actions', 'ackerman', 'achille', 'accurately', 'accurate', 'accolades', 'acclaimed', 'accessible', 'accents', 'academy', 'abstruse', 'abroad', 'abandoned', 'aailiyah']
50
[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]
I was able to get the top 50 features with respect to the top 50 IDF values, which include duplicated (tied) IDF scores. I wanted to understand whether my approach is correct and whether there is a better way to obtain the top-n features from the TF-IDF vectorizer.