
I have a toy corpus:

ds0 = 'the cat sat on my face'
ds1 = 'the dog sat on my bed'

and I have written two programs to compute tf-idf: one uses the base formulas directly, and the other uses TfidfVectorizer from scikit-learn. The final scores differ between the two programs and I am not sure how to interpret the results.

PROGRAM 1

# 1-26-2022: this code mimics tf-idf and is freely based on
# https://www.youtube.com/watch?app=desktop&v=hXNbFNCgPfY
import math
import pandas as pd

# calculate tf:
# number of times the word appears in a doc / total number of words in the doc
def calctf(LI, D):
    L = len(LI)
    return {word: count / L for word, count in D.items()}

# calculate idf:
# log(number_of_documents / number of documents that contain the word)
def calcidf(DOC0, DOC1, N=2):
    OUTDICT = dict.fromkeys(set(DOC0) | set(DOC1), 0)
    for doc in [DOC0, DOC1]:
        for word, count in doc.items():
            if count > 0:
                OUTDICT[word] += 1
    for word, value in OUTDICT.items():
        OUTDICT[word] = math.log(N / value)
    return OUTDICT

def computeTFIDF(tf, idf):
    return {word: value * idf[word] for word, value in tf.items()}

ds0 = 'the cat sat on my face'
ds1 = 'the dog sat on my bed'
#
# list ---> the simplest tokenization
#
l0 = ds0.split()
l1 = ds1.split()
#
# set ---> remove redundancies; the union is the vocabulary
#
s = set(l0).union(l1)
print('union set s ', s)
#
# dictionary ---> required for looking up per-document word counts
#
d0 = dict.fromkeys(s, 0)
d1 = dict.fromkeys(s, 0)
for k in l0:
    d0[k] += 1
for k in l1:
    d1[k] += 1
print('-------------------------')
print('dict d0 after counting ', d0)
print('dict d1 after counting ', d1)
print(pd.DataFrame([d0, d1]))
#
# calculate tf, idf and their product
#
tf0 = calctf(l0, d0)
tf1 = calctf(l1, d1)
idf = calcidf(d0, d1, N=2)
tfidf0 = computeTFIDF(tf0, idf)
tfidf1 = computeTFIDF(tf1, idf)
X = pd.DataFrame([tfidf0, tfidf1])
print(X)

OUTPUT OF PROGRAM 1

       face  sat       dog   my       cat       bed   on  the
0  0.115525  0.0  0.000000  0.0  0.115525  0.000000  0.0  0.0
1  0.000000  0.0  0.115525  0.0  0.000000  0.115525  0.0  0.0
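
As a sanity check, the 0.115525 entries can be reproduced by hand from the same formulas; a minimal sketch of the arithmetic for 'cat' in ds0:

import math
tf = 1 / 6             # 'cat' appears once among the 6 tokens of ds0
idf = math.log(2 / 1)  # 2 documents, and 'cat' occurs in 1 of them
print(tf * idf)        # 0.11552453009332421, matching the table above
# any word occurring in both documents gets idf = log(2/2) = 0, hence the zeros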

PROGRAM 2

# 1-31-2022: this code uses the same corpus as term_frequency_sklearn_TEST.py
# and uses scikit-learn
# reference: https://www.youtube.com/watch?app=desktop&v=BJ0MnawUpaU
#
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

ds0 = 'the cat sat on my face'
ds1 = 'the dog sat on my bed'
stopset = set(stopwords.words('english'))
corpus = [ds0, ds1]
print('corpus ', corpus)
vectorizer = TfidfVectorizer(stop_words=list(stopset), use_idf=True)
X = vectorizer.fit_transform(corpus)
first = X[0]
second = X[1]
# note: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement
Z0 = pd.DataFrame(first.T.todense(), index=vectorizer.get_feature_names_out())
Z1 = pd.DataFrame(second.T.todense(), index=vectorizer.get_feature_names_out())
print(Z0)
print(Z1)

OUTPUT OF PROGRAM 2

             0
bed   0.000000
cat   0.631667
dog   0.000000
face  0.631667
sat   0.449436
             0
bed   0.631667
cat   0.000000
dog   0.631667
face  0.000000
sat   0.449436

While both programs differentiate the documents, the scores differ, and the word 'sat', which is not an English stopword, is assigned a different value by each program (0.0 in program 1 versus 0.449436 in program 2). How should I interpret this?
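
For comparison, here is a minimal sketch, assuming TfidfVectorizer's documented defaults (smooth_idf=True, norm='l2', and tf taken as the raw count), that reproduces program 2's numbers for the first document by hand:

import math
# after stopword removal: doc0 = [cat, sat, face], doc1 = [dog, sat, bed]
n_docs = 2
# scikit-learn's smoothed idf: ln((1 + n_docs) / (1 + df)) + 1
idf_sat = math.log((1 + n_docs) / (1 + 2)) + 1   # df('sat') = 2 -> idf = 1.0
idf_cat = math.log((1 + n_docs) / (1 + 1)) + 1   # df('cat') = 1 -> idf ~ 1.405465
# tf is the raw count (1 for each surviving word); the row is then L2-normalized
row0 = {'cat': idf_cat, 'face': idf_cat, 'sat': idf_sat}
norm = math.sqrt(sum(v * v for v in row0.values()))
print({w: v / norm for w, v in row0.items()})
# {'cat': 0.631667..., 'face': 0.631667..., 'sat': 0.449436...}

Under the plain formula of program 1, idf('sat') = log(2/2) = 0, so 'sat' is zeroed out; the smoothed idf keeps it at 1.0 before the L2 normalization rescales each row, which is why none of the scikit-learn numbers match the raw tf * idf products.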

Comment: Please have a look at this [related answer](https://stackoverflow.com/a/70757379/17865804) as well. – Chris Nov 15 '22 at 06:46
