I am trying to implement a naïve Bayes model to classify messages. My code predicts only three distinct classes, even though there are 20 classes in the training data. Did I implement it correctly? Can you see anything wrong in my code?
Here are my steps:
1) From the training messages, I build a dictionary of unique words and their frequencies, after removing all punctuation and stop words and lowercasing everything.
2) Then I select the 1000 most frequent words as my feature words. (I suspect this step is wrong; I feel like I should also look at how many different messages each word appears in.)
3) Then I turn each message into a vector of frequencies of the feature words.
4) Then I compute the priors: for each class, the number of messages in that class divided by the total number of messages.
5) Then I build a table of conditional probabilities p(wi|cj), the probability of seeing feature word wi given class cj, computed with the formula
p(wi|cj) = (1 + numberOfOccurrencesOfWiInCj) / (totalNumberOfWordsInCj + numberOfFeatureWords).
I read that we can use the inverse document frequency of a term t, idf(t) = log(nd / nd(t)), where nd is the total number of texts and nd(t) is the number of texts containing the term t. My question here is: how do we actually use inverse document frequency in the code? (I put my best guess as a sketch right after this list.)
6) Then for each message, I compute a score for each class cj by first vectorizing the message and then computing
log(prior_of_cj) + log(1 + count_of_w1)·log(p(w1|cj)) + ... + log(1 + count_of_w1000)·log(p(w1000|cj)).
7) Then I pick the class with the maximum score as my prediction.
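Here is my best guess at how the inverse document frequency would be used: compute idf(t) = log(nd / nd(t)) once from the training messages, then either weight each word count by its idf when vectorizing (i.e. tf-idf), or use it to re-rank the candidate feature words. This is only a sketch of what I mean, not code I have verified; compute_idf and tfidf_vector are names I made up:

import math
from collections import Counter

def compute_idf(tokenized_messages):
    # idf(t) = log(nd / nd(t)): nd = number of messages,
    # nd(t) = number of messages containing term t
    nd = len(tokenized_messages)
    doc_freq = Counter()
    for tokens in tokenized_messages:
        doc_freq.update(set(tokens))  # count each term at most once per message
    return {t: math.log(nd / df) for t, df in doc_freq.items()}

def tfidf_vector(tokens, feature_words, idf):
    # replace each raw feature-word count by count * idf(word)
    counts = Counter(tokens)
    return [counts[w] * idf.get(w, 0.0) for w in feature_words]

I guess step 3 would then use tfidf_vector instead of raw counts, but I am not sure.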
import numpy as np
import string
import re
import pickle
import csv
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.text import Text
from nltk.stem import WordNetLemmatizer
import itertools
import operator
import pandas as pd
#imports end
#data load
with open('/content/drive/My Drive/Colab Notebooks/data_train.pkl', 'rb') as f:
    train = pickle.load(f)
with open('/content/drive/My Drive/Colab Notebooks/data_test.pkl', 'rb') as f:
    test = pickle.load(f)
#data_list=train[0]
#y_train=train[1]
original_train= train[0][0:1000]
data_list= original_train
originaly_train=train[1][0:1000]
y_train=originaly_train
test_sample=test[0:100]
# # preprocess Start
def preProcess(content):
    content = content.lower()  # to lower case
    content = re.sub(r'\d+', '', content)  # remove digits
    content = content.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    content = content.strip()  # remove leading/trailing spaces
    return re.sub(r"\s\s+", " ", content)  # collapse repeated whitespace
# make a string of concatenation of all strings.
concatAllString= ""
for index, temp in enumerate(data_list):
    data_list[index] = preProcess(temp)
    concatAllString += " "
    concatAllString += data_list[index]
#get a dictionary of the frequency of each unique word, sorted by frequency
words = nltk.tokenize.word_tokenize(concatAllString)
fdist1 = nltk.FreqDist(words)
all_words_dict = dict((word, freq) for word, freq in fdist1.items() if not word.isdigit())
all_words_dict=dict(sorted(all_words_dict.items(), key=lambda x: x[1], reverse=True))
#Step4: filter out stop words and words longer than 6 or shorter than 2 characters
stop_words = set(stopwords.words('english'))
all_words_dict={k:v for k, v in all_words_dict.items() if k not in stop_words and len(k)<=6 and len(k)>1}
#Step5: get 1000 feature words with highest frequencies from the all_words_dictionary
feature_words = dict(itertools.islice(all_words_dict.items(), 1000))
#Step6: create vectorized version of messages in the training set
def vectorizeData(message):
    c = Counter(preProcess(message).split())
    return [c[i] for i in feature_words]
vectorized_training=[[]]*len(original_train)
for index, temp in enumerate(original_train):
    vectorized_training[index] = vectorizeData(temp)
#Step 6: Compute prior table
uniquelabels = []
for i in y_train:
    if i not in uniquelabels:
        uniquelabels.append(i)
fdist2 = nltk.FreqDist(y_train)
priors = dict((word, freq) for word, freq in fdist2.items() if not word.isdigit())
for key, value in priors.items():
    priors[key] = float(value) / len(y_train)
#Step7 : compute conditional probabilities
smoothingParameter=1.0
tableOfConditionals=np.zeros((len(priors), len(feature_words)))
for i in range(len(priors)):
    tempDatas = np.array(vectorized_training)[np.array(y_train) == list(priors.keys())[i], :]
    tempDatas = np.sum(tempDatas, axis=0)
    for j in range(len(feature_words)):
        tableOfConditionals[i, j] = (tempDatas[j] + smoothingParameter) / (np.sum(tempDatas) + smoothingParameter * len(feature_words))
#Step8: Compute probability of being in each class for a given example
def computeLabel(ex):
    labelProbabilities = dict.fromkeys(priors, 0)
    vectorTemp = vectorizeData(ex)
    indexi = 0
    for key, value in priors.items():
        result = 0
        for index in vectorTemp:
            result = result + np.log(tableOfConditionals[indexi, index]) * np.log(vectorTemp[index] + 1)
        labelProbabilities[key] = result + np.log(value)
        indexi = indexi + 1
    return max(labelProbabilities.items(), key=operator.itemgetter(1))[0]
predictions = ['H'] * len(test_sample)
for i in range(len(test_sample)):
    predictions[i] = computeLabel(test_sample[i])
print(predictions)
# with open('/content/drive/My Drive/Colab Notebooks/randompredict.csv','w') as filehandle:
# filehandle.writelines("%s\n" % predict for predict in predictions)
#Final step: Compute probability of each example in the testing and store in submission file
print("Your submission was successfully saved!")
Please help, I have been working on this for three days... I think the problem has to do with how I choose the feature words: I only pick the ones that appear most frequently overall, which probably isn't a good way to capture how important a word is to a document (the sketch below shows the kind of selection I have in mind).
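For concreteness, this is the kind of feature selection I was imagining instead of just taking the 1000 most frequent words: drop words that appear in almost every message or in only one or two, and rank the rest by their total tf-idf weight. Again this is only a rough sketch with made-up names (select_feature_words), not something I have tested:

import math
from collections import Counter

def select_feature_words(tokenized_messages, num_features=1000, min_df=2, max_df_ratio=0.5):
    nd = len(tokenized_messages)
    term_freq = Counter()
    doc_freq = Counter()
    for tokens in tokenized_messages:
        term_freq.update(tokens)
        doc_freq.update(set(tokens))  # document frequency: at most one count per message
    scores = {}
    for w, tf in term_freq.items():
        df = doc_freq[w]
        if df < min_df or df / nd > max_df_ratio:
            continue  # too rare or too common to be informative
        scores[w] = tf * math.log(nd / df)  # total tf-idf weight of the word
    return sorted(scores, key=scores.get, reverse=True)[:num_features]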