In my problem, I'm trying to compare the perplexity values of different N-gram models, up to N=4. However, the results I get don't match what I get from another method (NLTK's built-in perplexity, mentioned at the end), and I'm confused about the discrepancy. Here is my first implementation:
import nltk
nltk.download('punkt')
nltk.download('webtext')
from nltk.corpus import webtext
nltk.download('stopwords')
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace
from nltk.lm import MLE
from sklearn.model_selection import train_test_split
from decimal import Decimal
import numpy as np
from nltk.util import ngrams
corpus = []
for fileid in webtext.fileids():
    corpus += [list(i) for i in webtext.sents(fileid)]
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def preprocess_sentence(text):
    text = text.lower()
    # remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[0-9]', '', text)
    # collapse repeated spaces
    text = re.sub(' +', ' ', text)
    # remove leading/trailing spaces
    text = text.strip()
    # remove stop words
    f = []
    for w in text.split(" "):
        if w not in stopwords:
            f.append(w)
    text = " ".join(f)
    # text = text.split(" ")
    # text = [stemmer.stem(word) for word in text]
    # text = [lemmatizer.lemmatize(word) for word in text]
    return text
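# e.g. preprocess_sentence("The 3 cats sat!!") returns "cats sat"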
data = []
for sent in corpus:
    newstr = ' '.join(sent)
    newstr = preprocess_sentence(newstr)
    newlist = newstr.split(' ')
    data.append(newlist)
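(`train` and `test` below come from an ordinary train/test split of `data`; the exact ratio is arbitrary:)
# split sentences into training and held-out test sets (ratio assumed)
train, test = train_test_split(data, test_size=0.2)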
**Bigram Model**
n = 2
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_bi = Laplace(n)
model_bi.fit(train_data, padded_vocab)
generated_2grams = []
for sent in test:
    generated_2grams.append(list(ngrams(sent, 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
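For each test sentence I then compute the ordinary per-word perplexity,

PP(s) = ( ∏ᵢ 1 / P(wᵢ | wᵢ₋₁) )^(1/N),

where N is the number of padded bigrams in the sentence, and finally average these per-sentence values: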
PP_bi = []
for sent in generated_2grams:
    totalWords = len(sent)
    sent_pp = 1
    for bigram in sent:
        w1, w2 = bigram
        score = model_bi.score(w2, [w1])  # P(w2 | w1) for each bigram
        if score != 0:  # Laplace smoothing should never give 0, but just in case
            sent_pp *= (1 / Decimal(score))
    sent_pp = pow(sent_pp, Decimal(1) / totalWords)
    PP_bi.append(sent_pp)
print(f"Perplexity of a Bigram Model is {sum(PP_bi) / len(PP_bi)}")
**Trigram Model**
n = 3
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_tri = Laplace(n)
model_tri.fit(train_data, padded_vocab)
generated_3grams = []
for sent in test:
    generated_3grams.append(list(ngrams(sent, 3, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
PP_tri = []
for sent in generated_3grams:
    totalWords = len(sent)
    sent_pp = 1
    for trigram in sent:
        *context, word = trigram
        score = model_tri.score(word, context)  # P(w3 | w1, w2)
        if score != 0:
            sent_pp *= (1 / Decimal(score))
    sent_pp = pow(sent_pp, Decimal(1) / totalWords)
    PP_tri.append(sent_pp)
print(f"Perplexity of a Trigram Model is {sum(PP_tri) / len(PP_tri)}")
**Quadgram Model**
n = 4
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_quad = Laplace(n)
model_quad.fit(train_data, padded_vocab)
generated_4grams = []
for sent in test:
    generated_4grams.append(list(ngrams(sent, 4, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
PP_quad = []
for sent in generated_4grams:
    totalWords = len(sent)
    sent_pp = 1
    for quadgram in sent:
        *context, word = quadgram
        score = model_quad.score(word, context)  # P(w4 | w1, w2, w3)
        if score != 0:
            sent_pp *= (1 / Decimal(score))
    sent_pp = pow(sent_pp, Decimal(1) / totalWords)
    PP_quad.append(sent_pp)
print(f"Perplexity of a Quadgram Model is {sum(PP_quad) / len(PP_quad)}")
Results:
1) Perplexity of Bigram: 12900.02
2) Perplexity of Trigram: 6241.26
3) Perplexity of Quadgram: 6804.64
I expected perplexity to keep decreasing in the quadgram model, but it is higher than for the trigram.
Moreover, I'm not sure that either the perplexity values I'm getting or the method I've implemented is correct.
I also tried computing perplexities with NLTK's built-in model.perplexity() method, feeding it the padded bigrams of each sentence in the test corpus, but I get different results there too.
With the two approaches disagreeing, I'm quite confused.
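For reference, the built-in call looks roughly like this (a sketch; pad_both_ends comes from nltk.lm.preprocessing and bigrams from nltk.util):

from nltk.lm.preprocessing import pad_both_ends
from nltk.util import bigrams

# per-sentence perplexity from NLTK's own implementation
pp_builtin = [model_bi.perplexity(bigrams(pad_both_ends(sent, n=2))) for sent in test]
print(sum(pp_builtin) / len(pp_builtin))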