In my problem, I'm trying to compare the perplexity values of different N-gram models, up to N=4. However, the results I get don't match what I get from another method (NLTK's built-in perplexity, mentioned at the end), and I'm confused about the discrepancy. Here is my first implementation:
import nltk
nltk.download('punkt')
nltk.download('webtext')
from nltk.corpus import webtext
nltk.download('stopwords')
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace
from nltk.lm import MLE
from sklearn.model_selection import train_test_split
from decimal import Decimal
import numpy as np
from nltk.util import ngrams
corpus = []
for fileid in webtext.fileids():
    corpus += [list(i) for i in webtext.sents(fileid)]
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def preprocess_sentence(text):
    text = text.lower()
    # remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[0-9]', '', text)
    # collapse repeated spaces
    text = re.sub(' +', ' ', text)
    # remove leading/trailing spaces
    text = text.strip()
    # remove stop words
    f = []
    for w in text.split(" "):
        if w not in stopwords:
            f.append(w)
    text = " ".join(f)
    # text = text.split(" ")
    # text = [stemmer.stem(word) for word in text]
    # text = [lemmatizer.lemmatize(word) for word in text]
    return text
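# e.g. preprocess_sentence("The 3 cats sat!!") returns "cats sat"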
data = []
for sent in corpus:
    newstr = ' '.join(sent)
    newstr = preprocess_sentence(newstr)
    newlist = newstr.split(' ')
    data.append(newlist)
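(`train` and `test` below come from an ordinary train/test split of `data`; the exact ratio is arbitrary:)
# split sentences into training and held-out test sets (ratio assumed)
train, test = train_test_split(data, test_size=0.2)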
**Bigram Model**
n = 2
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_bi = Laplace(n)
model_bi.fit(train_data, padded_vocab)
generated_2grams = []
for sent in test:
    generated_2grams.append(list(ngrams(sent, 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
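For each test sentence I then compute the ordinary per-word perplexity,

PP(s) = ( ∏ᵢ 1 / P(wᵢ | wᵢ₋₁) )^(1/N),

where N is the number of padded bigrams in the sentence, and finally average these per-sentence values: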
PP_bi = []
for sent in generated_2grams:
    totalWords = len(sent)
    sent_pp = 1
    for bigram in sent:
        w1, w2 = bigram
        score = model_bi.score(w2, [w1])  # P(w2 | w1) for each bigram
        if score != 0:  # Laplace smoothing should never give 0, but just in case
            sent_pp *= (1 / Decimal(score))
    sent_pp = pow(sent_pp, Decimal(1) / totalWords)
    PP_bi.append(sent_pp)
print(f"Perplexity of a Bigram Model is {sum(PP_bi) / len(PP_bi)}")
**Trigram Model**
n = 3
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_tri = Laplace(n)
model_tri.fit(train_data, padded_vocab)
generated_3grams = []
for sent in test:
    generated_3grams.append(list(ngrams(sent, 3, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
PP_tri = []
for sent in generated_3grams:
    totalWords = len(sent)
    sent_pp = 1
    for trigram in sent:
        *context, word = trigram
        score = model_tri.score(word, context)  # P(w3 | w1, w2)
        if score != 0:
            sent_pp *= (1 / Decimal(score))
    sent_pp = pow(sent_pp, Decimal(1) / totalWords)
    PP_tri.append(sent_pp)
print(f"Perplexity of a Trigram Model is {sum(PP_tri) / len(PP_tri)}")
**Quadgram Model**
n = 4
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_quad = Laplace(n)
model_quad.fit(train_data, padded_vocab)
generated_4grams = []
for sent in test:
    generated_4grams.append(list(ngrams(sent, 4, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
PP_quad = []
for sent in generated_4grams:
    totalWords = len(sent)
    sent_pp = 1
    for quadgram in sent:
        *context, word = quadgram
        score = model_quad.score(word, context)  # P(w4 | w1, w2, w3)
        if score != 0:
            sent_pp *= (1 / Decimal(score))
    sent_pp = pow(sent_pp, Decimal(1) / totalWords)
    PP_quad.append(sent_pp)
print(f"Perplexity of a Quadgram Model is {sum(PP_quad) / len(PP_quad)}")
Results:
1) Perplexity of Bigram: 12900.02
2) Perplexity of Trigram: 6241.26
3) Perplexity of Quadgram: 6804.64
I expected perplexity to keep decreasing in the quadgram model, but it is higher than for the trigram.
Moreover, I'm not sure that either the perplexity values I'm getting or the method I've implemented is correct.
I also tried computing perplexities with NLTK's built-in model.perplexity() method, feeding it the padded bigrams of each sentence in the test corpus, but I get different results there too.
With the two approaches disagreeing, I'm quite confused.
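For reference, the built-in call looks roughly like this (a sketch; pad_both_ends comes from nltk.lm.preprocessing and bigrams from nltk.util):

from nltk.lm.preprocessing import pad_both_ends
from nltk.util import bigrams

# per-sentence perplexity from NLTK's own implementation
pp_builtin = [model_bi.perplexity(bigrams(pad_both_ends(sent, n=2))) for sent in test]
print(sum(pp_builtin) / len(pp_builtin))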