I apologise in advance for posting so much code.
I am trying to use NLTK's Naive Bayes classifier to classify YouTube comments into those that contain an opinion (positive or negative) and those that don't, but no matter what I do during the preprocessing stage I can't get the accuracy above 0.75. That seems quite low compared to other examples I have seen - this tutorial, for example, ends up with an accuracy of around 0.98.
Here is my full code:
import nltk, re, json, random
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist, classify, NaiveBayesClassifier
from contractions import CONTRACTION_MAP
from abbreviations import abbrev_map
from tqdm.notebook import tqdm
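
# expand abbreviations and contractions (also normalises curly apostrophes and "luv" -> "lov")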
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    text = re.sub(r"’", "'", text)
    if text in abbrev_map:
        return abbrev_map[text]
    text = re.sub(r"\bluv", "lov", text)
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text
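
# collapse any character repeated three or more times down to two (e.g. "soooo" -> "soo")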
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)
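
# POS-tag each token and lemmatize it as a noun, verb or (by default) adjective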
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence
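
# full cleaning pipeline: lower-case, expand contractions, tokenize, reduce lengthening, then lemmatize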
def processor(comments_list):
    new_comments_list = []
    for com in tqdm(comments_list):
        com = com.lower()
        # expand out contractions
        tok = com.split(" ")
        z = []
        for w in tok:
            ex_w = expand_contractions(w)
            z.append(ex_w)
        st = " ".join(z)
        tokenized = tokenizer.tokenize(st)
        reduced = [reduce_lengthening(token) for token in tokenized]
        new_comments_list.append(reduced)
    lemmatized = [lemmatize_sentence(new_com) for new_com in new_comments_list]
    return lemmatized
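
# flatten the per-comment token lists into a single stream of words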
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
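
# convert each token list into the {token: True} feature dict format NaiveBayesClassifier expects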
def get_comments_for_model(cleaned_tokens_list):
    for comment_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in comment_tokens)
if __name__ == "__main__":
    # =================================================================================
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    with open("english_lang/samples/training_set.json", "r", encoding="utf8") as f:
        train_data = json.load(f)
    pos_processed = processor(train_data['pos'])
    neg_processed = processor(train_data['neg'])
    neu_processed = processor(train_data['neu'])
    emotion = pos_processed + neg_processed
    random.shuffle(emotion)
    em_tokens_for_model = get_comments_for_model(emotion)
    neu_tokens_for_model = get_comments_for_model(neu_processed)
    em_dataset = [(comment_dict, "Emotion")
                  for comment_dict in em_tokens_for_model]
    neu_dataset = [(comment_dict, "Neutral")
                   for comment_dict in neu_tokens_for_model]
    dataset = em_dataset + neu_dataset
    random.shuffle(dataset)
    x = 700
    tr_data = dataset[:x]
    te_data = dataset[x:]
    classifier = NaiveBayesClassifier.train(tr_data)
    print(classify.accuracy(classifier, te_data))
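For what it's worth, the 0.75 comes from a single random 700-comment split, so I have also been wondering how much the number moves between shuffles and what the majority-class floor is. Something along these lines is roughly what I had in mind for checking that (it only re-uses dataset, NaiveBayesClassifier and classify from the script above; the five repeats are arbitrary and the printed numbers are obviously specific to my data):

from collections import Counter

for i in range(5):
    random.shuffle(dataset)
    tr_data, te_data = dataset[:700], dataset[700:]
    clf = NaiveBayesClassifier.train(tr_data)
    # accuracy on this particular held-out split
    print(classify.accuracy(clf, te_data))

# accuracy you would get by always predicting the most common label in the held-out set
counts = Counter(label for _, label in te_data)
print(counts, counts.most_common(1)[0][1] / len(te_data))

If 0.75 is only slightly above that floor, I assume the model is not really learning much.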
I can post my training data set if needed, but it's probably worth mentioning that the quality of English in the YouTube comments themselves is very poor and inconsistent (which I imagine is the main reason for the low model accuracy). In any case, would this be considered an acceptable level of accuracy? Alternatively, I may well be going about this all wrong and there is a far better model to use, in which case feel free to tell me I am an idiot! Thanks in advance.