I'm trying to write a function that reads a txt file and processes it by tokenizing it into words, removing whitespace, removing stop words, stemming, and collecting word counts. Something seems wrong with the stemming step, though, because some trailing "s"s and "r"s are swallowed by the program. Also, which part is the appropriate place to insert the word counts?
import nltk
from nltk.tokenize import word_tokenize #split text into words
from nltk.corpus import stopwords #stopwords
from nltk.stem import PorterStemmer #stem tools
from collections import defaultdict
#1)
def tokenizers(filename):
    # Read the whole file (readline() would return only the first line)
    with open(filename, "r", encoding="utf-8") as file:
        text = file.read()
    # Stop words plus punctuation symbols to filter out
    stopWords = set(stopwords.words("english"))
    stopWords = stopWords.union({",", "(", ")", "[", "]", "{", "}", "#", "@", "!", ":", ";", ".", "?"})
    # Tokenize the text into words
    words = word_tokenize(text)
    # Stem every word that is not a stop word
    # (compare lowercased, since the NLTK stop word list is lowercase)
    ps = PorterStemmer()
    filterWords = [ps.stem(w) for w in words if w.lower() not in stopWords]
    return filterWords
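To show what I mean about the swallowed letters, stemming a few words in isolation reproduces them (a minimal check, assuming the default PorterStemmer settings):

from nltk.stem import PorterStemmer

ps = PorterStemmer()
print(ps.stem("this"))      # "thi"    -- the trailing "s" is stripped
print(ps.stem("computer"))  # "comput" -- the "-er" suffix is stripped

Is this just the Porter algorithm stripping suffixes as designed, or am I using it wrong?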
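For the word counts, my guess is that the natural place is right after the stemming/filtering step, so that inflected forms collapse into a single stem before counting. A rough sketch of what I mean, using the defaultdict I already imported ("sample.txt" is just a placeholder filename):

counts = defaultdict(int)
for w in tokenizers("sample.txt"):  # placeholder filename
    counts[w] += 1                  # tally each stemmed, filtered word
print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10])  # ten most frequent stems

(collections.Counter(tokenizers("sample.txt")) would do the same in one line.) Is that the right spot, or should the counting happen before stemming?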