Ascii codec can't decode byte 0xc2 python nltk

Question

I have a code that I'm using for Spam Classification and it works great but everytime I try to stem/lemmatize the word I get this error:

File "/Users/Ramit/Desktop/Bayes1/src/filter.py", line 16, in trim_word word = ps.stem(word)

File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 664, in stem stem = self._step1a(stem)

File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 289, in _step1a

if word.endswith('ies') and len(word) == 4:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)

Here is my code:

    from word import Word
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    class Filter():

def __init__(self):
    self.words = dict()


def trim_word(self, word):
    # Helper method to trim away some of the non-alphabetic characters
    # I deliberately do not remove all non-alphabetic characters.
    word = word.strip(' .:,-!()"?+<>*')
    word = word.lower()
            word = ps.stem(word)
    return word


def train(self, train_file):
    lineNumber = 1
    ham_words = 0
    spam_words = 0
            stop = set(stopwords.words('english'))

    # Loop through all the lines
    for line in train_file:
        if lineNumber % 2 != 0:
            line = line.split('\t')
            category = line[0]
            input_words = line[1].strip().split(' ')

            #Loop through all the words in the line, remove some characters
            for input_word in input_words:
                input_word = self.trim_word(input_word)
                if (input_word != "") and (input_word not in stop):

                    # Check if word is in dicionary, else add
                    if input_word in self.words:
                        word = self.words[input_word]
                    else:
                        word = Word(input_word)
                        self.words[input_word] = word

                    # Check wether the word is in ham or spam sentence, increment counters
                    if category == "ham":
                        word.increment_ham()
                        ham_words += 1
                    elif category == "spam":
                        word.increment_spam()
                        spam_words += 1

                    # Probably bad training file input...
                    else:
                        print "Not valid training file format"

        lineNumber+=1

    # Compute the probability for each word in the training set
    for word in self.words:
        self.words[word].compute_probability(ham_words, spam_words)


def get_interesting_words(self, sms):
    interesting_words = []
            stop = set(stopwords.words('english'))
    # Go through all words in the SMS and append to list. 
    # If we have not seen the word in training, assign probability of 0.4
    for input_word in sms.split(' '):
        input_word = self.trim_word(input_word)
        if (input_word != "") and (input_word not in stop):
            if input_word in self.words:
                word = self.words[input_word]
            else:
                word = Word(input_word)
                word.set_probability(0.40)
            interesting_words.append(word)

    # Sort the list of interesting words, return top 15 elements if list is longer than 15
    interesting_words.sort(key=lambda word: word.interesting(), reverse=True)
    return interesting_words[0:15]


def filter(self, input_file, result_file):
    # Loop through all SMSes and compute total spam probability of the sms-message
    lineNumber = 0
    for sms in input_file:
        lineNumber+=1
        spam_product = 1.0
        ham_product = 1.0
        if lineNumber % 2 != 0:
            try:
                for word in self.get_interesting_words(sms):
                    spam_product *= word.get_probability()
                    ham_product *= (1.0 - word.get_probability())

                sms_spam_probability = spam_product / (spam_product + ham_product)
            except:
                result_file.write("error")

            if sms_spam_probability > 0.8:
                result_file.write("SPAM: "+sms)
            else:
                result_file.write("HAM: "+sms)
        result_file.write("\n")

I'm just looking for a solution that would allow me to lemmatize/stem the words. I tried looking around the net I did find similar problems, but they haven't been working for me.

Suggestions: (1) Convert your tabs to spaces before posting. (2) Create a [minimal example](http://stackoverflow.com/help/mcve). — Tom Zych, Mar 18 '17 at 12:40
Maybe this would help https://gist.github.com/alvations/07758d02412d928414bb from https://github.com/alvations/pywsd/blob/master/pywsd/utils.py#L66 — alvas, Mar 18 '17 at 13:57
The problem might be that you're not reading the file correctly? try `import io; file_in = io.open('filename.txt', 'r', encoding='utf8')`. It's a little unclear what is wrong but if you could post the data you're trying to process, it'll be much easier to understand what went wrong. — alvas, Mar 18 '17 at 13:59

MFigueredo · Answer 1 · 2017-03-18T21:22:39.750

0

Use sys.

import sys
sys.setdefaultencoding('utf-8')
reload(sys)

edited Mar 18 '17 at 21:22

answered Mar 18 '17 at 14:50

MFigueredo

133
8

Ascii codec can't decode byte 0xc2 python nltk

1 Answers1