I am trying to pickle a dictionary of the form {word : {docId : int}}. My code is below:
def vocabProcess(documents):
word_splitter = re.compile(r"\w+", re.VERBOSE)
stemmer=PorterStemmer()#
stop_words = set(stopwords.words('english'))
wordDict = {}
for docId in documents:
processedDoc = [stemmer.stem(w.lower()) for w in
word_splitter.findall(reuters.raw(docId)) if not w in stop_words]
for w in processedDoc:
if w not in wordDict:
wordDict[w] = {docId : processedDoc.count(w)}
else:
wordDict[w][docId] = processedDoc.count(w)
with open("vocabListings.txt", "wb") as f:
_pickle.dump(wordDict, f)
if __name__ == "__main__":
documents = reuters.fileids()
with open("vocabListings.txt", "r") as f:
vocabulary = _pickle.load(f)
When I run this code, I get the error
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 2399:
character maps to <undefined>
Why is this breaking when none of the reuters docs/docids have unicode in them? How do I fix this so that I can still use the _pickle module?