I am trying to write an mrjob job that reads a text file, processes only the words that are not in the STOPWORDS set, counts the number of syllables in each remaining word, and then returns the 10 words with the most syllables, with the results sorted.
I believe everything is correct; I am just not sure how to make the reducer sort the results.
Here is my code:
%%file top_10_syllable_count.py
import re
from sys import stderr
from mrjob.job import MRJob
from mrjob.step import MRStep
# Matches one word-like token: a run of word characters and/or apostrophes.
WORD_RE = re.compile(r"[\w']+")
import syllables
def splitter(text):
    """Return every word-like token found in ``text``, in order of appearance.

    A token is a run of word characters and/or apostrophes, as matched by
    the module-level ``WORD_RE`` pattern.
    """
    # Reuse the pattern compiled once at import time instead of rebuilding a
    # local copy (which also shadowed the module constant) on every call.
    return WORD_RE.findall(text)
def sort_results(results):
    """
    Order 2-item entries by their first item, largest first; entries whose
    first items are equal are ordered by their second item, smallest first.

    Returns a new list.
    """
    def ranking(entry):
        # Negating the first item turns the ascending sort into a
        # descending one for that component only.
        return (-entry[0], entry[1])

    ordered = list(results)
    ordered.sort(key=ranking)
    return ordered
# Lowercase words that are excluded from processing. Entries are kept in
# alphabetical order so that accidental duplicates are easy to spot.
STOPWORDS = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both',
    'but', 'by',
    'can',
    'did', 'do', 'does', 'doing', 'don', 'down', 'during',
    'each',
    'few', 'for', 'from', 'further',
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself',
    'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
    'just',
    'me', 'more', 'most', 'my', 'myself',
    'no', 'nor', 'not', 'now',
    'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours',
    'ourselves', 'out', 'over', 'own',
    's', 'same', 'she', 'should', 'so', 'some', 'such',
    't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves',
    'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up',
    'very',
    'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who',
    'whom', 'why', 'will', 'with',
    'you', 'your', 'yours', 'yourself', 'yourselves',
}
class MRMostTenSyllables(MRJob):
    """MapReduce job that reports the distinct words with the most syllables.

    Words found in ``STOPWORDS`` are skipped. Up to ``TOP_N`` results are
    emitted as ``(syllable_count, word)`` pairs, ordered by syllable count
    descending and then by word ascending (the ``sort_results`` ordering).
    """

    # Maximum number of words to report.
    TOP_N = 10

    def steps(self):
        """Run the mapper and the reducer together as a single step."""
        return [
            MRStep(
                mapper=self.mapper_get_words,
                reducer=self.reducer_find_max_word,
            ),
        ]

    def mapper_get_words(self, _, line):
        """Yield ``None, (syllable_count, word)`` for each non-stopword.

        Every pair is sent to the same key (``None``) so that one reducer
        call sees all candidates and can rank them globally.
        """
        for token in WORD_RE.findall(line):
            word = token.lower()
            if word in STOPWORDS:
                continue
            # Count syllables on the lowercased form so that each distinct
            # word is always paired with a single count.
            yield None, (syllables.estimate(word), word)

    def reducer_find_max_word(self, key, values):
        """Yield the top ``TOP_N`` distinct ``(syllable_count, word)`` pairs.

        Works for any number of input pairs, including fewer than ``TOP_N``;
        in that case every distinct pair is emitted.
        """
        # A word is emitted once per occurrence in the text, so collapse
        # repeats first. Pairs are rebuilt as tuples because they may arrive
        # as (unhashable) lists after serialization between steps.
        distinct = {(count, word) for count, word in values}
        for count, word in sort_results(distinct)[:self.TOP_N]:
            yield count, word
if __name__ == '__main__':
    import time

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()
    MRMostTenSyllables.run()
    elapsed = time.perf_counter() - start
    # The original called an undefined ``debug`` helper, which raised
    # NameError. Report on stderr so the timing line stays off stdout and
    # cannot mix with the job's results.
    print("Run time:", elapsed, "seconds", file=stderr)