I am trying to attempt the mapreduce pairs pattern in python. Need to check if a word is in a text file and then find the word next to it and yield a pair of both words. keep running into either:
neighbors = words[words.index(w) + 1]
ValueError: substring not found
or
ValueError: ("the") is not in list
file cwork_trials.py
from mrjob.job import MRJob
class MRCountest(MRJob):
# Word count
def mapper(self, _, document):
# Assume document is a list of words.
#words = []
words = document.strip()
w = "the"
neighbors = words.index(w)
for word in words:
#searchword = "the"
#wor.append(str(word))
#neighbors = words[words.index(w) + 1]
yield(w,1)
def reducer(self, w, values):
yield(w,sum(values))
if __name__ == '__main__':
MRCountest.run()
Edit: Trying to use the pairs pattern to search a document for every instance of a specific word and then find the word next to it each time. Then yielding a pair result for each instance i.e. find instances of "the" and the word next to it i.e. [the], [book], [the], [cat] etc.
from mrjob.job import MRJob
class MRCountest(MRJob):
# Word count
def mapper(self, _, document):
# Assume document is a list of words.
#words = []
words = document.split(" ")
want = "the"
for w, want in enumerate(words, 1):
if (w+1) < len(words):
neighbors = words[w + 1]
pair = (want, neighbors)
for u in neighbors:
if want is "the":
#pair = (want, neighbors)
yield(pair),1
#neighbors = words.index(w)
#for word in words:
#searchword = "the"
#wor.append(str(word))
#neighbors = words[words.index(w) + 1]
#yield(w,1)
#def reducer(self, w, values):
#yield(w,sum(values))
if __name__ == '__main__':
MRCountest.run()
As it stands I get yields of every word pair with multiples of the same pairing.