import math, functools
def splitPairs(word):
return [(word[:i+1], word[i+1:]) for i in range(len(word))]
def segment(word):
if not word: return []
allSegmentations = [[first] + segment(rest)
for (first, rest) in splitPairs(word)]
return max(allSegmentations, key = wordSegFitness)
class OneGramDist(dict):
def __init__(self):
self.gramCount = 0
for line in open('Norvig Word Library.txt'):
(word, count) = line[:-1].split('\t')
self[word] = int(count)
self.gramCount += self[word]
def __call__(self, word):
if word in self:
return float(self[word]) / self.gramCount
else:
return 1.0/ self.gramCount
singleWordProb = OneGramDist()
def wordSegFitness(words):
return functools.reduce(lambda x,y: x+y),
(math.log10(singleWordProblem(w)) for w in words)
I'm trying to improve the word segmentation of some text files I have. Some of the words in these files are joined (e.g. 'howmuchdoesthecarcost
or 'helloworld'
), and I'm running a Naive Bayesian process to segment the words apart.
However, when I run something like segment("helloworld")
, I get the following error: TypeError: reduce expected at least 2 arguments, got 1
. How can I change the arguments in reduce without losing the efficacy of wordSegFitness()
?