Word segmentation, borrowed from Peter Norvig's pytudes.
Please try it out on the example strings below.
import re
import math
import random
import matplotlib.pyplot as plt
from collections import Counter
from itertools import permutations
from typing import List, Tuple, Set, Dict, Callable
# IPython/Jupyter shell escape: download words.txt from the dwyl/english-words repository.
# NOTE(review): the visible code reads 'big.txt', not words.txt -- confirm which file is intended.
!wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
# Type alias: a word is represented as a plain string.
Word = str
# cat(parts) glues an iterable of strings together with no separator.
cat = "".join
def tokens(text) -> List[Word]:
    """Lowercase `text` and return its maximal runs of the letters a-z, in order.

    Anything that is not a-z after lowercasing (digits, punctuation,
    whitespace, ...) only acts as a separator.
    """
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)
# Read the whole corpus file into memory. The `with` block closes the file
# handle as soon as the read finishes instead of leaving it to the garbage
# collector.
with open('big.txt') as corpus_file:
    TEXT = corpus_file.read()
# Every lowercase word token of the corpus, in order of appearance.
WORDS = tokens(TEXT)
class ProbabilityFunction:
    """Mixin that makes a Counter-like mapping callable as a probability function.

    The host class must provide `values()` and item lookup, as `Counter` does.
    """

    def __call__(self, outcome):
        """The probability of `outcome`: its count divided by the sum of all counts.

        The sum is computed on the first call and cached on the instance, so
        counts changed afterwards are not reflected in the denominator.
        Raises ZeroDivisionError when the counts sum to zero.
        """
        # The cache lives under a private name on purpose: on Python 3.10+
        # `Counter` defines a `total()` method, so `hasattr(self, 'total')`
        # is always true there and the sum would never be stored.
        total = getattr(self, '_total', None)
        if total is None:
            total = self._total = sum(self.values())
        return self[outcome] / total
class Bag(Counter, ProbabilityFunction):
    """A bag (multiset) of words: a Counter that is also a ProbabilityFunction."""
# Word-count bag over the corpus tokens; calling Pword(w) is meant to give the
# relative frequency of w among WORDS (see ProbabilityFunction.__call__).
Pword = Bag(WORDS)
def Pwords(words: List[Word]) -> float:
    """Probability of the word sequence `words`, treating the words as independent.

    This is the product of `Pword(w)` over every word in the sequence.
    """
    word_probabilities = (Pword(word) for word in words)
    return Π(word_probabilities)
def Π(nums) -> float:
    """Return the product of all numbers in `nums` (the multiplicative analogue of `sum`).

    An empty iterable yields 1, the multiplicative identity.
    """
    product = 1
    for factor in nums:
        product = product * factor
    return product
def splits(text, start=0, end=20) -> List[Tuple[str, str]]:
    """Return a list of all (first, rest) pairs that cut `text` in two.

    Pairs are ordered by increasing length of `first`. For non-negative
    `start`, start <= len(first) <= min(len(text), end). `first + rest`
    always equals `text`. The list is empty when `start` exceeds that
    upper bound.
    """
    longest_first = min(len(text), end)
    return [(text[:i], text[i:])
            for i in range(start, longest_first + 1)]
def segment(text) -> List[Word]:
    """Return a list of words that is the most probable segmentation of text.

    Candidates are scored with `Pwords`, and each word is one of the `first`
    parts produced by `splits(..., 1)`. Ties are resolved in favour of the
    shortest first word, because `max` keeps the first maximal candidate.
    Empty text yields an empty list.

    The best segmentation of each suffix is computed exactly once, shortest
    suffix first. A plain recursive formulation re-segments the same suffixes
    exponentially often and recurses once per character of `text`.
    """
    if not text:
        return []
    # best[k] is the most probable segmentation of the last k characters of text.
    best = [[]]
    for k in range(1, len(text) + 1):
        suffix = text[-k:]
        # `rest` is always a shorter suffix, so its answer is already in the table.
        candidates = ([first] + best[len(rest)]
                      for (first, rest) in splits(suffix, 1))
        best.append(max(candidates, key=Pwords))
    return best[-1]
# Run-together sample inputs to try the segmenter on.
strings = ['thatCreation', 'happeningso', 'comebecause']
# Each sample is lowercased before segmenting; as the last expression of a
# notebook cell, the resulting list of segmentations is displayed.
[segment(sample.lower()) for sample in strings]
--2020-08-04 18:48:06-- https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4863005 (4.6M) [text/plain]
Saving to: ‘words.txt.2’
words.txt.2 100%[===================>] 4.64M 162KB/s in 25s
2020-08-04 18:48:31 (192 KB/s) - ‘words.txt.2’ saved [4863005/4863005]
[['that', 'creation'], ['happening', 'so'], ['come', 'because']]