0

I have a set of rules that can be used to convert text to a set of phonemes. The application of these rules would result in conversions such as the following:

a            uh
ability      ae-b-ih-l-ih-t-ee
aboard       uh-b-oh-r-d
abort        uh-b-oh-r-t
affirmative  ah-f-eh-r-m-ah-t-ih-v
all          aw-l
alter        ah-l-t-r
an           ae-n
and          ae-n-d
Andy         ae-n-d-ee
any          eh-n-ee
anybody      ae-n-ee-b-ah-d-ee
at           ae-t
attacked     uh-t-ae-k-t

I want to create a function that can be applied to text and return the phonemes corresponding to this text using the conversion rules.

A rule consists of a few parts. The first part is the text token under consideration. The second part is the text token found before the token under consideration. The third part is the text token found after the token under consideration. The fourth part is the appropriate phoneme that should result from the conversion. Rules can be written in the following way, with the different parts separated by slashes:

text found/text before text found/text after text found/phoneme

Given rules of this form, what would be a good way to apply them to strings of text? I want to try to build a function that can parse text to find a rule match.


Rules are as follows:

#  one or more vowels (AEIOUY)
+  one of E, I, Y (a front vowel)
:  zero or more consonants (BCDFGHJKLMNPQRSTVWXZ)
^  one consonant
.  one of B, V, D, G, J, L, M, N, R, W, Z (a voiced consonant)
%  one of ER, E, ES, ED, ING, ELY (a suffix)
&  one of S, C, G, Z, X, J, CH, SH (a sibilant)
@  one of T, S, R, D, L, Z, N, J, TH, CH, SH (a consonant influencing following u)

" /// "
"A// /UH"
"ARE/ / /AH-R"
"AR/ /O/UH-R"
"AR//#/EH-R"
"AS/ ^/#/AE-A-S"
"A//WA/UH"
"AW///AW"
"ANY/ ://EH-N-EE"
"A//^+#/AE-A"
"ALLY/#://UH-L-EE"
"AL/ /#/UH-L"
"AGAIN///UH-G-EH-N"
"AG/#:/E/IH-J"
"A//^+:#/AE"
"A/ :/^+/AE-A"
"ARR/ //UH-R"
"ARR///AE-R"
"AR/ ://AH-R"
"AR// /AE-R"
"AR///AH-R"
"AIR///EH-R"
"AI///AE-A"
"AY///AE-A"
"AU///AW"
"AL/#:/ /UH-L"
"ALS/#:/ /UH-L-Z"
"ALK///AW-K"
"AL//^/AW-L"
"ABLE/ ://AE-A-B-UH-L"
"ABLE///UH-B-UH-L"
"ANG//+/AE-A-N-J"
"ATHE/ C/ /AE-TH-EE"
"A//A/AH"
"A///AE"
"BE/ /^#/B-IH"
"BEING///B-EE-IH-N"
"BOTH/ / /B-OH-TH"
"BUS/ /#/B-IH-Z"
"BUIL///B-IH-L"
"B/ / /B-EE"
"B///B"
"CH/ /^/K"
"CH/^E//K"
"CH///CH"
"CI/ S/#/S-AH-EE"
"CI//A/SH"
"CI//O/SH"
"CI//EN/SH"
"C//+/S"
"CK///K"
"COM//%/K-AH-M"
"C/ / /S-EE"
"C///K"
"DED/#:/ /D-IH-D"
"D/.E/ /D"
"D/#^:E/ /T"
"DE/ /^#/D-IH"
"DO/ / /D-OO"
"DOES/ //D-UH-Z"
"DOING/ //D-OO-IH-N"
"DOW/ //D-OH"
"DU//A/J-OO"
"D/ / /D-EE"
"DOUGH///D-OH"
"D///D"
"E/#:/ /"
"E/'^:/ /"
"E/ :/ /EE"
"ED/#/ /D"
"E/#:/D /"
"ER//EV/EH-V"
"EVEN/ EL//EH-V-EH-N"
"EVEN/ S//EH-V-EH-N"
"E//^%/EE"
"E//PH%/EE"
"ERI//#/EE-R-EE"
"ER/#:/#/AE-R"
"ER//#/EH-R"
"ER///AE-R"
"EVEN/ //EE-V-EH-N"
"E/#:/W/"
"EW/@//OO"
"EW///Y-OO"
"E//O/EE"
"ES/#:&/ /IH-Z"
"E/#:/S /"
"ELY/#://L-EE"
"EMENT/#://M-EH-N-T"
"EFUL///F-U-L"
"EE///EE"
"EARN///AE-R-N"
"EAR/ /^/AE-R"
"EAD///EH-D"
"EA/#:/ /EE-UH"
"EA//SU/EH"
"EA///EE"
"EIGH///AE-A"
"EI///EE"
"EYE/ //AH-EE"
"EY///EE"
"EU///Y-OO"
"E/ / /EE"
"E/^/ /"
"E///EH"
"FUL///F-U-L"
"F/F//"
"F/ / /EH-F"
"F///F"
"GIV///G-IH-V"
"G/ /I^/G"
"GE//T/G-EH"
"GGES/SU//G-J-EH-SS"
"G/G//"
"G/ B#//G"
"G//+/J"
"GREAT///G-R-AE-A-T"
"GH/#//"
"G/ / /G-EE"
"G///G"
"HAV/ //H-AE-V"
"HERE/ //H-EE-R"
"HOUR/ //OH-AE-R"
"HOW///H-OH"
"H//#/H"
"H/ / /H-AE-CH"
"H///"
"IN/ //IH-N"
"I/ / /AH-EE"
"IN//D/IH-N"
"IER///EE-AE-R"
"IED/#:R//EE-D"
"IED// /AH-EE-D"
"IEN///EE-EH-N"
"IE//T/AH-EE-EH"
"I/ :/%/AH-EE"
"I//%/EE"
"IE///EE"
"INE/N//AH-EE-N"
"IME/T//AH-EE-M"
"I//^+:#/IH"
"IR//#/AH-EE-R"
"IS//%/AH-EE-S"
"IX//%/IH-K-S"
"IZ//%/AH-EE-Z"
"I//D%/AH-EE"
"I/+^/^+/IH"
"I//T%/AH-EE"
"I/#^:/^+/IH"
"I//^+/AH-EE"
"IR///AE-R"
"IGH///AH-EE"
"ILD///AH-EE-L-D"
"IGN// /AH-EE-N"
"IGN//^/AH-EE-N"
"IGN//%/AH-EE-N"
"IQUE///EE-K"
"I///IH"
"J/ / /J-A-EE"
"J///J"
"K//N/"
"K/ / /K-A-EE"
"K///K"
"LO//C#/L-OH"
"L/L//"
"L/#^:/%/UH-L"
"LEAD///L-EE-D"
"L/ / /AE-L"
"L///L"
"MOV///M-OO-V"
"M/ / /EH-M"
"M///M"
"NG/E/+/N-J"
"NG//R/N"
"NG//#/N"
"NGL//%/N-UH-L"
"NG///N"
"NK///N-K"
"NOW/ / /N-OH"
"N/ / /EH-N"
"N/N//"
"N///N"
"OF// /UH-V"
"OROUGH///AE-R-OH"
"OR/ F/TY/OH-R"
"OR/#:/ /AE-R"
"ORS/#:/ /AE-R-Z"
"OR///AW-R"
"ONE/ //W-UH-N"
"OW//EL/OH"
"OW///OH"
"OVER/ //OH-V-AE-R"
"OV///UH-V"
"O//^%/OH"
"O//^EN/OH"
"O//^I#/OH"
"OL//D/OH-L"
"OUGHT///AH-T"
"OUGH///UH-F"
"OU/ /^L/UH"
"OU/ //OH"
"OU/H/S#/OH"
"OUS///UH-S"
"OUR/ F//OH-R"
"OUR///AW-R"
"OUD///U-D"
"OUP///OO-P"
"OU///OH"
"OY///AW-EE"
"OING///OH-IH-N"
"OI///AW-EE"
"OOR///OH-R"
"OOK///U-K"
"OOD///U-D"
"OO///OO"
"O//E/OH"
"O// /OH"
"OA// /OH"
"ONLY/ //OH-N-L-EE"
"ONCE/ //W-UH-N-S"
"ON'T// /OH-N-T"
"O/C/N/AH"
"O//NG/AH"
"O/^:/N/UH"
"ON/I//UH-N"
"ON/#:/ /UH-N"
"ON/#^//UH-N"
"O//ST /OH"
"OF//^/AW-F"
"OTHER///UH-TH-AE-R"
"OSS// /AW-S"
"OM/#^:/ /UH-M"
"O///AH"
"PH///F"
"PEOP///P-EE-P"
"POW///P-OH"
"PUT// /P-U-T"
"P/ / /P-EE"
"P/P//"
"P///P"
"QUAR///K-W-AW-R"
"QU/ //K-W"
"QU///K"
"Q/ / /K-OO"
"Q///K"
"RE/ /^#/R-EE"
"R/ / /AH"
"R/R//"
"R///R"
"SH///SH"
"SION/#//ZH-UH-N"
"SOME///S-AH-M"
"SUR/#/#/ZH-AE-R"
"SUR//#/SH-AE-R"
"SU/#/#/ZH-OO"
"SSU/#/#/SH-OO"
"SED/#/ /Z-D"
"S/#/#/Z"
"SAID///S-EH-D"
"SION/^//SH-UH-N"
"S/S//"
"S/./ /Z"
"S/#:.E/ /Z"
"S/#^:##/ /Z"
"S/#^:#/ /S"
"S/U/ /S"
"S/ :#/ /Z"
"SCH/ //S-K"
"S//C+/"
"SM/#//Z-M"
"SN/#/ /Z-UH-N"
"S/ / /EH-S"
"S///S"
"THE/ / /TH-UH"
"TO// /T-OO"
"THAT///TH-AE-T"
"THIS/ / /TH-IH-S"
"THEY/ //TH-AE-A"
"THERE/ //TH-EH-R"
"THER///TH-AE-R"
"THEIR///TH-EH-EH"
"THAN/ / /TH-AE-N"
"THEM/ / /TH-EH-M"
"THESE// /TH-EE-Z"
"THEN/ //TH-EH-N"
"THROUGH///TH-R-OO"
"THOSE///TH-OH-Z"
"THOUGH// /TH-OH"
"THUS/ //TH-UH-S"
"TH///TH"
"TED/#:/ /T-IH-D"
"TI/S/#N/CH"
"TI//O/SH"
"TI//A/T"
"TIEN///SH-UH-N"
"TUR//#/CH-AE-R"
"TU//A/CH-OO"
"TWO/ //T-OO"
"T/ / /T-EE"
"T/T//"
"T///T"
"UN/ /I/Y-OO-N"
"UN/ //UH-N"
"UPON/ //UH-P-AW-N"
"UR/@/#/AE-R"
"UR//#/Y-AE-R"
"UR///AE-R"
"U//^ /UH"
"U//^^/UH"
"UY///AH-EE"
"U/ G/#/"
"U/G/%/"
"U/G/#/W"
"U/#N//Y-OO"
"UI/@//OO"
"U/@//UH"
"U///Y-OO"
"VIEW///V-Y-OO"
"V/ / /V-EE"
"V///V"
"WHERE/ //W-AE-R"
"WA//S/W-AH"
"WA//T/W-AH"
"WHERE///WH-EH-R"
"WHAT///WH-AH-T"
"WHOL///H-OH-L"
"WHO///H-OO"
"WH///WH"
"WAR///W-AH-R"
"WOR///W-AE-R"
"WR///R"
"W/ / /D-AH-B-L-Y-OO"
"W///W"
"X//^/EH-K-S"
"X/ / /EH-K-S"
"X/ /#/Z-EH"
"X///K-S"
"YOUNG///Y-UH-N"
"YOU/ //Y-OO"
"YES/ //Y-EH-S"
"Y/ / /WH-UH-Y"
"Y/ //Y"
"Y/#^:/ /EE"
"Y/#^:/I/EE"
"Y/ :/ /AH-EE"
"Y/ :/#/AH-EE"
"Y/ :/^+:#/IH"
"Y/ :/^#/AH-EE"
"Y///IH"
"ZZ///T-Z"
"Z/ / /Z-EH-D"
"Z///Z"
d3pd
  • 7,935
  • 24
  • 76
  • 127
  • 3
    Those aren't phonemes.. (the first phoneme in both _abort_ and _affirmative_, is schwa, and I believe the t in affirmative is pronounced _d_ by most US dialects)... That being said, regular expressions with pre and postfix rules would be a quick way to get started.. – thebjorn Oct 25 '15 at 18:32
  • @thebjorn Thanks for your suggestion. They are a fairly minimal set of phonemes, but they're not as detailed as something like the IPA. Would you happen to have a link to an example of your suggestion of using a regex expression with pre and postfix rules to find the best matches for some text? – d3pd Oct 25 '15 at 18:42

1 Answer

3

Turns out that lookbehind requires the pattern to be of fixed size, which doesn't fit with your rules, so we have to be a little bit more complex.

First let's define a translation between your syntax and regex:

# Map each rule-context metacharacter to an equivalent regular-expression
# fragment.  Characters not present here (literal letters, spaces) are
# passed through unchanged by mkregex().
rule_syntax = {
    '#': r'[AEIOUY]+',                 # one or more vowels
    '+': r'[EIY]',                     # one front vowel
    ':': r'[BCDFGHJKLMNPQRSTVWXZ]*',   # zero or more consonants
    '^': r'[BCDFGHJKLMNPQRSTVWXZ]',    # exactly one consonant
    '.': r'[BVDGJLMNRWZ]',             # one voiced consonant
    # Alternatives are ordered longest-first so the group can consume the
    # whole token; the original '(?:ER|E|ES|ED|ING|EL)' dropped the Y of
    # 'ELY' (the rule legend lists ELY) and let the bare 'E' shadow ES/ED.
    '%': r'(?:ING|ELY|ER|ES|ED|E)',    # one suffix
    '&': r'(?:CH|SH|[SCGZXJ])',        # one sibilant (digraphs first)
    '@': r'(?:TH|CH|SH|[TSRDLZNJ])',   # consonant influencing a following U
}

and a function to create a regex fragment from this mapping:

def mkregex(rule):
    """Expand a rule-context string into a regex fragment.

    Each metacharacter is replaced by its pattern from ``rule_syntax``;
    any other character (a literal letter or a space) is kept as-is.
    """
    return ''.join(rule_syntax.get(ch, ch) for ch in rule)

I'm not sure how you want to handle rules with spaces, I've commented out the ' /// ' rule to get the results below.

Now we implement a function that converts your rule syntax into an "interesting" tuple:

def mkrule(ruletxt):
    """Compile one slash-delimited rule into a (regex, phonemes, source) tuple.

    ``ruletxt`` has the form ``text/before/after/phoneme``.  The returned
    tuple holds:
      - a regex whose ``found`` group captures the text, with the 'before'
        context as a consumed non-capturing group and the 'after' context
        as a zero-width lookahead;
      - the phonemes lower-cased and wrapped in dashes (so converted text
        cannot be matched again by later upper-case rules);
      - the original rule text, kept for explaining and debugging.
    """
    txt, before, after, phoneme = ruletxt.split('/')

    parts = []
    if before:
        # Consumed, non-capturing context preceding the text of interest.
        parts.append('(?:%s)' % mkregex(before))
    # The text itself, as a named capturing group.
    parts.append('(?P<found>%s)' % txt)
    if after:
        # Zero-width lookahead context following the text of interest.
        parts.append('(?=%s)' % mkregex(after))

    return ''.join(parts), "-%s-" % phoneme.lower(), ruletxt

The approach we will take is to iteratively replace matched rules with phonemes. To make sure we don't replace text that has already been converted (i.e. phonemes), we will make the input string upper cased, and the phonemes lower cased. To prevent the phonemes from running into each other we've added a - on each side (we'll have to clean this up at the end).

Convert all your rules to interesting tuples:

# Compile every rule, in priority order, into an (regex, phonemes, source)
# tuple via mkrule().  NOTE: this listing is abbreviated — the trailing
# '...' is a placeholder and must be replaced with the remaining rules
# from the question before this code will run.
rules = [mkrule(r) for r in [
    #" /// ",          # this rule creates problems
    "A// /UH",
    "ARE/ / /AH-R",
    "AR/ /O/UH-R",
    "AR//#/EH-R",
    "AS/ ^/#/AE-A-S",
    "A//WA/UH",
    "AW///AW",
    # ... continue with the full rule list given in the question ...
    ...
]]

We're almost there, just a function to replace the found text from a single rule:

def match_and_replace(word, rule, phonemes):
    """Replace every match of ``rule`` in ``word`` with ``phonemes``.

    All non-overlapping matches are located first, then spliced out
    back-to-front so earlier match offsets stay valid while we edit.
    """
    spans = [m.span() for m in re.finditer(rule, word)]
    # Strings are immutable, so edit a character list instead.
    chars = list(word)
    # Walk the matches from the end of the string toward the start.
    for start, end in reversed(spans):
        chars[start:end] = phonemes
    return ''.join(chars)

Finally, the function to extract 'phonemes' from a word:

def phonemes(word, explain=False):
    """Convert ``word`` to a dash-separated phoneme string using ``rules``.

    The word is upper-cased and padded with spaces (so rules with space
    contexts can match at word boundaries); each rule's matches are then
    replaced in turn by lower-cased, dash-wrapped phonemes so converted
    text is never re-matched.  Artifacts (spaces, doubled/edge dashes)
    are stripped at the end.

    When ``explain`` is true, each rule application is printed step by
    step — a rule engine should always be able to explain its results.

    Fixed: the original used Python 2 ``print`` statements, which are
    syntax errors under Python 3; they are now ``print()`` calls.
    """
    if explain:
        print("word  :", word)

    # Pad with spaces so rules containing spaces have something to match.
    result = " %s " % word.upper()
    step = 0

    # Apply every compiled (regex, phonemes, source-rule) tuple in order.
    for rule, phoneme, ruletxt in rules:
        # tmp is result with all matches of `rule` replaced by `phoneme`.
        tmp = match_and_replace(result, rule, phoneme)
        if explain and tmp != result:
            step += 1
            print('step %d: %r ---> %r  [rule: %r (%r)]' % (
                step, result, tmp, ruletxt, rule
            ))
        result = tmp

    # Remove artifacts: padding spaces, edge dashes, and dash runs.
    res, _count = re.subn(r'-+', '-', result.replace(' ', '').strip('-'))
    if explain:
        print("result:", res)
        print()
    return res

With this I get the following results:

>>> phonemes('abort', explain=True)
word  : abort
step 1: ' ABORT ' ---> ' -ae-BORT '  [rule: 'A///AE' ('(?P<found>A)')]
step 2: ' -ae-BORT ' ---> ' -ae--b-ORT '  [rule: 'B///B' ('(?P<found>B)')]
step 3: ' -ae--b-ORT ' ---> ' -ae--b--aw-r-T '  [rule: 'OR///AW-R' ('(?P<found>OR)')]
step 4: ' -ae--b--aw-r-T ' ---> ' -ae--b--aw-r--t- '  [rule: 'T///T' ('(?P<found>T)')]
result: ae-b-aw-r-t

You'll need to order the rules sensibly to get the results you want, or use more complex algorithms that can find all possible rule-permutations that match and then find the best one.

thebjorn
  • 26,297
  • 11
  • 96
  • 138