How do you modify the default spaCy (v3.0.5) tokenizer so that it correctly splits English contractions when unicode apostrophes (not the ASCII ') are used?
import spacy
nlp = spacy.load('en_core_web_sm')
apostrophes = ["'",'\u02B9', '\u02BB', '\u02BC', '\u02BD', '\u02C8', '\u02CA', '\u02CB', '\u0060', '\u00B4']
for apo in apostrophes:
    text = f"don{apo}t"
    print([t for t in nlp(text)])
>>>
[do, n't]
[donʹt]
[donʻt]
[donʼt]
[donʽt]
[donˈt]
[donˊt]
[donˋt]
[don`t]
[don´t]
The desired output for all of these examples is [do, n't].
My best guess was to extend the default tokenizer_exceptions with all possible apostrophe variants. But this does not work, because tokenizer special cases are not allowed to modify the text.
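As far as I understand, the constraint behind that rule is spaCy's non-destructive tokenization: the token texts (including trailing whitespace) must always concatenate back to the exact input string, so a special case that rewrites ʼt to 't would break this invariant. A quick check of the invariant, assuming nlp is the en_core_web_sm pipeline from above:

doc = nlp("don't")
print([t.text_with_ws for t in doc])  # ['do', "n't"]
# spaCy guarantees the original text can be reconstructed from the tokens
assert "".join(t.text_with_ws for t in doc) == "don't"

My attempt: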
import spacy
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
nlp = spacy.load('en_core_web_sm')
apostrophes = ['\u02B9', '\u02BB', '\u02BC', '\u02BD', '\u02C8', '\u02CA', '\u02CB', '\u0060', '\u00B4']
default_rules = nlp.Defaults.tokenizer_exceptions
extended_rules = default_rules.copy()
for key, val in default_rules.items():
if "'" in key:
for apo in apostrophes:
extended_rules[key.replace("'", apo)] = val
rules = nlp.Defaults.tokenizer_exceptions
infix_re = compile_infix_regex(nlp.Defaults.infixes)
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
nlp.tokenizer = spacy.tokenizer.Tokenizer(
    nlp.vocab,
    rules=extended_rules,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)
apostrophes = ["'",'\u02B9', '\u02BB', '\u02BC', '\u02BD', '\u02C8', '\u02CA', '\u02CB', '\u0060', '\u00B4']
for apo in apostrophes:
    text = f"don{apo}t"
    print([t for t in nlp(text)])
>>> ValueError: [E997] Tokenizer special cases are not allowed to modify the text. This would map ':`(' to ':'(' given token attributes '[{65: ":'("}]'.
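The error makes sense in hindsight: the copied rules keep their original ORTH values (attribute 65, still containing the ASCII '), so the special case for a replaced key like :`( would have to rewrite the text back to :'(, which spaCy forbids. The only workaround I have found so far is to normalize the apostrophe variants in the raw text before it ever reaches the tokenizer. This is just a sketch (the regex and the normalize_apostrophes helper are my own names, not spaCy API), and it has an obvious drawback: doc.text no longer matches the original input, so character offsets into the raw string are lost.

import re
import spacy

nlp = spacy.load('en_core_web_sm')

# Apostrophe-like characters to fold into the ASCII apostrophe.
APOSTROPHE_RE = re.compile('[\u02B9\u02BB\u02BC\u02BD\u02C8\u02CA\u02CB\u0060\u00B4]')

def normalize_apostrophes(text):
    return APOSTROPHE_RE.sub("'", text)

apostrophes = ["'", '\u02B9', '\u02BB', '\u02BC', '\u02BD', '\u02C8', '\u02CA', '\u02CB', '\u0060', '\u00B4']
for apo in apostrophes:
    text = normalize_apostrophes(f"don{apo}t")
    print([t for t in nlp(text)])  # [do, n't] for every variant

Is there a way to achieve the same result inside the tokenizer itself, without touching the input text?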