This is not an answer but a large comment.
I just hit a snag with Unicode, so I thought I would test this. It turned out that I had encoded the input file incorrectly, but here is the test code anyway; everything uses the default settings and works extremely well in ANTLR 4.10.1. Maybe it is of some use:
grammar LetterNumbers;

// Start rule: zero or more words. The explicit EOF makes the parser account for
// the whole input instead of silently stopping at the first token it cannot match.
text: WORD* EOF;

WS: [ \t\r\n]+ -> skip ; // toss out whitespace

// A word is a run of code points in Unicode general category Nl ("Letter, Number"),
// i.e. those for which Character.getType(codePoint) returns Character.LETTER_NUMBER.
// The list: https://www.compart.com/en/unicode/category/Nl
// Roman numerals are the best known here.
WORD: LETTER_NUMBER+;

// Declared as a fragment: it is only a building block for WORD and must not be
// emitted as a token of its own (otherwise a single character matches both rules
// and only rule order decides which token type is produced).
fragment LETTER_NUMBER:
      [\u16ee-\u16f0]           // Runic golden numbers
    | [\u2160-\u2182]           // Roman numerals
    | [\u2185-\u2188]           // further Roman numeral forms
    | '\u3007'                  // ideographic number zero
    | [\u3021-\u3029]           // Hangzhou numerals
    | [\u3038-\u303a]           // Hangzhou numerals ten, twenty, thirty
    | [\ua6e6-\ua6ef]           // Bamum numerals
    // Supplementary-plane members of Nl; these need the \u{...} escape (ANTLR 4.7+).
    | [\u{10140}-\u{10174}]     // Greek acrophonic numerals
    | [\u{10341}\u{1034A}]      // Gothic letters ninety and nine hundred
    | [\u{103D1}-\u{103D5}]     // Old Persian numbers
    | [\u{12400}-\u{1246E}]     // Cuneiform numeric signs
    ;
And the JUnit5 test that goes with that:
package antlerization.minitest;
import antlrgen.minitest.LetterNumbersBaseListener;
import antlrgen.minitest.LetterNumbersLexer;
import antlrgen.minitest.LetterNumbersParser;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.tree.TerminalNode;
import org.junit.jupiter.api.Test;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.runtime.tree.ParseTreeWalker;
import java.util.LinkedList;
import java.util.List;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;
/**
 * Round-trip tests for the {@code LetterNumbers} grammar: each test pushes a string through the
 * generated lexer and parser and checks which WORD tokens come back out, joined with commas.
 */
public class MiniTest {

    /** Listener that records the text of every WORD token directly under the {@code text} rule. */
    static class WordCollector extends LetterNumbersBaseListener {

        /** Texts of the WORD tokens, in the order the context reports them. */
        public final List<String> collected = new LinkedList<>();

        @Override
        public void exitText(LetterNumbersParser.TextContext ctx) {
            ctx.getTokens(LetterNumbersLexer.WORD)
                    .forEach(wordNode -> collected.add(wordNode.getText()));
        }
    }

    /**
     * Lexes and parses the given string, starting at the grammar's {@code text} rule.
     *
     * @param inString the raw input to parse
     * @return the tree produced by the {@code text} rule
     */
    private static ParseTree stringToParseTree(String inString) {
        Lexer tokenSource = new LetterNumbersLexer(CharStreams.fromString(inString));
        LetterNumbersParser parser = new LetterNumbersParser(new CommonTokenStream(tokenSource));
        // "text" is the root rule of the grammar; the value returned here is a
        // LetterNumbersParser.TextContext, handed back as a plain ParseTree.
        return parser.text();
    }

    /**
     * Walks the tree with a fresh {@link WordCollector} and returns what it gathered.
     *
     * @param parseTree the tree to walk
     * @return the collected WORD texts
     */
    private static List<String> collectWords(ParseTree parseTree) {
        WordCollector listener = new WordCollector();
        ParseTreeWalker walker = new ParseTreeWalker();
        walker.walk(listener, parseTree);
        return listener.collected;
    }

    /** Joins the words with a comma so a whole result can be compared as one string. */
    private static String joinForTest(List<String> list) {
        return String.join(",", list);
    }

    /** Full pipeline: parse the input, collect its words, and join them for comparison. */
    private static String stringInToStringOut(String parseThis) {
        ParseTree tree = stringToParseTree(parseThis);
        List<String> words = collectWords(tree);
        return joinForTest(words);
    }

    /** Literal Roman-numeral characters without whitespace form a single word. */
    @Test
    void unicodeCharsOneWord() {
        String joined = stringInToStringOut("ⅣⅢⅤⅢ");
        assertThat(joined, equalTo("ⅣⅢⅤⅢ"));
    }

    /** The same word written with Unicode escapes yields the same result. */
    @Test
    void escapesOneWord() {
        String joined = stringInToStringOut("\u2163\u2162\u2164\u2162");
        assertThat(joined, equalTo("ⅣⅢⅤⅢ"));
    }

    /** Space-separated runs come back as separate words. */
    @Test
    void unicodeCharsMultipleWords() {
        String joined = stringInToStringOut("ⅠⅡⅢ ⅣⅤⅥ ⅦⅧⅨ ⅩⅪⅫ ⅬⅭⅮⅯ");
        assertThat(joined, equalTo("ⅠⅡⅢ,ⅣⅤⅥ,ⅦⅧⅨ,ⅩⅪⅫ,ⅬⅭⅮⅯ"));
    }

    /** Single characters separated by spaces and newlines each become their own word. */
    @Test
    void unicodeCharsLetters() {
        String joined = stringInToStringOut("Ⅰ Ⅱ Ⅲ \n Ⅳ Ⅴ Ⅵ \n Ⅶ Ⅷ Ⅸ \n Ⅹ Ⅺ Ⅻ \n Ⅼ Ⅽ Ⅾ Ⅿ");
        assertThat(joined, equalTo("Ⅰ,Ⅱ,Ⅲ,Ⅳ,Ⅴ,Ⅵ,Ⅶ,Ⅷ,Ⅸ,Ⅹ,Ⅺ,Ⅻ,Ⅼ,Ⅽ,Ⅾ,Ⅿ"));
    }
}