1

I have a fairly simple grammar designed to parse URIs. It is compiled with the help of antlr4-maven-plugin. Compiling produces no warnings or errors. I wrote a simple test.

Uri.g4:

/**
 * Uniform Resource Identifier (RFC 3986).
 *
 * @author Oliver Yasuna
 * @see <a href="https://www.rfc-editor.org/rfc/rfc3986.html">RFC 3986</a>
 * @since 1.0.0
 */

grammar Uri;

options {
  tokenVocab = Common;
}

@header {
  package com.oliveryasuna.http.antlr;
}

// Parser
//--------------------------------------------------

pctEncoded
  : '%' HEXDIG HEXDIG
  ;

reserved
  : genDelims | subDelims
  ;

genDelims
  : ':' | '/' | '?' | '#' | '[' | ']' | '@'
  ;

subDelims
  : '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
  ;

unreserved
  : ALPHA | DIGIT | '-' | '.' | '_' | '~'
  ;

uri
  : scheme ':' hierPart ('?' query)? ('#' fragment_)?
  ;

hierPart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathRootless
  | pathEmpty
  ;

scheme
  : ALPHA (ALPHA | DIGIT | '+' | '-' | '.')*
  ;

authority
  : (userinfo '@')? host (':' port)?
  ;

userinfo
  : (unreserved | pctEncoded | subDelims | ':')*
  ;

host
  : ipLiteral
  | ipv4Address
  | regName
  ;

ipLiteral
  : '[' (ipv6Address | ipvFuture) ']'
  ;

ipvFuture
  : 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+
  ;

// IPv6address (RFC 3986, section 3.2.2).
// Each alternative mirrors one line of the ABNF: the optional group on the left is what may
// precede the '::' elision, the part on the right is what must follow it.
// The first alternative is the fully written-out form (six "h16 ':'" groups plus ls32) and
// therefore contains no '::' at all.
ipv6Address
  :                                                                                 (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                            '::'           (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                    h16?  '::'                     (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                        ((h16 ':')? h16)? '::'                               (h16 ':') (h16 ':') (h16 ':') ls32
  |                                             ((h16 ':')? (h16 ':')? h16)? '::'                                         (h16 ':') (h16 ':') ls32
  |                                  ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                    h16 ':'  ls32
  |                       ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             ls32
  |            ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             h16
  | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'
  ;

ls32
  : (h16 ':' h16)
  | ipv4Address
  ;

h16
  : HEXDIG HEXDIG? HEXDIG? HEXDIG?
  ;

ipv4Address
  : decOctet '.' decOctet '.' decOctet '.' decOctet
  ;

decOctet
  : DIGIT
  | ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT
  | '1' DIGIT DIGIT
  | '2' ('0' | '1' | '2' | '3' | '4') DIGIT
  | '2' '5' ('0' | '1' | '2' | '3' | '4' | '5')
  ;

regName
  : (unreserved | pctEncoded | subDelims)*
  ;

port
  : DIGIT*
  ;

path
  : pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathRootless
  | pathEmpty
  ;

pathAbEmpty
  : ('/' segment)*
  ;

// path-absolute = "/" [ segment-nz *( "/" segment ) ]   (RFC 3986, section 3.3)
// Begins with '/' but not '//'; any number of further "/segment" parts may follow the first
// non-empty segment, hence '*' rather than '?'.
pathAbsolute
  : '/' (segmentNz ('/' segment)*)?
  ;

// path-noscheme = segment-nz-nc *( "/" segment )   (RFC 3986, section 3.3)
// First segment must be non-empty and contain no ':'; zero or more "/segment" parts may follow.
pathNoScheme
  : segmentNzNc ('/' segment)*
  ;

// path-rootless = segment-nz *( "/" segment )   (RFC 3986, section 3.3)
// Starts with a non-empty segment; zero or more "/segment" parts may follow.
pathRootless
  : segmentNz ('/' segment)*
  ;

pathEmpty
  : // TODO: 0<pchar>.
  ;

segment
  : pchar*
  ;

segmentNz
  : pchar+
  ;

segmentNzNc
  : (unreserved | pctEncoded | subDelims | '@')+
  ;

pchar
  : unreserved | pctEncoded | subDelims | ':' | '@'
  ;

query
  : (pchar | '/' | '?')*
  ;

fragment_
  : (pchar | '/' | '?')*
  ;

uriReference
  : uri
  | relativeRef
  ;

relativeRef
  : relativePart ('?' query)? ('#' fragment_)?
  ;

// relative-part = "//" authority path-abempty
//               / path-absolute
//               / path-noscheme
//               / path-empty                     (RFC 3986, section 4.2)
// Without a leading '//' authority, the path alternative is path-absolute (not path-abempty).
relativePart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathEmpty
  ;

absoluteUri
  : scheme ':' hierPart ('?' query)?
  ;

Common.g4:

lexer grammar Common;

// ASCII
//--------------------------------------------------

BANG                  : '!'  ;
//DOUBLE_QUOTE          : '"'  ;
HASH                  : '#'  ;
DOLLAR                : '$'  ;
PERCENT               : '%'  ;
AND                   : '&'  ;
SINGLE_QUOTE          : '\'' ;
LEFT_PARENTHESES      : '('  ;
RIGHT_PARENTHESES     : ')'  ;
STAR                  : '*'  ;
PLUS                  : '+'  ;
COMMA                 : ','  ;
MINUS                 : '-'  ;
DOT                   : '.'  ;
SLASH                 : '/'  ;
COLON                 : ':'  ;
SEMICOLON             : ';'  ;
LEFT_ANGLE_BRACKET    : '<'  ;
EQUAL                 : '='  ;
RIGHT_ANGLE_BRACKET   : '>'  ;
QUESTION              : '?'  ;
AT                    : '@'  ;
LEFT_SQUARE_BRACKET   : '['  ;
BACKSLASH             : '\\' ;
RIGHT_SQUARE_BRACKET  : ']'  ;
CARROT                : '^'  ;
UNDERSCORE            : '_'  ;
BACKTICK              : '`'  ;
LEFT_CURLY_BRACKET    : '{'  ;
BAR                   : '|'  ;
RIGHT_CURLY_BRACKET   : '}'  ;
TILDE                 : '~'  ;

// Core
//--------------------------------------------------

// Taken from ABNF.
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;
DQUOTE  : '"'                   ;
SP      : ' '                   ;
HTAB    : '\t'                  ;
WSP     : SP | HTAB             ;
//LWSP    : (WSP | CRLF WSP)*     ;
VCHAR   : [\u0021-\u007F]       ;
CHAR    : [\u0001-\u007F]       ;
OCTET   : [\u0000-\u00FF]       ;
CTL     : [\u0000-\u001F\u007F] ;
CR      : '\r'                  ;
LF      : '\n'                  ;
CRLF    : CR LF                 ;
BIT     : '0' | '1'             ;

// Miscellaneous
//--------------------------------------------------

DOUBLE_SLASH  : '//' ;
DOUBLE_COLON  : '::' ;

LOWER_V       : 'v'  ;

ZERO          : '0'  ;
ONE           : '1'  ;
TWO           : '2'  ;
THREE         : '3'  ;
FOUR          : '4'  ;
FIVE          : '5'  ;
SIX           : '6'  ;
SEVEN         : '7'  ;
EIGHT         : '8'  ;
NINE          : '9'  ;

Test method:

// Feeds a sample URI through the generated lexer and parser and expects the `uri` rule to
// complete without the error listener below throwing.
@Test
final void google() {
  final String uri = "https://www.google.com/";

  // NOTE(review): ANTLRInputStream is deprecated in newer ANTLR 4 runtimes in favor of
  // CharStreams.fromString(...) — confirm against the runtime version in use.
  final UriLexer lexer = new UriLexer(new ANTLRInputStream(uri));
  final UriParser parser = new UriParser(new CommonTokenStream(lexer));

  // Converts every parser syntax error into an IllegalStateException so that the assertion
  // at the end fails on it. This listener is registered on the parser only; no listener is
  // registered on the lexer in this method.
  parser.addErrorListener(new BaseErrorListener() {
    @Override
    public void syntaxError(final Recognizer<?, ?> recognizer, final Object offendingSymbol, final int line, final int charPositionInLine, final String msg, final RecognitionException e) {
      throw new IllegalStateException("[" + line + ":" + charPositionInLine + "] Symbol [" + offendingSymbol + "] produced error: " + msg + ".", e);
    }
  });

  // Start parsing at the `uri` rule.
  Assertions.assertDoesNotThrow(parser::uri);
}

I get the following errors when I input https://www.google.com/.

I have absolutely no idea what is causing these parsing errors. Does anyone have an idea?

Output:

line 1:0 token recognition error at: 'h'
line 1:1 token recognition error at: 't'
line 1:2 token recognition error at: 't'
line 1:3 token recognition error at: 'p'
line 1:4 token recognition error at: 's'
line 1:5 missing '6' at ':'
Oliver
  • 1,465
  • 4
  • 17

2 Answers

2

ANTLR has a strict separation between parsing and tokenizing/lexing. The lexer works independently from the parser and creates tokens based on 2 simple rules:

  1. try to consume as many characters as possible for a single lexer rule
  2. when 2 or more lexer rules match the same characters, let the one defined first "win"

If we now look at your rules:

ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;

it is clear that the lexer rule HEXDIG will never be matched because either ALPHA or DIGIT will match whatever HEXDIG matches and are defined before HEXDIG. Switching the order:

HEXDIG  : [0-9a-fA-F]           ;
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;

will not work either, because a digit will now never become a DIGIT token, and the letters a-f and A-F (an F, for example) will never become an ALPHA token.

Note that this is just a single example: there are more such cases in your lexer grammar.

A solution would be to move some of the responsibility to the parser instead of the lexer:

A : [aA];
B : [bB];
C : [cC];
D : [dD];
E : [eE];
F : [fF];
G : [gG];
H : [hH];
I : [iI];
J : [jJ];
K : [kK];
L : [lL];
M : [mM];
N : [nN];
O : [oO];
P : [pP];
Q : [qQ];
R : [rR];
S : [sS];
T : [tT];
U : [uU];
V : [vV];
W : [wW];
X : [xX];
Y : [yY];
Z : [zZ];

D0 : '0';
D1 : '1';
D2 : '2';
D3 : '3';
D4 : '4';
D5 : '5';
D6 : '6';
D7 : '7';
D8 : '8';
D9 : '9';

and then in the parser you do:

alpha
 : A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z
 ;

digit
 : D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9
 ;

hexdig
 : A | B | C | D | E | F | digit
 ;

Also, remove all the literal tokens like '6' from the parser and use the proper lexer rule instead (D6, in this case). Whenever the parser sees such a literal token that is not defined in the lexer, it "magically" creates a new token for it, resulting in mysterious error/warning messages. It is best to remove all (and I mean all!) such literal tokens from the parser.

Bart Kiers
  • 166,582
  • 36
  • 299
  • 288
  • Is there a solution that allows for `alpha`, `digit`, and `hexdig` in the lexer? I would like to use a shared lexer across multiple parsers. I'd like this for when code is generated. – Oliver Jun 20 '22 at 14:56
  • No, alas, in your case, there is not. – Bart Kiers Jun 20 '22 at 18:17
  • Thanks anyway. I was able to use them as parser rules in a shared grammar. – Oliver Jun 20 '22 at 23:01
1

In addition to the answer Bart made on the grammar--all correct--this is not how to write a split grammar!

You must have "parser grammar UriParser;" in UriParser.g4 (rename Uri.g4 to UriParser.g4), and "lexer grammar UriLexer;" in UriLexer.g4 (rename Common.g4 to UriLexer.g4).

If you try to generate the parser for your original "split" grammar, you get three .tokens files generated by the Antlr tool, all different in size and contents. That indicates there is likely no coordination of the token types between the lexer and parser. That doesn't have anything to do with the "token recognition error" because as Bart says, the lexer operates completely independently from the parser. But, it will have an impact when you start testing the grammar productions with other input.

Also, you should never include @header { package ...; } in the grammar. You need to use the -package option instead. Using the @header makes the grammar completely unportable to other targets, and creates a problem if you have multiple grammars in one directory, some with the @header and some without.

If you fix these problems, the code parses your input--with the caveat that your lexer rules are not correct (see Bart's answer).

It's not clear why you split the grammar to begin with.

UriParser.g4:

/**
 * Uniform Resource Identifier (RFC 3986).
 *
 * @author Oliver Yasuna
 * @see <a href="https://www.rfc-editor.org/rfc/rfc3986.html">RFC 3986</a>
 * @since 1.0.0
 */

parser grammar UriParser;

options {
  tokenVocab = UriLexer;
}

// Parser
//--------------------------------------------------

pctEncoded
  : '%' HEXDIG HEXDIG
  ;

reserved
  : genDelims | subDelims
  ;

genDelims
  : ':' | '/' | '?' | '#' | '[' | ']' | '@'
  ;

subDelims
  : '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
  ;

unreserved
  : ALPHA | DIGIT | '-' | '.' | '_' | '~'
  ;

uri
  : scheme ':' hierPart ('?' query)? ('#' fragment_)?
  ;

hierPart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathRootless
  | pathEmpty
  ;

scheme
  : ALPHA (ALPHA | DIGIT | '+' | '-' | '.')*
  ;

authority
  : (userinfo '@')? host (':' port)?
  ;

userinfo
  : (unreserved | pctEncoded | subDelims | ':')*
  ;

host
  : ipLiteral
  | ipv4Address
  | regName
  ;

ipLiteral
  : '[' (ipv6Address | ipvFuture) ']'
  ;

ipvFuture
  : 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+
  ;

// IPv6address (RFC 3986, section 3.2.2).
// Each alternative mirrors one line of the ABNF: the optional group on the left is what may
// precede the '::' elision, the part on the right is what must follow it.
// The first alternative is the fully written-out form (six "h16 ':'" groups plus ls32) and
// therefore contains no '::' at all.
ipv6Address
  :                                                                                 (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                            '::'           (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                    h16?  '::'                     (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                        ((h16 ':')? h16)? '::'                               (h16 ':') (h16 ':') (h16 ':') ls32
  |                                             ((h16 ':')? (h16 ':')? h16)? '::'                                         (h16 ':') (h16 ':') ls32
  |                                  ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                    h16 ':'  ls32
  |                       ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             ls32
  |            ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             h16
  | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'
  ;

ls32
  : (h16 ':' h16)
  | ipv4Address
  ;

h16
  : HEXDIG HEXDIG? HEXDIG? HEXDIG?
  ;

ipv4Address
  : decOctet '.' decOctet '.' decOctet '.' decOctet
  ;

decOctet
  : DIGIT
  | ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT
  | '1' DIGIT DIGIT
  | '2' ('0' | '1' | '2' | '3' | '4') DIGIT
  | '2' '5' ('0' | '1' | '2' | '3' | '4' | '5')
  ;

regName
  : (unreserved | pctEncoded | subDelims)*
  ;

port
  : DIGIT*
  ;

path
  : pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathRootless
  | pathEmpty
  ;

pathAbEmpty
  : ('/' segment)*
  ;

// path-absolute = "/" [ segment-nz *( "/" segment ) ]   (RFC 3986, section 3.3)
// Begins with '/' but not '//'; any number of further "/segment" parts may follow the first
// non-empty segment, hence '*' rather than '?'.
pathAbsolute
  : '/' (segmentNz ('/' segment)*)?
  ;

// path-noscheme = segment-nz-nc *( "/" segment )   (RFC 3986, section 3.3)
// First segment must be non-empty and contain no ':'; zero or more "/segment" parts may follow.
pathNoScheme
  : segmentNzNc ('/' segment)*
  ;

// path-rootless = segment-nz *( "/" segment )   (RFC 3986, section 3.3)
// Starts with a non-empty segment; zero or more "/segment" parts may follow.
pathRootless
  : segmentNz ('/' segment)*
  ;

pathEmpty
  : // TODO: 0<pchar>.
  ;

segment
  : pchar*
  ;

segmentNz
  : pchar+
  ;

segmentNzNc
  : (unreserved | pctEncoded | subDelims | '@')+
  ;

pchar
  : unreserved | pctEncoded | subDelims | ':' | '@'
  ;

query
  : (pchar | '/' | '?')*
  ;

fragment_
  : (pchar | '/' | '?')*
  ;

uriReference
  : uri
  | relativeRef
  ;

relativeRef
  : relativePart ('?' query)? ('#' fragment_)?
  ;

// relative-part = "//" authority path-abempty
//               / path-absolute
//               / path-noscheme
//               / path-empty                     (RFC 3986, section 4.2)
// Without a leading '//' authority, the path alternative is path-absolute (not path-abempty).
relativePart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathEmpty
  ;

absoluteUri
  : scheme ':' hierPart ('?' query)?
  ;

UriLexer.g4:

lexer grammar UriLexer;

// ASCII
//--------------------------------------------------

BANG                  : '!'  ;
//DOUBLE_QUOTE          : '"'  ;
HASH                  : '#'  ;
DOLLAR                : '$'  ;
PERCENT               : '%'  ;
AND                   : '&'  ;
SINGLE_QUOTE          : '\'' ;
LEFT_PARENTHESES      : '('  ;
RIGHT_PARENTHESES     : ')'  ;
STAR                  : '*'  ;
PLUS                  : '+'  ;
COMMA                 : ','  ;
MINUS                 : '-'  ;
DOT                   : '.'  ;
SLASH                 : '/'  ;
COLON                 : ':'  ;
SEMICOLON             : ';'  ;
LEFT_ANGLE_BRACKET    : '<'  ;
EQUAL                 : '='  ;
RIGHT_ANGLE_BRACKET   : '>'  ;
QUESTION              : '?'  ;
AT                    : '@'  ;
LEFT_SQUARE_BRACKET   : '['  ;
BACKSLASH             : '\\' ;
RIGHT_SQUARE_BRACKET  : ']'  ;
CARROT                : '^'  ;
UNDERSCORE            : '_'  ;
BACKTICK              : '`'  ;
LEFT_CURLY_BRACKET    : '{'  ;
BAR                   : '|'  ;
RIGHT_CURLY_BRACKET   : '}'  ;
TILDE                 : '~'  ;

// Core
//--------------------------------------------------

// Taken from ABNF.
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;
DQUOTE  : '"'                   ;
SP      : ' '                   ;
HTAB    : '\t'                  ;
WSP     : SP | HTAB             ;
//LWSP    : (WSP | CRLF WSP)*     ;
VCHAR   : [\u0021-\u007F]       ;
CHAR    : [\u0001-\u007F]       ;
OCTET   : [\u0000-\u00FF]       ;
CTL     : [\u0000-\u001F\u007F] ;
CR      : '\r'                  ;
LF      : '\n'                  ;
CRLF    : CR LF                 ;
BIT     : '0' | '1'             ;

// Miscellaneous
//--------------------------------------------------

DOUBLE_SLASH  : '//' ;
DOUBLE_COLON  : '::' ;

LOWER_V       : 'v'  ;

ZERO          : '0'  ;
ONE           : '1'  ;
TWO           : '2'  ;
THREE         : '3'  ;
FOUR          : '4'  ;
FIVE          : '5'  ;
SIX           : '6'  ;
SEVEN         : '7'  ;
EIGHT         : '8'  ;
NINE          : '9'  ;
kaby76
  • 1,142
  • 1
  • 7
  • 10
  • See my second comment on Bart's answer for why I split. – Oliver Jun 20 '22 at 15:07
  • @Oliver In that case I would recommend that you use "import" instead of trying to use a common generated lexer. Just make Uri.g4 and Common.g4 both combined grammars ("grammar Uri;" and "grammar Common;", respectively). – kaby76 Jun 20 '22 at 15:32
  • Got it, thank you. Also, I tried specifying `-package "com.oliveryasuna.http.antlr"` in the command line, but got an error: `error(2): unknown command-line option -package "com.oliveryasuna.http.antlr"`. Tried without quotes too. I see it is an [option](https://github.com/antlr/antlr4/blob/master/doc/tool-options.md#-package), so I do not know what's wrong. – Oliver Jun 20 '22 at 15:39
  • @Oliver `java -jar /c/Users/Kenne/Downloads/antlr4-4.10.1-complete.jar -package "com.oliveryasuna.http.antlr" *.g4` works for me. `grep package *.java` gives all the Java package declarations. How are you calling the Antlr tool? – kaby76 Jun 20 '22 at 15:56
  • 1
    I figured it out. Was a formatting error when using the `antlr4-maven-plugin`. – Oliver Jun 20 '22 at 16:01