I have a fairly simple grammar designed to parse URIs. It is compiled with the help of antlr4-maven-plugin
. Compiling produces no warnings or errors. I wrote a simple test.
Uri.g4
:
/**
* Uniform Resource Identifier (RFC 3986).
*
* @author Oliver Yasuna
* @see <a href="https://www.rfc-editor.org/rfc/rfc3986.html">RFC 3986</a>
* @since 1.0.0
*/
grammar Uri;
options {
tokenVocab = Common;
}
@header {
package com.oliveryasuna.http.antlr;
}
// Parser
//--------------------------------------------------
pctEncoded
: '%' HEXDIG HEXDIG
;
reserved
: genDelims | subDelims
;
genDelims
: ':' | '/' | '?' | '#' | '[' | ']' | '@'
;
subDelims
: '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
;
unreserved
: ALPHA | DIGIT | '-' | '.' | '_' | '~'
;
uri
: scheme ':' hierPart ('?' query)? ('#' fragment_)?
;
hierPart
: '//' authority pathAbEmpty
| pathAbsolute
| pathRootless
| pathEmpty
;
scheme
: ALPHA (ALPHA | DIGIT | '+' | '-' | '.')*
;
authority
: (userinfo '@')? host (':' port)?
;
userinfo
: (unreserved | pctEncoded | subDelims | ':')*
;
host
: ipLiteral
| ipv4Address
| regName
;
ipLiteral
: '[' (ipv6Address | ipvFuture) ']'
;
ipvFuture
: 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+
;
ipv6Address
: '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
| '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
| h16? '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
| ((h16 ':')? h16)? '::' (h16 ':') (h16 ':') (h16 ':') ls32
| ((h16 ':')? (h16 ':')? h16)? '::' (h16 ':') (h16 ':') ls32
| ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::' h16 ':' ls32
| ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::' ls32
| ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::' h16
| ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'
;
ls32
: (h16 ':' h16)
| ipv4Address
;
h16
: HEXDIG HEXDIG? HEXDIG? HEXDIG?
;
ipv4Address
: decOctet '.' decOctet '.' decOctet '.' decOctet
;
decOctet
: DIGIT
| ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT
| '1' DIGIT DIGIT
| '2' ('0' | '1' | '2' | '3' | '4') DIGIT
| '2' '5' ('0' | '1' | '2' | '3' | '4' | '5')
;
regName
: (unreserved | pctEncoded | subDelims)*
;
port
: DIGIT*
;
path
: pathAbEmpty
| pathAbsolute
| pathNoScheme
| pathRootless
| pathEmpty
;
pathAbEmpty
: ('/' segment)*
;
pathAbsolute
: '/' (segmentNz ('/' segment)?)?
;
pathNoScheme
: segmentNzNc ('/' segment)?
;
pathRootless
: segmentNz ('/' segment)?
;
pathEmpty
: // TODO: 0<pchar>.
;
segment
: pchar*
;
segmentNz
: pchar+
;
segmentNzNc
: (unreserved | pctEncoded | subDelims | '@')+
;
pchar
: unreserved | pctEncoded | subDelims | ':' | '@'
;
query
: (pchar | '/' | '?')*
;
fragment_
: (pchar | '/' | '?')*
;
uriReference
: uri
| relativeRef
;
relativeRef
: relativePart ('?' query)? ('#' fragment_)?
;
relativePart
: '//' authority pathAbEmpty
| pathAbEmpty
| pathNoScheme
| pathEmpty
;
absoluteUri
: scheme ':' hierPart ('?' query)?
;
Common.g4
:
lexer grammar Common;
// ASCII
//--------------------------------------------------
BANG : '!' ;
//DOUBLE_QUOTE : '"' ;
HASH : '#' ;
DOLLAR : '$' ;
PERCENT : '%' ;
AND : '&' ;
SINGLE_QUOTE : '\'' ;
LEFT_PARENTHESES : '(' ;
RIGHT_PARENTHESES : ')' ;
STAR : '*' ;
PLUS : '+' ;
COMMA : ',' ;
MINUS : '-' ;
DOT : '.' ;
SLASH : '/' ;
COLON : ':' ;
SEMICOLON : ';' ;
LEFT_ANGLE_BRACKET : '<' ;
EQUAL : '=' ;
RIGHT_ANGLE_BRACKET : '>' ;
QUESTION : '?' ;
AT : '@' ;
LEFT_SQUARE_BRACKET : '[' ;
BACKSLASH : '\\' ;
RIGHT_SQUARE_BRACKET : ']' ;
CARROT : '^' ;
UNDERSCORE : '_' ;
BACKTICK : '`' ;
LEFT_CURLY_BRACKET : '{' ;
BAR : '|' ;
RIGHT_CURLY_BRACKET : '}' ;
TILDE : '~' ;
// Core
//--------------------------------------------------
// Taken from ABNF.
ALPHA : [a-zA-Z] ;
DIGIT : [0-9] ;
HEXDIG : [0-9a-fA-F] ;
DQUOTE : '"' ;
SP : ' ' ;
HTAB : '\t' ;
WSP : SP | HTAB ;
//LWSP : (WSP | CRLF WSP)* ;
VCHAR : [\u0021-\u007F] ;
CHAR : [\u0001-\u007F] ;
OCTET : [\u0000-\u00FF] ;
CTL : [\u0000-\u001F\u007F] ;
CR : '\r' ;
LF : '\n' ;
CRLF : CR LF ;
BIT : '0' | '1' ;
// Miscellaneous
//--------------------------------------------------
DOUBLE_SLASH : '//' ;
DOUBLE_COLON : '::' ;
LOWER_V : 'v' ;
ZERO : '0' ;
ONE : '1' ;
TWO : '2' ;
THREE : '3' ;
FOUR : '4' ;
FIVE : '5' ;
SIX : '6' ;
SEVEN : '7' ;
EIGHT : '8' ;
NINE : '9' ;
Test method:
@Test
final void google() {
final String uri = "https://www.google.com/";
final UriLexer lexer = new UriLexer(new ANTLRInputStream(uri));
final UriParser parser = new UriParser(new CommonTokenStream(lexer));
parser.addErrorListener(new BaseErrorListener() {
@Override
public void syntaxError(final Recognizer<?, ?> recognizer, final Object offendingSymbol, final int line, final int charPositionInLine, final String msg, final RecognitionException e) {
throw new IllegalStateException("[" + line + ":" + charPositionInLine + "] Symbol [" + offendingSymbol + "] produced error: " + msg + ".", e);
}
});
Assertions.assertDoesNotThrow(parser::uri);
}
I get the following errors when I input https://www.google.com/
.
I have absolute no idea what is causing these parsing errors. Does anyone have an idea?
Output:
line 1:0 token recognition error at: 'h'
line 1:1 token recognition error at: 't'
line 1:2 token recognition error at: 't'
line 1:3 token recognition error at: 'p'
line 1:4 token recognition error at: 's'
line 1:5 missing '6' at ':'