You could do this in the lexer by adding some custom code and using the more advanced features:
The gist of it is this:
- create some domain models that represent your SQL dialects containing the data about what escape/block chars are
- when you encounter a
#name(
in the lexer, enter into a "native" mode
- inside the "native" mode, check if you need to enter other modes (like a "block" mode or "quote" mode) or if you need to return to the default mode (needing to pop out of the "native" mode)
A small Java demo:
// DynamicLexer.g4
lexer grammar DynamicLexer;
@members {
private java.util.Map<String, SqlDialect> dialects;
private String dialect = null;
private Block block = null;
private Quote quote = null;
public DynamicLexer(java.util.Map<String, SqlDialect> dialects, CharStream input) {
this(input);
this.dialects = dialects;
}
private void setDialect(String token) {
this.dialect = token.replaceAll("[#(]", "");
}
private SqlDialect getDialect() {
SqlDialect sqlDialect = this.dialects.get(this.dialect);
if (sqlDialect == null) {
throw new RuntimeException("Unknown dialect: '" + this.dialect + "'");
}
return sqlDialect;
}
private boolean blockStartAhead() {
SqlDialect sqlDialect = this.getDialect();
for (Block b : sqlDialect.blocks) {
if (this.ahead(b.start)) {
this.consume(b.start);
this.block = b;
return true;
}
}
return false;
}
private boolean blockEndAhead() {
if (this.ahead(this.block.end)) {
this.consume(this.block.end);
return true;
}
return false;
}
private boolean quoteStartAhead() {
SqlDialect sqlDialect = this.getDialect();
for (Quote q : sqlDialect.quotes) {
if (this.ahead(q.start)) {
this.consume(q.start);
this.quote = q;
return true;
}
}
return false;
}
private boolean quoteEndAhead() {
if (this.ahead(this.quote.start)) {
this.consume(this.quote.start);
return true;
}
return false;
}
private boolean quoteEscapeAhead(boolean consume) {
if (this.ahead(this.quote.escape)) {
if (consume) {
this.consume(this.quote.escape);
}
return true;
}
return false;
}
private boolean ahead(String text) {
for (int i = 1; i <= text.length(); i++) {
if (this._input.LA(i) != text.charAt(i - 1)) {
return false;
}
}
return true;
}
private void consume(String text) {
for (int i = 1; i < text.length(); i++) {
this._input.consume();
}
}
}
SPACE : [ \t\r\n] -> skip;
EQUAL : '=';
ADD : '+';
INT : [0-9]+;
NATIVE : '#' [a-zA-Z]+ '(' {setDialect(getText());} -> pushMode(NATIVE_MODE);
mode NATIVE_MODE;
BLOCK_START : {blockStartAhead()}? . -> pushMode(BLOCK_MODE);
QUOTE_START : {quoteStartAhead()}? . -> pushMode(QUOTE_MODE);
LPAR : ')' -> popMode;
RPAR : '(' -> pushMode(NATIVE_MODE);
NATIVE_ATOM : [a-zA-Z0-9]+ | ~[a-zA-Z0-9];
mode BLOCK_MODE;
BLOCK_END : {blockEndAhead()}? . -> popMode;
BLOCK_ATOM : . ;
mode QUOTE_MODE;
ESCAPE : {quoteEscapeAhead(true)}? .;
QUOTE_END : {!quoteEscapeAhead(false) && quoteEndAhead()}? . -> popMode;
QUOTE_ATOM : .;
The lexer above can be used by the parser:
// DynamicParser.g4
parser grammar DynamicParser;
options {
tokenVocab=DynamicLexer;
}
parse
: EQUAL expr EOF
;
expr
: expr ADD expr
| native
| INT
;
native
: NATIVE native_atom* LPAR
;
native_atom
: NATIVE_ATOM
| LPAR
| RPAR
| native_block
| native_quote
;
native_block
: BLOCK_START BLOCK_ATOM* BLOCK_END
;
native_quote
: QUOTE_START ( ESCAPE | QUOTE_ATOM )* QUOTE_END
;
After generating the lexer and parser classes, test it with the following class:
public class Main {
public static void main(String[] args) {
Map<String, SqlDialect> dialects = new HashMap<>(){{
put("postgres", new SqlDialect("--",
new Block[]{ new Block("/*", "*/") },
new Quote[]{ new Quote("'", "''"), new Quote("\"", "\"\"") }));
put("mysql", new SqlDialect("#",
new Block[]{ new Block("/*", "*/") },
new Quote[]{ new Quote("'", "\\'"), new Quote("\"", "\\\""), new Quote("`", "```") }));
}};
String source = "=1 + #postgres(SELECT (ARRAY[(2),'-3' /*)(*/, (((3)))])[1])";
DynamicLexer lexer = new DynamicLexer(dialects, CharStreams.fromString(source));
CommonTokenStream stream = new CommonTokenStream(lexer);
stream.fill();
for (Token t : stream.getTokens()) {
System.out.printf("%-20s '%s'%n",
DynamicLexer.VOCABULARY.getSymbolicName(t.getType()),
t.getText().replace("\n", "\\n"));
}
lexer = new DynamicLexer(dialects, CharStreams.fromString(source));
DynamicParser parser = new DynamicParser(new CommonTokenStream(lexer));
ParseTree root = parser.parse();
System.out.println(root.toStringTree(parser));
}
}
class SqlDialect {
public final String commentStart;
public final List<Block> blocks;
public final List<Quote> quotes;
public SqlDialect(String commentStart, Block[] blocks, Quote[] quotes) {
this.commentStart = commentStart;
this.blocks = Arrays.asList(blocks);
this.quotes = Arrays.asList(quotes);
}
}
class Block {
public final String start;
public final String end;
public Block(String start, String end) {
this.start = start;
this.end = end;
}
}
class Quote {
public final String start;
public final String escape;
public Quote(String start, String escape) {
this.start = start;
this.escape = escape;
}
}
After running the Main
class, you will see the following being printed to your console:
EQUAL '='
INT '1'
ADD '+'
NATIVE '#postgres('
NATIVE_ATOM 'SELECT'
NATIVE_ATOM ' '
RPAR '('
NATIVE_ATOM 'ARRAY'
NATIVE_ATOM '['
RPAR '('
NATIVE_ATOM '2'
LPAR ')'
NATIVE_ATOM ','
QUOTE_START '''
QUOTE_ATOM '-'
QUOTE_ATOM '3'
QUOTE_END '''
NATIVE_ATOM ' '
BLOCK_START '/*'
BLOCK_ATOM ')'
BLOCK_ATOM '('
BLOCK_END '*/'
NATIVE_ATOM ','
NATIVE_ATOM ' '
RPAR '('
RPAR '('
RPAR '('
NATIVE_ATOM '3'
LPAR ')'
LPAR ')'
LPAR ')'
NATIVE_ATOM ']'
LPAR ')'
NATIVE_ATOM '['
NATIVE_ATOM '1'
NATIVE_ATOM ']'
LPAR ')'
EOF '<EOF>'
(parse =
(expr
(expr 1)
+
(expr
(native #postgres(
(native_atom SELECT)
(native_atom )
(native_atom ()
(native_atom ARRAY)
(native_atom [)
(native_atom ()
(native_atom 2)
(native_atom ))
(native_atom ,)
(native_atom
(native_quote ' - 3 '))
(native_atom )
(native_atom
(native_block /* ) ( */))
(native_atom ,)
(native_atom )
(native_atom ()
(native_atom ()
(native_atom ()
(native_atom 3)
(native_atom ))
(native_atom ))
(native_atom ))
(native_atom ])
(native_atom ))
(native_atom [)
(native_atom 1)
(native_atom ]) ))))
<EOF>)
(I manually indented the parse tree, it will be a single line when you run the Java code)