0

I'm working on an interpreter for a programming language I made up. Here's some example code, which should work but currently dies with `Syntax error at offset 45.` when reading this test case.

{
  foo = { "min" : 1 ,"max" : 5};
  foo["min"]
}

The correct interpretation is for the first line with foo to create a map and store it in a variable named foo, the second line looks up the value of the field min in the record foo, and the starting/ending curlies together with the semicolon wrap the two expressions into an expr_seq (i.e. a block) which evaluates to the same thing as the last expr in it.

A simplified version of my parser.mly is as follows:

(* Token declarations. INT carries an int payload and VAR a string payload;
   the remaining tokens are punctuation and carry no value. *)
%token <int> INT
%token <string> VAR
%token SEMI COMMA COLON ASSIGN QUOTE
%token LBRACK RBRACK LCURL RCURL
%token EOF

(* Entry point of the grammar; a successful parse yields an int. *)
%start <int> main

%%

(* A whole program: exactly one expression followed by end of input.
   An input consisting only of EOF raises Failure "empty input". *)
main:
| EOF
    { failwith "empty input" }
| e = exp EOF
    { e }

(* Expressions. In this simplified grammar every action returns the
   placeholder value 0; only the variable-lookup action has a visible side
   effect (it prints the variable name).

   NOTE(review): the productions "v = VAR" and
   "v = VAR LBRACK f = q_var RBRACK ASSIGN e = exp" both start with VAR.
   After reading VAR with LBRACK as the next token, a one-token-lookahead
   parser has to choose between reducing VAR to exp (needed for the map
   lookup rule "e = exp LBRACK ...") and shifting LBRACK (needed for the
   map field assignment), although only the later presence or absence of
   ASSIGN tells the two apart. This looks like a shift/reduce conflict;
   starting the field-assignment production with exp instead of VAR would
   give both rules a common prefix and defer the choice until ASSIGN is
   seen -- confirm with menhir --explain. *)
exp:
| INT
    { 0 }
| e = exp LBRACK v = q_var RBRACK
    { (* map lookup *) 0 }
| v = VAR ASSIGN e = exp
    { (* assign to var *) 0 }
| v = VAR LBRACK f = q_var RBRACK ASSIGN e = exp
    { (* assign to map field *) 0 }
| v = VAR
    { Printf.printf "lookup %s\n" v; 0 }
| LCURL e = expr_seq RCURL
    { (* Block expression *) 0 }
| LCURL f = fields RCURL
    { (* map literal *)0 }

(* Non-empty, COMMA-separated list of "key : value" pairs inside a map
   literal. Right-recursive; yields a list of (key, value) tuples in source
   order. *)
fields:
| v = q_var COLON e = exp
    { [(v, e)] }
| v = q_var COLON e = exp COMMA vt = fields
    { (v, e) :: vt }

(* A quoted name: a VAR between two QUOTE tokens. Prints the name (debug
   trace) and returns it. *)
q_var:
| QUOTE v = VAR QUOTE
    { Printf.printf "qvar %s\n" v; v }

(* Non-empty, SEMI-separated sequence of expressions (the body of a block).
   Right-recursive; yields the list of expression values in source order. *)
expr_seq:
| e = exp
    {[e]}
|e1 = exp SEMI e2 = expr_seq
    {e1 :: e2} 

Trying to debug it on my own, I found that if I remove the rule `| v = VAR LBRACK f = q_var RBRACK ASSIGN e = exp`, the parser accepts the test case and it runs correctly, but I'd really like to be able to set things in maps.

I'm 98% confident that the problem lies in my mly file, but a simplified version of my lexer.mll is as follows:

{
  open Parser
  open Printf
}

(* [token lexbuf] returns the next Parser token read from [lexbuf].

   Whitespace (space, tab, carriage return, newline) is skipped.
   Integer literals are either a single 0 or a non-zero digit followed by
   any digits, so leading zeros are still rejected but 0 itself is accepted.
   Variable names are runs of lowercase ASCII letters.

   The VAR, '[' and ']' rules print a debug trace to stdout before
   returning their token.

   Raises Failure with the offending offset on any other character. *)
rule token = parse
| [' ' '\t' '\r' '\n']
    { token lexbuf }
| "="
    { ASSIGN }
| ('0' | ['1'-'9']['0'-'9']*) as i
    { INT (int_of_string i) }
| ['a'-'z']+ as v
    { printf "var %s\n" v; VAR v }
| '{'
    { LCURL }
| '}'
    { RCURL }
| '['
    { printf "["; LBRACK }
| ']'
    { printf "]"; RBRACK }
| ';'
    { SEMI }
| ':'
    { COLON }
| ','
    { COMMA }
| '"'
    { QUOTE }
| eof
    { EOF }
| _
    { failwith (sprintf "At offset %d: unexpected character.\n"
                  (Lexing.lexeme_start lexbuf)) }

And a simple ml file is:

open Core.Std
open Printf

(* [read_all ic] reads [ic] until end of file and returns its contents as
   one string.

   Each line is followed by a newline in the result. [input_line] strips
   line terminators, so gluing lines together without a separator would
   fuse a token ending one line with a token starting the next, and would
   make lexer offsets disagree with the original input. As a consequence
   the result always ends with a newline when the input is non-empty, even
   if the last input line had none.

   The loop is tail-recursive and accumulates into a Buffer, so long inputs
   neither overflow the stack nor cost quadratic time in concatenation. *)
let read_all ic =
  let buf = Buffer.create 1024 in
  let rec loop () =
    (* Catch End_of_file outside the recursive call to keep it a tail call. *)
    let next = try Some (input_line ic) with End_of_file -> None in
    match next with
    | Some ln ->
        Buffer.add_string buf ln;
        Buffer.add_char buf '\n';
        loop ()
    | None -> Buffer.contents buf
  in
  loop ();;

(* Program entry point: slurp stdin, run the parser over it, and print
   "Done" on success. A Failure (raised by the lexer or by a semantic
   action) has its message written to stderr; a parser syntax error is
   reported on stderr together with the start offset of the current
   lexeme. *)
let () =
  let source = read_all stdin in
  let lexbuf = Lexing.from_string source in
  try
    (* The parse result is not used; bind it explicitly instead of
       discarding it with a sequence operator. *)
    let _result : int = Parser.main Lexer.token lexbuf in
    printf "Done"
  with
  | Parser.Error ->
      let offset = Lexing.lexeme_start lexbuf in
      fprintf stderr "Syntax error at offset %d.\n%!" offset
  | Failure msg ->
      fprintf stderr "%s%!" msg

Edit: Here's a Makefile. parser.mly, lexer.mll, and interpreter.ml are the second, third, and fourth files above.

# Builds the HB interpreter from parser.mly, lexer.mll and interpreter.ml.
# "all" and "clean" do not name files, so they are declared phony.
.PHONY: all clean

all: HB lexer.cmx parser.cmx interpreter.cmx
    @true

# Link step. parser.cmx and lexer.cmx are rebuilt transitively through
# interpreter.cmx.
HB: interpreter.cmx
    ocamlfind ocamlopt -o HB -linkpkg -package core -package core_kernel \
    -thread -w -10 parser.cmx lexer.cmx interpreter.cmx

# interpreter.ml is listed as a prerequisite so that editing the driver
# triggers a recompile; lexer.cmx (and through it parser.cmx) must exist
# first because the driver refers to both modules.
interpreter.cmx: interpreter.ml lexer.cmx
    ocamlfind ocamlopt -package core -package core_kernel -thread -w -10 \
    -c interpreter.ml

# The generated lexer opens Parser, so parser.cmx has to be built first.
lexer.cmx: lexer.ml parser.cmx
    ocamlfind ocamlopt -c lexer.ml

# Generate parser.ml/parser.mli with Menhir, then compile the interface
# before the implementation.
parser.cmx: parser.mly
    menhir --ocamlc "ocamlfind ocamlc" --infer --base parser  parser.mly
    ocamlfind ocamlc -c parser.mli
    ocamlfind ocamlopt -c parser.ml

lexer.ml: lexer.mll
    ocamllex lexer.mll

# Remove the executable and all generated files; never fails, even when
# there is nothing to delete.
clean:
    @rm HB *.o *.cmi *.cmx lexer.ml parser.ml parser.mli 2>/dev/null || true

and here's making / running it, where test.in is the first one above.

$ mk;HB < test.in
ocamllex lexer.mll
menhir --ocamlc "ocamlfind ocamlc" --infer --base parser  parser.mly
15 states, 286 transitions, table size 1234 bytes
Warning: 3 states have shift/reduce conflicts.
Warning: 3 shift/reduce conflicts were arbitrarily resolved.
ocamlfind ocamlc -c parser.mli
ocamlfind ocamlopt -c parser.ml
ocamlfind ocamlopt    -c lexer.ml
ocamlfind ocamlopt -package core -package core_kernel -thread -w -10 \
    -c interpreter.ml
ocamlfind ocamlopt -o HB -linkpkg -package core -package core_kernel \
    -thread -w -10 parser.cmx lexer.cmx interpreter.cmx
Syntax error at offset 45.
var foo
var min
qvar min
var max
qvar max
var foo
[var min
]qvar min

Edit 2: I ended up just adding | e = VAR LBRACK v = q_var RBRACK { GetMap(v,LookupVar(e)) } as a special case to my parser. So, problem solved?

Joshua Snider
  • 705
  • 1
  • 8
  • 34
  • 2
    I don't feel like I can help you because your example is too large to understand in the time I have, while simultaneously being unusable as actual source code that I could run through the toolchain. Best would be to get your problem stripped down to something very small. Second best would be to give actual code that shows your problem. – Jeffrey Scofield Apr 23 '15 at 04:37
  • 3
    Usually a good way to understand problems with a grammar is to pass it to `menhir --interpret` and look at the derivations. – Pierre Chambart Apr 23 '15 at 09:37
  • @JeffreyScofield: Added a Makefile and explained how to run it. The code above is about as stripped down as I can get it, the testcase is parsed correctly if you remove the `(* assign to map field *)` and breaks further if you remove the rest. – Joshua Snider Apr 23 '15 at 21:03
  • @PierreChambart: Typing `LCURL VAR ASSIGN LCURL QUOTE VAR QUOTE COLON INT COMMA QUOTE VAR QUOTE COLON INT RCURL SEMI VAR LBRACK QUOTE VAR QUOTE RBRACK RCURL` which is the correct interpretation of the test case into `menhir --interpret` prints `ACCEPT`. Is there a way to get menhir to vomit out what tokens it had already matched when it dies? – Joshua Snider Apr 23 '15 at 21:29
  • Adding a bunch of print statements makes it print out `Syntax error at offset 45. LCURL VAR ASSIGN LCURL QUOTE VAR QUOTE COLON INT COMMA QUOTE VAR QUOTE COLON INT RCURL SEMI VAR LBRACK QUOTE VAR QUOTE RBRACK RCURL`, which should be accepted since it's the same as what I typed into `menhir --interpret`. – Joshua Snider Apr 23 '15 at 22:15

2 Answers2

1

I've tried to play with your language, and now I agree with the parser: your input is bad. Look at your "assign to map field" rule:

v = VAR LBRACK f = q_var RBRACK ASSIGN e = exp

if we remove these noisy variables (which you don't need to use, by the way):

VAR LBRACK q_var RBRACK ASSIGN exp

that means that rule expects:

VAR, '[' '"' VAR '"' ']' '=' exp

for example

foo["min"] = 42

the following is perfectly accepted

{
  foo = { "min" : 1 ,"max" : 5};
  foo["min"] = 42
}
ivg
  • 34,431
  • 2
  • 35
  • 63
  • Tried your suggestion, but it didn't change the output. – Joshua Snider Apr 23 '15 at 20:40
  • Ok, looks like your parser is correct; your input wasn't well formed – ivg Apr 24 '15 at 01:15
  • The rule I'm trying to match with the `foo["min"]` is `| e = exp LBRACK v = q_var RBRACK { (* map lookup *) 0 }` which is supposed to work the same way as it would for python dicts, where it looks up the value in the map. – Joshua Snider Apr 24 '15 at 02:55
  • The `["min"]` part matches the `LBRACK q_var RBRACK` in the trivial way, the foo matches the `v = VAR` rule and then is the `e = exp` at the beginning. If you remove the "assign to map field" rule, my parser matches it without complaint. – Joshua Snider Apr 24 '15 at 03:05
0

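Looking back at this, I believe the reason it doesn't work is that the grammar is not LR(1) and therefore cannot be parsed accurately by Menhir. After reading the VAR in foo["min"], with LBRACK as the next token, the parser already has to decide whether to reduce foo to an exp (for e = exp LBRACK v = q_var RBRACK) or to keep shifting (for v = VAR LBRACK f = q_var RBRACK ASSIGN e = exp). Telling the two apart requires looking ahead several symbols, past the closing bracket, to see whether an ASSIGN follows, and Menhir, as an LR(1) parser generator, only looks ahead one token.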

Joshua Snider
  • 705
  • 1
  • 8
  • 34