16

Is it possible to make YACC (or I'm my case MPPG) output an Abstract Syntax Tree (AST).

All the stuff I'm reading suggests its simple to make YACC do this, but I'm struggling to see how you know when to move up a node in the tree as your building it.

Sprotty
  • 5,676
  • 3
  • 33
  • 52

3 Answers3

9

Expanding on Hao's point and from the manual, you want to do something like the following:

Assuming you have your abstract syntax tree with function node which creates an object in the tree:

expr : expr '+' expr
  {  
     $$ = node( '+', $1, $3 );  
  }

This code translates to "When parsing expression with a plus, take the left and right descendants $1/$3 and use them as arguments to node. Save the output to $$ (the return value) of the expression.

$$ (from the manual):

To return a value, the action normally sets the pseudovariable ``$$'' to some value.

AlexeyMK
  • 6,245
  • 9
  • 36
  • 41
6

Have you looked at the manual (search for "parse tree" to find the spot)? It suggests putting node creation in an action with your left and right descendants being $1 and $3, or whatever they may be. In this case, yacc would be moving up the tree on your behalf rather than your doing it manually.

hao
  • 10,138
  • 1
  • 35
  • 50
  • 1
    Thanks, I'd seen this before in the lexx & yacc book. But i'd kind of written it off as a dead end. To get it to all hang together you need to modify the LexType which is define in the union tag %union { private State _state; ... LexValue(State state, object child1, object child2) {...} } This allows you to store a tree node as your state. You can then assign data to it using the $$ alias $$ = new LexValue(State.SchemaImport, $3, $5); Note: The lexer needs to push its token data into this structure as well. Easy when you know how... – Sprotty Jun 04 '09 at 19:17
2

The other answers propose to modify the grammar, this isn't doable when playing with a C++ grammer (several hundred of rules..)

Fortunately, we can do it automatically, by redefining the debug macros. In this code, we are redefining YY_SYMBOL_PRINT actived with YYDEBUG :

%{

typedef struct tree_t {
    struct tree_t **links;
    int nb_links;
    char* type; // the grammar rule
};

#define YYDEBUG 1
//int yydebug = 1;

tree_t *_C_treeRoot;
%}
%union tree_t

%start program

%token IDENTIFIER
%token CONSTANT

%left '+' '-'
%left '*' '/'
%right '^'

%%
progam: exprs { _C_treeRoot = &$1.t; }
    |
    | hack
    ;

exprs:
    expr ';'
    | exprs expr ';'
    ;


number:
    IDENTIFIER
    | '-' IDENTIFIER
    | CONSTANT
    | '-' CONSTANT
    ;

expr:
    number
    | '(' expr ')'
    | expr '+' expr
    | expr '-' expr
    | expr '*' expr
    | expr '/' expr
    | expr '^' expr
    ;

hack:
    {
    // called at each reduction in YYDEBUG mode
    #undef YY_SYMBOL_PRINT
    #define YY_SYMBOL_PRINT(A,B,C,D) \
        do { \
            int n = yyr2[yyn]; \
            int i; \
            yyval.t.nb_links = n; \
            yyval.t.links = malloc(sizeof *yyval.t.links * yyval.t.nb_links);\
            yyval.t.str = NULL; \
            yyval.t.type = yytname[yyr1[yyn]]; \
            for (i = 0; i < n; i++) { \
              yyval.t.links[i] = malloc(sizeof (YYSTYPE)); \
              memcpy(yyval.t.links[i], &yyvsp[(i + 1) - n], sizeof(YYSTYPE)); \
            } \
        } while (0)

    }
    ;
%%

#include "lexer.c"


int yyerror(char *s) {
    printf("ERROR : %s [ligne %d]\n",s, num_ligne);
    return 0;
}


int doParse(char *buffer)
{
    mon_yybuffer = buffer;
    tmp_buffer_ptr = buffer;
    tree_t *_C_treeRoot = NULL;
    num_ligne = 1;
    mon_yyptr = 0;

    int ret = !yyparse();

    /////////****
             here access and print the tree from    _C_treeRoot 
    ***///////////
}


char *tokenStrings[300] = {NULL};
char *charTokenStrings[512];

void initYaccTokenStrings()
{
    int k;
    for (k = 0; k < 256; k++)
    {
        charTokenStrings[2*k] = (char)k;
        charTokenStrings[2*k+1] = 0;
        tokenStrings[k] = &charTokenStrings[2*k];
    }
    tokenStrings[CONSTANT] = "CONSTANT";
    tokenStrings[IDENTIFIER] = "IDENTIFIER";


    extern char space_string[256];

    for (k = 0; k < 256; k++)
    {
        space_string[k] = ' ';
    }
}

the leafs are created just before the RETURN in the FLEX lexer

reuns
  • 854
  • 10
  • 14