0

I am trying to make a transcompiler with Bison and Flex. This is my first time using these tools. My goal is to convert a very basic code from a simple custom language called "facile" to CIL. The problem I am facing is that when I run my transcompiler for the following test file, it throws me the error: Line 1: syntax error, unexpected end of file, expecting identifier.

Here is the test file in "facile" language:

read a;
read b;
c := a+b;
print c;

Also, my transcompiler shows a segfault when I run it for an empty file.

Here's the language description:

Language description

Flex file facile.lex :

%{
    #include <glib.h>
    #include <assert.h>
    
    #include "facile.y.h"

%}

%option yylineno

%%

if {
    assert(printf("'if' found"));
    return TOK_IF;
}

then {
    assert(printf("'then' found"));
    return TOK_THEN;
}

elsif {
    assert(printf("'elsif' found"));
    return TOK_ELSIF;
}

else {
    assert(printf("'else' found"));
    return TOK_ELSE;
}

read {
    assert(printf("'read' found"));
    return TOK_READ;
}

print {
    assert(printf("'print' found"));
    return TOK_PRINT;
}

while {
    assert(printf("'while' found"));
    return TOK_WHILE;
}

do {
    assert(printf("'do' found"));
    return TOK_DO;
}

end {
    assert(printf("'end' found"));
    return TOK_END;
}

endwhile {
    assert(printf("'endwhile' found"));
    return TOK_ENDWHILE;
}

continue {
    assert(printf("'continue' found"));
    return TOK_CONTINUE;
}

break {
    assert(printf("'break' found"));
    return TOK_BREAK;
}

";" {
    assert(printf("';' found"));
    return TOK_SEMI_COLON;
}

":=" {
    assert(printf("':=' found"));
    return TOK_AFFECTATION;
}

"+" {
    assert(printf("'+' found"));
    return TOK_ADD;
}

"-" {
    assert(printf("'-' found"));
    return TOK_SUB;
}

"*" {
    assert(printf("'*' found"));
    return TOK_MUL;
}

"/" {
    assert(printf("'/' found"));
    return TOK_DIV;
}

"(" {
    assert(printf("'(' found"));
    return TOK_OPEN_PAR;
}

")" {
    assert(printf("')' found"));
    return TOK_CLOSE_PAR;
}

true {
    assert(printf("'true' found"));
    return TOK_TRUE;
}

false {
    assert(printf("'false' found"));
    return TOK_FALSE;
}

">=" {
    assert(printf("'>=' found"));
    return TOK_SUP_EQ;
}

"<=" {
    assert(printf("'<=' found"));
    return TOK_INF_EQ;
}

">" {
    assert(printf("'>' found"));
    return TOK_SUP;
}

"<" {
    assert(printf("'<' found"));
    return TOK_INF;
}

"=" {
    assert(printf("'=' found"));
    return TOK_EQ;
}

"#" {
    assert(printf("'#' found"));
    return TOK_DIFF;
}

not {
    assert(printf("'not' found"));
    return TOK_NOT;
}

and {
    assert(printf("'and' found"));
    return TOK_AND;
}

or {
    assert(printf("'or' found"));
    return TOK_OR;
}

[0-9]+ {
    assert(printf("number '%s(%d)' found", yytext, yyleng));
    sscanf(yytext, "%lu", &yylval.number);
    return TOK_NUMBER;
}

[a-zA-Z][a-zA-Z0-9_]* {
    assert(printf("indentifier '%s(%d)' found", yytext, yyleng));
    yylval.string = strdup(yytext);
    return TOK_IDENTIFIER;
}

[ \t\r\n]*;

. {
    return 0;
}

%%

/*
 * file: facile.lex
 * version: 0.8.0
 */

Bison file facile.y :

%{
    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>
    #include <glib.h>

    extern int yylex(void);
    extern int yyerror(const char *msg);
    extern int yylineno;
    extern FILE *stdin;
    GHashTable *table;

    void begin_code();
    void produce_code(GNode *node);
    void end_code();

%}

%union {
    gulong number;
    gchar *string;
    GNode *node;
}

%define parse.error verbose

%token<number>  TOK_NUMBER           "number"
%token<string>  TOK_IDENTIFIER       "identifier"
%token          TOK_IF               "if"
%token          TOK_THEN             "then"
%token          TOK_SEMI_COLON       ";"
%token          TOK_AFFECTATION      ":="
%left           TOK_ADD              "+"
%left           TOK_SUB              "-"
%left           TOK_MUL              "*"
%left           TOK_DIV              "/"
%left           TOK_SUP_EQ           ">="
%left           TOK_INF_EQ           "<="
%left           TOK_SUP              ">"
%left           TOK_INF              "<"
%left           TOK_EQ               "="
%left           TOK_DIFF             "#"
%left           TOK_NOT              "not"
%left           TOK_AND              "and"
%left           TOK_OR               "or"
%token          TOK_ELSIF            "elsif"
%token          TOK_ELSE             "else"
%token          TOK_READ             "read"
%token          TOK_PRINT            "print"
%token          TOK_WHILE            "while"
%token          TOK_DO               "do"
%token          TOK_END              "end"
%token          TOK_ENDWHILE         "endwhile"
%token          TOK_CONTINUE         "continue"
%token          TOK_BREAK            "break"
%token          TOK_OPEN_PAR         "("
%token          TOK_CLOSE_PAR        ")"
%token          TOK_TRUE             "true"
%token          TOK_FALSE            "false"

%type<node>     code
%type<node>     expression
%type<node>     instruction
%type<node>     identifier
%type<node>     print
%type<node>     read
%type<node>     affectation
%type<node>     number
%type<node>     boolean
%type<node>     elsif
%type<node>     else
%type<node>     if
%type<node>     code_while
%type<node>     while
%type<node>     program

%%

program:
    code {
        begin_code();
        produce_code($1);
        end_code();
        g_node_destroy($1);
    };

code:
    code instruction{
        $$ = g_node_new("code");
        g_node_append($$, $1);
        g_node_append($$, $2);
    }
    |
    {
        $$ = g_node_new("");
    } ;

instruction:
    read |
    print |
    affectation |
    if |
    while ;

read:
    TOK_READ identifier TOK_SEMI_COLON
    {
        $$ = g_node_new("read");
        g_node_append($$, $2);
    };

print:
    TOK_PRINT expression TOK_SEMI_COLON
    {
        $$ = g_node_new("print");
        g_node_append($$, $2);
    };

affectation:
    identifier TOK_AFFECTATION expression TOK_SEMI_COLON
    {
        $$ = g_node_new("affectation");
        g_node_append($$, $1);
        g_node_append($$, $3);
    };

boolean:
    TOK_TRUE |
    TOK_FALSE |
    expression TOK_SUP_EQ expression |
    expression TOK_INF_EQ expression |
    expression TOK_SUP expression |
    expression TOK_INF expression |
    expression TOK_EQ expression |
    expression TOK_DIFF expression |
    TOK_NOT boolean |
    boolean TOK_AND boolean |
    boolean TOK_OR boolean |
    TOK_OPEN_PAR boolean TOK_CLOSE_PAR;

elsif:  TOK_ELSIF boolean TOK_THEN code |
        TOK_ELSIF boolean TOK_THEN code elsif;

else:
    TOK_ELSE code;

if:
    TOK_IF boolean TOK_THEN code TOK_END |
    TOK_IF boolean TOK_THEN code else TOK_END |
    TOK_IF boolean TOK_THEN code elsif TOK_END |
    TOK_IF boolean TOK_THEN code elsif else TOK_END;

code_while:
    instruction |
    TOK_CONTINUE |
    TOK_BREAK;

while:  TOK_WHILE boolean TOK_DO code_while TOK_ENDWHILE |
        TOK_WHILE boolean TOK_DO code_while TOK_END;

expression : 
    identifier |
    number |
    expression TOK_ADD expression
    {
        $$ = g_node_new("add");
        g_node_append($$, $1);
        g_node_append($$, $3);
    }
    |
    expression TOK_SUB expression
    {
        $$ = g_node_new("sub");
        g_node_append($$, $1);
        g_node_append($$, $3);
    }
    |
    expression TOK_MUL expression
    {
        $$ = g_node_new("mul");
        g_node_append($$, $1);
        g_node_append($$, $3);
    }
    |
    expression TOK_DIV expression
    {
        $$ = g_node_new("div");
        g_node_append($$, $1);
        g_node_append($$, $3);
    }
    |
    TOK_OPEN_PAR expression TOK_CLOSE_PAR{
        $$ = $2;
    };
    
identifier: 
    TOK_IDENTIFIER
    {
        $$ = g_node_new("identifier");
        gulong value = (gulong) g_hash_table_lookup(table, $1);
        if (!value) {
            value = g_hash_table_size(table) + 1;
            g_hash_table_insert(table, strdup($1), (gpointer) value);
        }
        g_node_append_data($$, (gpointer)value);
    };

number: 
    TOK_NUMBER
    {
        $$ = g_node_new("number");
        g_node_append_data($$, (gpointer)$1);
    };

%%

/*
 * file: facile.y
 * version: 0.8.0
 */

int yyerror(const char *msg) {
    fprintf(stderr, "Line %d: %s\n", yylineno, msg);
}


void begin_code()
{
    FILE *stream;
    char *module_name;
    int max_stack;

    fprintf(stream,
        ".assembly %s {}\n"
        ".method public static void Main() cil managed\n"
        "{\n"      
        "   .entrypoint\n"
        "   .maxstack %u\n"
        "   .locals init (",
        module_name,
        max_stack
    );
    guint size = g_hash_table_size(table);
    guint i;
    for (i = 0; i < size; i++) {
        if (i) {
            fprintf(stream, ", ");
        }
        fprintf(stream, "int32");
    }
    fprintf(stream, ")\n");
}

int main(int argc, char *argv[]) {
    if(argc == 2){
        char *file_name_input = argv[1];
        char *extension;
        char *directory_limiter;
        char *basename;
        FILE *stream;
        char *module_name;

        extension = rindex(file_name_input, '.');
        if(!extension || strcmp(extension, ".facile") != 0) {
            fprintf(stderr, "Input filename extension must be '.facile'\n");
            return EXIT_FAILURE;
        }
        directory_limiter = rindex(file_name_input, '/');
        if (!directory_limiter) {
            directory_limiter = rindex(file_name_input, '\\');
        }
        if (directory_limiter) {
            basename = strdup(directory_limiter + 1);
        } else {
            basename = strdup(file_name_input);
        }
        module_name = strdup(basename);
        *rindex(module_name, '.') = '\0';
        strcpy(rindex(basename, '.'), ".il");
        char *onechar = module_name;
        if (!isalpha(*onechar) && *onechar != '_'){
            free(basename);
            fprintf(stderr, "Base input filename must start with a letter or an underscore");
            return EXIT_FAILURE;
        }
        onechar++;
        while (*onechar) {
            if (!isalnum(*onechar) && *onechar != '_') {
                free(basename);
                fprintf(stderr, "Base input filename cannot contain special characters");
                return EXIT_FAILURE;
            }
            onechar++;
        }
        if (stdin = fopen(file_name_input, "r")) {
            if(stream = fopen(basename, "w")){
                table = g_hash_table_new_full(g_str_hash, g_str_equal, free, NULL);
                yyparse();
                g_hash_table_destroy(table);
                fclose(stream);
                fclose(stdin);
            } else {
                free(basename);
                fclose(stdin);
                fprintf(stderr, "Output filename cannot be opened\n");
                return EXIT_FAILURE;
            }
        } else {
            free(basename);
            fprintf(stderr, "Input filename cannot be opened\n");
            return EXIT_FAILURE;
        }
        free(basename); 
    } else {
        fprintf(stderr, "No input filename given\n");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}

void produce_code(GNode *node) {
    FILE *stream;
    char *module_name;

    if (node->data == "code"){
        produce_code(g_node_nth_child(node, 0));
        produce_code(g_node_nth_child(node, 1));
    } else if (node->data == "affectation") {
        produce_code(g_node_nth_child(node, 1));
        fprintf(stream, "stloc\t%ld\n", (long)g_node_nth_child(g_node_nth_child(node, 0), 0)->data - 1);
    } else if (node->data == "add") {
        produce_code(g_node_nth_child(node, 0));
        produce_code(g_node_nth_child(node, 1));
        fprintf(stream, "add\n");
    } else if (node->data == "sub"){
        produce_code(g_node_nth_child(node, 0));
        produce_code(g_node_nth_child(node, 1));
        fprintf(stream, "sub\n");
    } else if (node->data == "mul"){
        produce_code(g_node_nth_child(node, 0));
        produce_code(g_node_nth_child(node, 1));
        fprintf(stream, "mul\n");
    } else if (node->data == "div"){
        produce_code(g_node_nth_child(node, 0));
        produce_code(g_node_nth_child(node, 1));
        fprintf(stream, "div\n");
    } else if (node->data == "number") {
        fprintf(stream, "ldc.i4\t%ld\n", (long)g_node_nth_child(node, 0)->data);
    } else if (node->data == "identifier"){
        fprintf(stream, "ldloc\t%ld\n", (long)g_node_nth_child(node, 0)->data - 1);
    } else if (node->data == "print"){
        produce_code(g_node_nth_child(node, 0));
        fprintf(stream, "call void class [mscorlib]System.Console::WriteLine(int32)\n");
    } else if (node->data == "read") {
        fprintf(stream, "call string class [mscorlib]System.Console::ReadLine()\n");
        fprintf(stream, "call int32 class [mscorlib]System.Int32::Parse(string)\n");
        fprintf(stream, "stloc\t%ld\n", (long)g_node_nth_child(g_node_nth_child(node, 0), 0)->data - 1);
    }
}

void end_code()
{
    FILE *stream;
    fprintf(stream, "   ret\n}\n");
}

I don't know what to do, I tried simplifying the language just to test but I couldn't manage to make it work.

Your help would be very appreciated!

Thanks!

Piotr Siupa
  • 3,929
  • 2
  • 29
  • 65
Luidéo
  • 35
  • 1
  • 4
  • The error message says that the error occurred in the first line so either you're not counting the lines properly or you're not using the input you said you are. You should fix that first so diagnosing the main error will be easier. Edit: Maybe the problem is something like this? https://stackoverflow.com/q/31524630/3052438 – Piotr Siupa Mar 31 '23 at 10:54
  • 2
    The problem may be what your rule for unmatched character does. It is customary to just return the character, like this: `. { return yytext[0]; }` but instead you have `. { return 0; }`. Parser interprets `0` as EOF so each time the lexer finds something it cannot match, the parser will produce an error similar to the one you got. – Piotr Siupa Mar 31 '23 at 11:04
  • 1
    https://stackoverflow.com/questions/50821203/how-can-i-debug-my-flex-bison-grammar – Piotr Siupa Mar 31 '23 at 11:05

0 Answers0