0

I have the following config file that I am trying to parse.

[ main ]
e_type=0x1B
username="username"
appname="applicationname"

In the lex file (test.l) shown below, the regular expression for STR is \"[^\"]*\", so it recognizes everything within quotes, including the quotes themselves. When I access the value of "username" or "applicationname" inside the parser file using the $N variable, it contains the literal string, quotation marks included. I just want username and applicationname, i.e. without the string quotation marks.

Is there a standard way to achieve this?

I have the following lex file (test.l)

/* Scanner for the INI-style config file: section headers in brackets,
   key=value lines with hexadecimal numbers and double-quoted strings. */
%option noyywrap
%option yylineno
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "y.tab.h"

/* Line counter incremented by hand in the {NEWLINE} rule below; this is a
   separate variable from the yylineno enabled by %option yylineno. */
int yylinenu = 1;
/* Column counter; none of the rule actions shown below update it directly. */
int yycolno=1;

/**
 * Forward declarations
 *
 * NOTE(review): Custom_tag() and Generic_string() are called from the
 * rules below but have no prototype here.
 **/
void Number ();
void HexaNumber ();
unsigned char getHexaLex (char c);
unsigned int strtol16 (char * str);


%}

%option nounput
%option noinput
%option case-insensitive

/*-----------------------------------------------------------------
   Some macros (standard regular expressions)

   HEXANUMBER is "0x" followed by one or more digits or hex letters.
   STR is a double-quoted string; the match includes both quote
   characters, and there is no escape for an embedded quote.
------------------------------------------------------------------*/

DIGIT       [0-9]
HEXALETTER  [a-fA-F]
HEXANUMBER  [0][x](({DIGIT}|{HEXALETTER})+)
NUM         {DIGIT}+
HEXA        ({DIGIT}|{HEXALETTER}|[*])
STR         \"[^\"]*\"
WSPACE      [ \t]*
NEWLINE     [\n\r]        

/*----------------------------------------------------------------
   The lexer rules

   Keywords (e_type, main, appname, username) and punctuation come
   first; %option case-insensitive above makes the keyword patterns
   match in any letter case.

   NOTE(review): the e_type and main actions store yytext itself in
   yylval.str rather than a copy. yytext is the scanner's own buffer,
   so confirm the parser never keeps that pointer past the next token.

   NOTE(review): the HEXANUMBER action calls atoi() on text that starts
   with "0x" (atoi stops at the 'x') and then calls HexaNumber(), whose
   definition is not shown here; presumably HexaNumber() stores the
   real value - confirm.

   NOTE(review): appname and username both return T_APPNAME - confirm
   that this is intended.

   The single-character catch-all rule matches anything except tab,
   newline and carriage return and discards it. flex prefers the
   longest match, so STR still wins for a complete quoted string.
------------------------------------------------------------------*/
%%

e_type                   { yylval.str = yytext; return T_E_TYPE; }
main                     { yylval.str = yytext; return T_MAIN_SECTION;}
{HEXANUMBER}             { yylval.n = atoi(yytext);  HexaNumber(); return T_NUMBER; }
=                        { return T_EQUAL; }
"["                      { return T_OPEN_BRACKET; }
"]"                      { return T_CLOSE_BRACKET;}
appname                  { Custom_tag(); return T_APPNAME; }
username                 { Custom_tag(); return T_APPNAME; }

[^\t\n\r]                { }
{WSPACE}                 { } /* whitespace: (do nothing) */
{NEWLINE}                {  
                            yylinenu++;
                            return T_EOL;
                         }
{STR}                    { Generic_string(); return T_STRING;}                      

%%

/*
 * Convert the text of the current token to a number with atol() and
 * store the result in yylval.n.
 */
void Number () {
    const char *token_text = yytext;

    yylval.n = atol(token_text);
}

/*
 * Store a heap-allocated copy of the current token text (yytext) in
 * yylval.str. The text is copied unchanged, so a STR token keeps its
 * surrounding double quotes.
 *
 * If the allocation fails, yylval.str is set to NULL instead of copying
 * through a null pointer. The copy is obtained from malloc(), so whoever
 * consumes yylval.str is responsible for calling free() on it.
 */
void Generic_string() {
    size_t text_len = strlen(yytext);
    char *copy = malloc(text_len + 1);

    if (copy != NULL) {
        /* text_len + 1 also copies the terminating NUL. */
        memcpy(copy, yytext, text_len + 1);
    }
    yylval.str = copy;
}
Brian Tompsett - 汤莱恩
  • 5,753
  • 72
  • 57
  • 129
liv2hak
  • 14,472
  • 53
  • 157
  • 270
  • You're returning `yytext` directly in `yylval.str`, which is wrong -- the token buffer will change for the next token read, causing your symbols to seemingly randomly get munged. You need to make a copy of the `yytext` and return a pointer to that. – Chris Dodd Oct 07 '15 at 18:56

1 Answers1

1

You have a pointer to the matched token (yytext) and its length (yyleng), so it is quite simple to remove the quotes:

/*
 * Store a heap-allocated copy of the current token in yylval.str, with
 * its first and last characters (the surrounding quotes) removed.
 *
 * yylval.str is set to NULL if the token is shorter than two characters
 * or if the allocation fails. The copy is obtained from malloc(), so
 * whoever consumes yylval.str is responsible for calling free() on it.
 */
void Generic_string() {
    if (yyleng < 2) {                        // no room for two quotes
        yylval.str = NULL;
        return;
    }

    size_t inner_len = (size_t)yyleng - 2;   // length - 2 (quotes)
    char *copy = malloc(inner_len + 1);      // + 1 (NUL)

    if (copy != NULL) {
        memcpy(copy, yytext + 1, inner_len); // copy all but quotes
        copy[inner_len] = 0;                 // NUL-terminate
    }
    yylval.str = copy;
}

Personally, I'd suggest avoiding use of global variables in Generic_string, both to simplify future implementation of a reentrant scanner, and to make the process a bit more flexible:

{STR}   { yylval.str = duplicate_segment(yytext + 1, yyleng - 2);
          /* The copy starts after the opening quote and stops before the closing one. */
          return T_STRING;
        }

  /* ... */

/*
 * Return a newly allocated, NUL-terminated copy of the first
 * token_length bytes starting at token.
 *
 * Returns NULL if token is NULL, token_length is negative, or the
 * allocation fails. The caller owns the returned buffer and releases it
 * with free().
 */
char* duplicate_segment(const char* token, int token_length) {
  if (token == NULL || token_length < 0) {
    return NULL;
  }

  size_t length = (size_t)token_length;
  char* dup = malloc(length + 1);  /* + 1 for the terminating NUL */
  if (!dup) {
    /* Report the failure to the caller instead of writing through NULL. */
    return NULL;
  }

  memcpy(dup, token, length);
  dup[length] = 0;
  return dup;
}
rici
  • 234,347
  • 28
  • 237
  • 341