0

I'm creating a program to separate nonterminals from terminals for a programming language. An enumerator keeps track of the tag names for the nonterminals. The nonterminal ID is set as 0 and all others enumerate after (as it should). However, when the program finds the nonterminals, it prints a bizarre value of 1953167781, though some do come out as 0. The ones that seem to come out as 1953... are ones that are ordered like so:open int num = 7, num being the nonterminal. num is not seen as 0 but as 1953167781. Though, if it's a declaration like: guarded class test{} the nonterminal test is 0, rather than 1953167781. Is there a possible reason for this phenomena? Thank you in advance for your feedback! Below are a picture of the output and an example of the process that take place.

enter image description here

EDIT

For example let's create headers to hold our declarations:

#ifndef __TOK_H__
#define __TOK_H__

#define MAX (1024)

typedef enum {
    ID = 0,OPEN,CLOSED,GUARDED,ARTIFICIAL,STATIC,GLOBAL,CONT,SUPER,INT,FLOAT,CHAR,STRING,
    BOOL,COLLECTION,CLASS,FUNCT,METHOD,STRUCT,ENUM,IF,ELSE,_OR_,DO,UNTIL,UNLESS,FOR,
    FOREACH,IN,TRY,CATCH,EXCEPTION,RETURN,SKIP,BREAK,TERM,NEW,CALL,TRU,FALS,NIL,
    INUMBER,FNUMBER,VCHAR,VSTRING,UNDEF,LT,GT,LE,GE,EQ,NE,AND,OR,ADD,ADDE,INCRE,SUB,SUBE,
    DECRE,MULT,MULTE,DIV,DIVE,MOD,MODE,EX,EXE,INC,ASI,NOT,INHER,DOT,COM,LP,RP,LB,RB,LBR,
    RBR,APO,QUO,SEMCO,EOFT,NAT
} tokentype;

typedef struct {
    char str[MAX];
    tokentype type;
} token;

void sscan(FILE *);
token generate(FILE *fp);

#endif

This one too:

#ifndef __RESERVED_H__
#define __RESERVED_H__

char *keywords[] = { //40 keywords/reserved words
    "open","closed","guarded","artificial","static","global","cont","super",
    "int","float","char","string","bool","collection","class","funct","method",
    "struct","enum","if","else","or","do","until","unless","for","foreach","in",
    "try","catch","exception","return","skip","break","term","new","call","true",
    "false","null",
};
char *relative_operators[] = { //8 relops
    "<",">","<=",">=","==","!=","&&","||",
};
char operators[6] = { //6 operators
    "+-*/%^",
};
char delimeters[11] = { //11 delimeters
    ".,(){}[];'\"",
};
char unique_operators[] = { //5 unops
    "#!&|="
};

#endif

And then the (while not efficient in any way) scanner and separator:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "reserved.h"
#include "tok.h"

int linenum = 1;
char ch;
token TOKEN;

int isLangPunct(char ch);
int isKEY(char *word);
int isDELIM(char ch);
int isRELOP(char ch);
int isUNIQUE(char ch);
int isOPERA(char ch);
tokentype getKey(char *word);
tokentype getDel(char ch);

void sscan(FILE *fp) {
    while ((ch = fgetc(fp)) != EOF) {
        if (ch == '\n') {
            linenum++;
            continue;
        } else if (isLangPunct(ch) || isalnum(ch) || isspace(ch)) {
            continue;
        } else {
            fprintf(stderr, "Undefined character: %c at line %d.\n", ch, linenum);
        }
    }
    rewind(fp); linenum = 1;
    do {
        generate(fp);
    } while (TOKEN.type != EOFT);
}
int isLangPunct(char ch) {
    int r = 0;
    if (ch == '#' || ch == '|' || ch == '&' || ch == '=' || ch == '%' ||
        ch == '!' || ch == '+' || ch == '-' || ch == '*' || ch == '/' ||
        ch == '.' || ch == ',' || ch == '(' || ch == ')' || ch == ';' ||
        ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == '\'' ||
        ch == '"' || ch == '<' || ch == '>' || ch == '^') 
    {
        r = 1;
    } return r;
}
token generate(FILE *fp) {
    char *word = malloc(sizeof(char) * MAX);
    char *number = malloc(sizeof(char) * MAX);
    int wi = 0, ni = 0;
    while((ch = fgetc(fp)) != EOF) {
        if (ch == '\n') {
            linenum++; continue;
        } else if (ch == '\t' || ch == ' ' || ch == '\r') {
            continue;
        } else if (isalpha(ch)) {
            do {
                word[wi++] = ch;
            } while(isalpha(ch = fgetc(fp)));
            word[wi] = '\0';
            wi = 0;
            strcpy(TOKEN.str, word);
            if (isKEY(word)) {
                TOKEN.type = getKey(word);
            } else {
                TOKEN.type = ID;
            }
            fseek(fp, -1, SEEK_CUR);
            printf("%d ", (int)TOKEN.type);
            return TOKEN;
        } else if (isdigit(ch)) {
            do {
                number[ni++] = ch;
            } while(isdigit(ch = fgetc(fp)));
            if (ch == '.') {
                do {
                    number[ni++] = ch;
                } while(isdigit(ch = fgetc(fp)));
                TOKEN.type = FNUMBER;
            } else {
                TOKEN.type = INUMBER;
            }
            number[ni] = '\0';
            ni = 0;
            strcpy(TOKEN.str, number);
            printf("%s ", TOKEN.str);
            fseek(fp, -1, SEEK_CUR);
            return TOKEN;
        } else if (isDELIM(ch)) {
            TOKEN.type = getDel(ch);
            char *str = &ch;
            //working on getting strings and chars to have
            //their own value types
            strcpy(TOKEN.str, str);
            printf("%s ", TOKEN.str);
            return TOKEN;
        } else if (isRELOP(ch) || isUNIQUE(ch)) {
            switch (ch) {
                case '<':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = LE;
                        strcpy(TOKEN.str, "<=");
                    } else {
                        TOKEN.type = LT;
                        strcpy(TOKEN.str, "<");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '>':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = GE;
                        strcpy(TOKEN.str, ">=");
                    } else {
                        TOKEN.type = GT;
                        strcpy(TOKEN.str, ">");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '=':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = EQ;
                        strcpy(TOKEN.str, "==");
                    } else {
                        TOKEN.type = ASI;
                        strcpy(TOKEN.str, "=");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '!':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = NE;
                        strcpy(TOKEN.str, "!=");
                    } else {
                        TOKEN.type = NOT;
                        strcpy(TOKEN.str, "!");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '&':
                    if ((ch = fgetc(fp)) == '&') {
                        TOKEN.type = AND;
                        strcpy(TOKEN.str, "&&");
                    } else {
                        fprintf(stderr, "Token Error: & missing at line %d.\n", linenum);
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '|':
                    if ((ch = fgetc(fp)) == '|') {
                        TOKEN.type = OR;
                        strcpy(TOKEN.str, "||");
                    } else {
                        TOKEN.type = INHER;
                        strcpy(TOKEN.str, "|");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '#':
                    TOKEN.type = INC;
                    strcpy(TOKEN.str, "#");
            }
            printf("%s ", TOKEN.str);
            return TOKEN;
        } else if (isOPERA(ch)) {
            switch (ch) {
                case '+':
                    if ((ch = fgetc(fp)) == '+') {
                        TOKEN.type = INCRE;
                        strcpy(TOKEN.str, "++");
                    } else if (ch == '=') {
                        TOKEN.type = ADDE ;
                        strcpy(TOKEN.str, "+=");
                        fseek(fp, -1, SEEK_CUR);
                    } else {
                        TOKEN.type = ADD ;
                        strcpy(TOKEN.str, "+");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '-':
                    if ((ch = fgetc(fp)) == '-') {
                        TOKEN.type = DECRE;
                        strcpy(TOKEN.str, "--");
                    } else if (ch == '=') {
                        TOKEN.type = SUBE;
                        strcpy(TOKEN.str, "-=");
                        fseek(fp, -1, SEEK_CUR);
                    } else {
                        TOKEN.type = SUB;
                        strcpy(TOKEN.str, "-");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '*':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = MULTE;
                        strcpy(TOKEN.str, "*=");
                    } else {
                        TOKEN.type = MULT;
                        strcpy(TOKEN.str, "*");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '/':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = DIVE;
                        strcpy(TOKEN.str, "/=");
                    } else if (ch == '/') {
                        do {
                            continue;
                        } while ((ch = fgetc(fp)) != '\n');
                    } else {
                        TOKEN.type = DIV;
                        strcpy(TOKEN.str, "/");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '%':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = MODE;
                        strcpy(TOKEN.str, "%=");
                    } else {
                        TOKEN.type = MOD;
                        strcpy(TOKEN.str, "%");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
                case '^':
                    if ((ch = fgetc(fp)) == '=') {
                        TOKEN.type = EXE;
                        strcpy(TOKEN.str, "^=");
                    } else {
                        TOKEN.type = EX;
                        strcpy(TOKEN.str, "^");
                        fseek(fp, -1, SEEK_CUR);
                    } break;
            }
            printf("%s ", TOKEN.str);
            return TOKEN;
        }
    }
    free(word);
    free(number);
    TOKEN.type = EOFT;
    strcpy(TOKEN.str, "\0");
    printf("%s ", TOKEN.str);
    return TOKEN;
}
int isKEY(char *word) {
    int r = 0;
    for (int i = 0; i < 40; i++) {
        if (*word == *keywords[i]) {
            r = 1;
        }
    }
    return r;
}
int isDELIM(char ch) {
    int r = 0;
    for (int i = 0; i < 11; i++) {
        if (ch == delimeters[i]) {
            r = 1;
        }
    }
    return r;
}
int isRELOP(char ch) {
    int r = 0;
    if (ch == '<' || ch == '>') {
        r = 1;
    }
    return r;
}
int isUNIQUE(char ch) {
    int r = 0;
    for (int i = 0; i < 5; i++) {
        if (ch == unique_operators[i]) {
            r = 1;
        }
    }
    return r;
}
int isOPERA(char ch) {
    int r = 0;
    for (int i = 0; i < 6; i++) {
        if (ch == operators[i]) {
            r = 1;
        }
    }
    return r;
}
tokentype getKey(char *word) {
    tokentype type;
    if (!strcmp(word, "open")) { type = OPEN; }
    else if (!strcmp(word, "closed")) { type = CLOSED; }
    else if (!strcmp(word, "guarded")) { type = GUARDED; }
    else if (!strcmp(word, "artificial")) { type = ARTIFICIAL; }
    else if (!strcmp(word, "static")) { type = STATIC; }
    else if (!strcmp(word, "global")) { type = GLOBAL; }
    else if (!strcmp(word, "cont")) { type = CONT; }
    else if (!strcmp(word, "super")) { type = SUPER; }
    else if (!strcmp(word, "int")) { type = INT; }
    else if (!strcmp(word, "float")) { type = FLOAT; }
    else if (!strcmp(word, "char")) { type = CHAR; }
    else if (!strcmp(word, "string")) { type = STRING; }
    else if (!strcmp(word, "bool")) { type = BOOL; }
    else if (!strcmp(word, "collection")) { type = COLLECTION; }
    else if (!strcmp(word, "class")) { type = CLASS; }
    else if (!strcmp(word, "funct")) { type = FUNCT; }
    else if (!strcmp(word, "method")) { type = METHOD; }
    else if (!strcmp(word, "struct")) { type = STRUCT; }
    else if (!strcmp(word, "enum")) { type = ENUM; }
    else if (!strcmp(word, "if")) { type = IF; }
    else if (!strcmp(word, "else")) { type = ELSE; }
    else if (!strcmp(word, "or")) { type = _OR_; }
    else if (!strcmp(word, "do")) { type = DO; }
    else if (!strcmp(word, "until")) { type = UNTIL; }
    else if (!strcmp(word, "unless")) { type = UNLESS; }
    else if (!strcmp(word, "for")) { type = FOR; }
    else if (!strcmp(word, "foreach")) { type = FOREACH; }
    else if (!strcmp(word, "in")) { type = IN; }
    else if (!strcmp(word, "try")) { type = TRY; }
    else if (!strcmp(word, "catch")) { type = CATCH; }
    else if (!strcmp(word, "exception")) { type = EXCEPTION; }
    else if (!strcmp(word, "return")) { type = RETURN; }
    else if (!strcmp(word, "skip")) { type = SKIP; }
    else if (!strcmp(word, "break")) { type = BREAK; }
    else if (!strcmp(word, "term")) { type = TERM; }
    else if (!strcmp(word, "new")) { type = NEW; }
    else if (!strcmp(word, "call")) { type = CALL; }
    else if (!strcmp(word, "true")) { type = TRU; }
    else if (!strcmp(word, "false")) { type = FALS; }
    else if (!strcmp(word, "null")) { type = NIL; }
    return type;
}
tokentype getDel(char ch) {
    tokentype type;
    if (ch == '.') { type = DOT; }
    if (ch == ',') { type = COM; }
    if (ch == '(') { type = LP; }
    if (ch == ')') { type = RP; }
    if (ch == '{') { type = LB; }
    if (ch == '}') { type = RB; }
    if (ch == '[') { type = LBR; }
    if (ch == ']') { type = RBR; }
    if (ch == '"') { type = QUO; }
    if (ch == '\'') { type = APO; }
    if (ch == ';') { type = SEMCO; }
    return type;
}

In a separate or even on the same file as above, main() is able to open a file and read it sending the file stream to sscan(FILE *). And for the last piece a example file to read:

# call Sys;
# call SysIO;

static int num = 7;
cont global float fl = 12.895;
cont char letter = 'h';

collection Program {
    cont closed string aswkey = "abdcbabcdabdbcdab";
    open class quiz {
        bool decision = true;
        artificial method.study (int time) {};
        open quiz() {
            //do something
        };
    };
    class test | quiz {
        bool descision;
        super method.study (int time) {
            if (decision == true) {
                //do something
            } or if (time == 0) {
                //do something
            } else {
                //do nothing
            };
            time = 50;
        };
        guarded test (bool n) {
            descision = n;
        };
    };
    funct.Enter () {
        quiz Quiz = new quiz();
        test Test = new test(false);
        Test.study(60);
        if (something != this) {
            //do something
        };
        term;
    };
};
Sora
  • 87
  • 1
  • 8
  • 3
    Please try to create a [Minimal, Complete, and Verifiable Example](http://stackoverflow.com/help/mcve) and show us. It will be hard to impossible to *guess* otherwise. – Some programmer dude Mar 13 '17 at 00:23
  • 1
    However, the problem is most likely not where you think it is. Are you copying the `TOKEN` somewhere, member by member, and forget to copy the `type` member? Or do you go out of bounds of `TOKEN.str` anywhere? Oh and `TOKEN.str` *is* an array and not only an (uninitialized) pointer? Use a *debugger* to step through the code, while keeping an eye on the variables and their values. – Some programmer dude Mar 13 '17 at 00:26
  • 1
    if the `word` array is too small to hold long words, then too many characters will be put in there and overwrite subsequent memory, possibly the `type` field. The `do` loop should have a check against the maximum length. – Peter - Reinstate Monica Mar 13 '17 at 00:27
  • I see what you guys mean, well as you can see and for what you can't see; `TOKEN` is not copied, the bounds of both `word` and `TOKEN.str` are not out of bounds (if I knew it was going to be out of bounds, I would've already had something to tell me, but I'm sure a Kilobyte of data space is plenty sufficient for at least the test to hold strings). As for the point about memory overwriting, that's one reason for the large string size and also the reason I reset the memory before returning the `TOKEN`. – Sora Mar 13 '17 at 03:50
  • 1
    @Sora Btw. I may miss something but: the scanner converts a character stream to a token stream. Every token should be a _terminal_ in the grammar. (Otherwise it is an error token not fitting into the language.) _Non-terminals_ are like variables - they may not appear in the input ("word"). The [German Wikipedia: Nichtterminal](https://de.wikipedia.org/wiki/Nichtterminalsymbol) is quite clear about this but has unfortunately no English translation. However, [Wikipedia: Grammar](https://en.wikipedia.org/wiki/Formal_grammar) also notes about non-terminals... – Scheff's Cat Mar 13 '17 at 07:27
  • I know, it is the variables that are having the enumeration issue, but only some of them, can't understand why. – Sora Mar 13 '17 at 08:00
  • @Sora 1. I'm not quite sure whether the question was put on hold because you state your problem. May be not clear enough. 2. To make this a complete example you forgot to provide `main()`. This was simple. So, I did it by myself. 3. There is a syntax error in `sscan()`: `isSpringPunc()` not found. I replaced it by `isLangPunct()`. 4. There is a memory leak in `generate()`. Why do you use `malloc()` for `word` and `number`. Use local arrays -> no memory leak. 5. My actual hint with `default` statements for debugging was leading in the right direction. – Scheff's Cat Mar 13 '17 at 17:50
  • 1
    @Scheff I fixed the things you told me, but I also found that making the last else statement in `getKey(word)` corrected my above question. I would answer my own question extensively but it's still on hold. I also had to make some other changes that affect the outcomes of the enumerations. I'll post an answer when and if I can. – Sora Mar 13 '17 at 21:44
  • @Sora However. You fixed your problem - that's what counts. – Scheff's Cat Mar 14 '17 at 01:55
  • One tip I can give , try to recreate your problem as easy as possible, which will lead you to solution easily – Sorcrer Mar 14 '17 at 05:33
  • The `getKey` function returns an uninitialized variable if no cases match , you should have a "default" case at the end; or initialize the variable. – M.M Mar 16 '17 at 06:51

1 Answers1

0

So to understand what was went wrong, let's look at what happened. In the getKey(word) function, when a word was sent there to be compared against a keyword it would return the tokentype type to the TOKEN.type declaration. However if the word did not match any of the comparisons, then it still returned only this time as the variable of type which is of address 1953167781 on my computer specifically. So, by adding a final else statement to getKey(word) which set the tokentype type to ID, it matched all nonterminals to ID. This however does not explain why words like Sys and SysIO didn't also become 1953167781. And it is a bit confusing how if(isKEY(word))could not return false, it always seemed returned true. Though it had to return false to in order to go to the else statement that would effectively set all nonterminals to ID. But this still poses a problem which we will not discuss but should ponder: why would nonterminals like Sys and SysIO become ID's (note that the getKey(word) function does not have the new final else statement) inside of a always true if statement, when it has to return false in order for that to happen, and why would other similar words not have the same course taken? Regardless of that, it is working fine and I'll leave it to the experts to figure out why it did what it did in the previous statement.

Sora
  • 87
  • 1
  • 8