-1

I'm trying to program a trie in C to read a file and add all the words in the file to the trie, and it works well, but I can't get it to accept apostrophes:

 /* One trie node: a word-end flag plus one child pointer per supported character. */
struct node
{
    /* Set to true by load() when a word ends at this node. */
    bool wordBool;

    /* Child links: 26 letters and one slot for the apostrophe. */
    struct node *next[27];
};

typedef struct node node;

/* Running count of words; load() increments it each time it flags a new word end. */
int numWords = 0;

/* Root of the trie; load() assigns it freshly allocated storage. */
node *base;

bool load(const char* dictionary)
{

FILE* dictionaryf = fopen(dictionary, "r"); // the file to read

base = malloc(sizeof(node));

node variable;
node *currNode = &variable;

int n = 0;

while((n = fgetc(dictionaryf)) != EOF)
{
   if (n == '\n')
   {
      if (!currNode->wordBool)
      {
          currNode->wordBool = true;
          numWords++;
      }
      currNode = base;
   }
   else if (n == 39) //I tried putting this in so it would accept apostrophes
    {
         if(currNode->next[n-39] == NULL)
        {
            currNode->next[n-39] = malloc(sizeof(node));
        } 
        currNode = currNode->next[n-39];
    }
   else {
        if(currNode->next[n-96] == NULL)
        {
            currNode->next[n-96] = malloc(sizeof(node));
        }      
        currNode = currNode->next[n-96];
   }
}
if (currNode!= base && !currNode->wordBool)
{
    currNode->wordBool = true; 
    numWords++;
}
printf("%i\n", numWords);
fclose(dictionaryf);
return true;
}

This is the code that builds the trie, but it won't put apostrophes into the trie.

MLShax
  • 13
  • 6
  • Hard coded numbers are generally (and certainly in this case) a terrible thing. Fortunately the language lets you use things like `'a'` and `'\''` to represent characters as their numeric parts. That said, it looks like your logic is placing your apostrophes in slot 0, but I'm concerned at your lack of range checking... what if the character is neither '\'' nor between 'a' and 'z' inclusive? – mah May 09 '16 at 01:15
  • Thanks, I fixed both of the problems you mentioned, the dictionary file I'm using only contains apostrophes and characters from the alphabet, so it shouldn't be a problem but I fixed it anyways since it could cause errors if I used a different load file. :) – MLShax May 09 '16 at 01:48
  • Note that `malloc` doesn't clear the memory, so you need to clear the memory yourself, or use `calloc` to allocate the memory. Also, you allocate memory for `base` but then point `currNode` at `variable`. That's not likely to end well. – user3386109 May 09 '16 at 02:09

1 Answer

2

This code is closely based on yours, but tackles a slightly different problem in that it accepts an arbitrary text file and processes it regardless of the characters in it. It treats accented characters as 'non-alphabetic' in the typical style of English-speakers (in part because it doesn't use setlocale(), and in part because it doesn't process multi-byte or wide characters). It counts the number of times each word is present (which takes up no extra space in the data structure on a 64-bit machine). It includes a printing function which is important for checking that it did the job correctly, of course.

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One trie node: word-end flag, occurrence count, and one child pointer per mapped character. */
struct node
{
    /* True once load() has seen a word end at this node. */
    bool wordBool;

    /* Incremented by load() each time a word ends at this node. */
    int wordCount;

    /* Child links: 26 letters and one slot for the apostrophe. */
    struct node *next[27];
};

typedef struct node node;

/* Character held by each child slot: index 0 is the apostrophe, 1..26 are 'a'..'z'. */
static const char trie_map[] = "'abcdefghijklmnopqrstuvwxyz";

/* Number of distinct words; load() bumps it the first time a word is seen. */
static int numWords = 0;

/* Root of the trie; null until load() allocates it. */
static node *base = NULL;

/* Report an allocation failure on stderr and end the program with EXIT_FAILURE. */
static void oom(void)
{
    fputs("Out of memory\n", stderr);
    exit(EXIT_FAILURE);
}

/*
 * Map a character to its child slot in a trie node.
 *
 * The character is folded to lower case and looked up in trie_map.
 * Returns the position of the match in trie_map, or -1 if the character
 * is not in the map.
 */
static int trie_index(char c)
{
    /* <ctype.h> functions need a value representable as unsigned char (or
     * EOF); a negative plain char would be undefined behaviour. */
    int lower = tolower((unsigned char)c);

    /* strchr() treats the terminating null as part of the string, so
     * searching for '\0' would "succeed" and yield an index one past the
     * last real entry of trie_map. */
    if (lower == '\0')
        return -1;

    const char *p = strchr(trie_map, lower);
    if (p == NULL)
        return -1;

    return (int)(p - trie_map);
}

static
bool load(const char *dictionary)
{
    FILE *dictionaryf = fopen(dictionary, "r"); // the file to read
    if (dictionaryf == 0)
        return false;

    base = calloc(sizeof(node), 1);

    node *currNode = base;

    int n;

    while ((n = fgetc(dictionaryf)) != EOF)
    {
        n = trie_index(n);
        if (n >= 0)
        {
            if (currNode->next[n] == NULL)
            {
                currNode->next[n] = calloc(sizeof(node), 1);
                if (currNode->next[n] == NULL)
                    oom();
            }
            currNode = currNode->next[n];
        }
        else if (currNode != base)
        {
            if (!currNode->wordBool)
            {
                currNode->wordBool = true;
                numWords++;
            }
            currNode->wordCount++;
            currNode = base;
        }
        /* else: consecutive non-letters, non-apostrophes */
    }

    if (currNode != base && !currNode->wordBool)
    {
        currNode->wordBool = true;
        numWords++;
    }
    printf("%i distinct words\n", numWords);
    fclose(dictionaryf);
    return true;
}

/*
 * Print every word stored at or below `trie`, depth first in slot order.
 *
 * `buffer` must hold the NUL-terminated prefix that leads to `trie` (the
 * empty string for the root) and `buflen` is its total capacity. The
 * buffer is used as scratch space for building each word and is restored
 * to the incoming prefix before the function returns.
 *
 * If a word would not fit in `buffer`, a message is written to stderr and
 * the program exits with EXIT_FAILURE.
 */
static void print_trie(node *trie, char *buffer, size_t buflen)
{
    if (trie == NULL)
        return;

    if (trie->wordBool)
        printf("Word: %3d [%s]\n", trie->wordCount, buffer);

    size_t len = strlen(buffer);
    size_t slots = sizeof(trie->next) / sizeof(trie->next[0]);

    for (size_t i = 0; i < slots; i++)
    {
        if (trie->next[i] == NULL)
            continue;

        /* Extending the prefix needs room for one more character plus the
         * terminator. The test is written so it cannot wrap around when
         * buflen is below 2, and it runs only when there is a child to
         * append, so a word that exactly fills the buffer still prints. */
        if (buflen < 2 || len > buflen - 2)
        {
            fprintf(stderr, "Word too long!\n[%s]\n", buffer);
            exit(EXIT_FAILURE);
        }

        buffer[len] = trie_map[i];
        buffer[len+1] = '\0';
        print_trie(trie->next[i], buffer, buflen);

        /* Drop the child's character so the caller gets its prefix back. */
        buffer[len] = '\0';
    }
}

/*
 * Load a text file into the trie and print every word with its count.
 *
 * With exactly one command-line argument, that argument is the file name;
 * otherwise the file "data" is used. Exits with status 0 on success and
 * EXIT_FAILURE if the file could not be loaded.
 */
int main(int argc, char **argv)
{
    const char *data = "data";
    if (argc == 2)
        data = argv[1];

    if (!load(data))
    {
        /* Diagnostics go to stderr, and the exit status must show the failure. */
        fprintf(stderr, "Load failed!\n");
        return EXIT_FAILURE;
    }

    printf("Loaded file '%s' OK\n", data);
    char buffer[256] = "";
    print_trie(base, buffer, sizeof(buffer));

    return 0;
}

When run on its own source code (trie-31.c), it produces:

94 distinct words
Loaded file 'trie-31.c' OK
Word:   3 [']
Word:   1 ['abcdefghijklmnopqrstuvwxyz]
Word:   1 [and]
Word:   1 [apostrophe]
Word:   1 [apostrophes]
Word:   2 [argc]
Word:   2 [argv]
Word:   7 [base]
Word:   2 [bool]
Word:  10 [buffer]
Word:   3 [buflen]
Word:   2 [c]
Word:   2 [calloc]
Word:   8 [char]
Word:   1 [consecutive]
Word:   3 [const]
Word:   1 [ctype]
Word:  14 [currnode]
Word:   1 [d]
Word:   5 [data]
Word:   2 [dictionary]
Word:   4 [dictionaryf]
Word:   1 [distinct]
Word:   4 [else]
Word:   1 [eof]
Word:   4 [exit]
Word:   1 [failed]
Word:   2 [failure]
Word:   1 [false]
Word:   1 [fclose]
Word:   1 [fgetc]
Word:   3 [file]
Word:   1 [fopen]
Word:   2 [for]
Word:   2 [fprintf]
Word:   5 [h]
Word:   7 [i]
Word:  14 [if]
Word:   5 [include]
Word:   2 [index]
Word:   7 [int]
Word:   4 [len]
Word:   2 [letters]
Word:   3 [load]
Word:   1 [loaded]
Word:   1 [long]
Word:   1 [main]
Word:   4 [map]
Word:   1 [memory]
Word:  16 [n]
Word:   7 [next]
Word:   8 [node]
Word:   2 [non]
Word:   2 [null]
Word:   4 [numwords]
Word:   1 [of]
Word:   1 [ok]
Word:   1 [one]
Word:   2 [oom]
Word:   1 [out]
Word:   3 [p]
Word:   3 [print]
Word:   4 [printf]
Word:   1 [r]
Word:   1 [read]
Word:   5 [return]
Word:   2 [s]
Word:   1 [s']
Word:   2 [size]
Word:   3 [sizeof]
Word:   1 [space]
Word:   7 [static]
Word:   1 [stdbool]
Word:   2 [stderr]
Word:   1 [stdio]
Word:   1 [stdlib]
Word:   1 [strchr]
Word:   1 [string]
Word:   1 [strlen]
Word:   2 [struct]
Word:   2 [t]
Word:   2 [the]
Word:   1 [to]
Word:   1 [tolower]
Word:   1 [too]
Word:  15 [trie]
Word:   3 [true]
Word:   1 [typedef]
Word:   3 [void]
Word:   1 [while]
Word:   2 [word]
Word:   6 [wordbool]
Word:   3 [wordcount]
Word:   1 [words]

Some of the words include apostrophes (there's ' on its own, 'abcdefghijklmnopqrstuvwxyz and s'). For the file great.panjandrum which contains:

So she went into the garden
to cut a cabbage-leaf
to make an apple-pie
and at the same time
a great she-bear coming down the street
pops its head into the shop
What no soap
So he died
and she very imprudently married the Barber
and there were present
the Picninnies
and the Joblillies
and the Garyulies
and the great Panjandrum himself
with the little round button at top
and they all fell to playing the game of catch-as-catch-can
till the gunpowder ran out at the heels of their boots

The output is:

66 distinct words
Loaded file 'great.panjandrum' OK
Word:   2 [a]
Word:   1 [all]
Word:   1 [an]
Word:   7 [and]
Word:   1 [apple]
Word:   1 [as]
Word:   3 [at]
Word:   1 [barber]
Word:   1 [bear]
Word:   1 [boots]
Word:   1 [button]
Word:   1 [cabbage]
Word:   1 [can]
Word:   2 [catch]
Word:   1 [coming]
Word:   1 [cut]
Word:   1 [died]
Word:   1 [down]
Word:   1 [fell]
Word:   1 [game]
Word:   1 [garden]
Word:   1 [garyulies]
Word:   2 [great]
Word:   1 [gunpowder]
Word:   1 [he]
Word:   1 [head]
Word:   1 [heels]
Word:   1 [himself]
Word:   1 [imprudently]
Word:   2 [into]
Word:   1 [its]
Word:   1 [joblillies]
Word:   1 [leaf]
Word:   1 [little]
Word:   1 [make]
Word:   1 [married]
Word:   1 [no]
Word:   2 [of]
Word:   1 [out]
Word:   1 [panjandrum]
Word:   1 [picninnies]
Word:   1 [pie]
Word:   1 [playing]
Word:   1 [pops]
Word:   1 [present]
Word:   1 [ran]
Word:   1 [round]
Word:   1 [same]
Word:   3 [she]
Word:   1 [shop]
Word:   2 [so]
Word:   1 [soap]
Word:   1 [street]
Word:  13 [the]
Word:   1 [their]
Word:   1 [there]
Word:   1 [they]
Word:   1 [till]
Word:   1 [time]
Word:   3 [to]
Word:   1 [top]
Word:   1 [very]
Word:   1 [went]
Word:   1 [were]
Word:   1 [what]
Word:   1 [with]
Jonathan Leffler
  • 730,956
  • 141
  • 904
  • 1,278