3

When running the following C file, copying the character to fgetc to my tmp pointer results in unknown characters being copied over for some reason. The characters received from fgetc() are the expected characters. However, for some reason when assigning this character to my tmp pointer unknown characters get copied over.

I've tried looking for the reason why online, but haven't found any luck. From what I have read it could be something to do with UTF-8 and ASCII issues. However, I'm not sure about the fix. I'm a relatively new C programmer and still new to memory management.

Output:

TMP: Hello, DATA!�
TEXT: Hello, DATA!�

game.c:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <allegro5/allegro5.h>
#include <allegro5/allegro_font.h>

const int WIN_WIDTH = 1366;
const int WIN_HEIGHT = 768;

char *readFile(const char *fileName) {
    FILE *file;
    file = fopen(fileName, "r");

    if (file == NULL) {
        printf("File could not be opened for reading.\n");
    }

    size_t tmpSize = 1;
    char *tmp = (char *)malloc(tmpSize);

    if (tmp == NULL) {
        printf("malloc() could not be called on tmp.\n");
    }

    for (int c = fgetc(file); c != EOF; c = fgetc(file)) {
        if (c != NULL) {
            if (tmpSize > 1)
                tmp = (char *)realloc(tmp, tmpSize);

            tmp[tmpSize - 1] = (char *)c;
            tmpSize++;
        }
    }
    tmp[tmpSize] = 0;

    fclose(file);
    printf("TMP: %s\n", tmp);
    return tmp;
}

int main(int argc, char **argv) {
    al_init();
    al_install_keyboard();

    ALLEGRO_TIMER* timer = al_create_timer (1.0 / 30.0);
    ALLEGRO_EVENT_QUEUE *queue = al_create_event_queue();

    ALLEGRO_DISPLAY* display = al_create_display(WIN_WIDTH, WIN_HEIGHT);
    ALLEGRO_FONT* font = al_create_builtin_font();

    al_register_event_source(queue, al_get_keyboard_event_source());
    al_register_event_source(queue, al_get_display_event_source(display));
    al_register_event_source(queue, al_get_timer_event_source(timer));

    int redraw = 1;
    ALLEGRO_EVENT event;

    al_start_timer(timer);

    char *text = readFile("game.DATA");
    printf("TEXT: %s\n", text);

    while (1) {
        al_wait_for_event(queue, &event);
        if (event.type == ALLEGRO_EVENT_TIMER)
            redraw = 1;
        else if ((event.type == ALLEGRO_EVENT_KEY_DOWN) || (event.type == ALLEGRO_EVENT_DISPLAY_CLOSE))
            break;
        
        if (redraw && al_is_event_queue_empty(queue)) {
            al_clear_to_color(al_map_rgb(0, 0, 0));
            al_draw_text(font, al_map_rgb(255, 255, 255), 0, 0, 0, text);
            al_flip_display();

            redraw = false;
        }
    }

    free(text);
    al_destroy_font(font);
    al_destroy_display(display);
    al_destroy_timer(timer);
    al_destroy_event_queue(queue);

    return 0;
}

game.DATA file:

Hello, DATA!

What I use to run the program:

gcc game.c -o game $(pkg-config allegro-5 allegro_font-5 --libs --cflags)

--EDIT--

I tried taking the file reading code and running it in a new c file, for some reason it works there, but not when in the game.c file with allegro code.

test.c:

#include <stdlib.h>
#include <stdio.h>

char *readFile(const char *fileName) {
    FILE *file;
    file = fopen(fileName, "r");

    if (file == NULL) {
        printf("File could not be opened for reading.\n");
    }

    size_t tmpSize = 1;
    char *tmp = (char *)malloc(tmpSize);

    if (tmp == NULL) {
        printf("malloc() could not be called on tmp.\n");
    }

    for (int c = fgetc(file); c != EOF; c = fgetc(file)) {
        if (c != NULL) {
            if (tmpSize > 1)
                tmp = (char *)realloc(tmp, tmpSize);

            tmp[tmpSize - 1] = (char *)c;
            tmpSize++;
        }
    }
    tmp[tmpSize] = 0;

    fclose(file);
    printf("TMP: %s\n", tmp);
    return tmp;
}

void main() {
    char *text = readFile("game.DATA");
    printf("TEXT: %s\n", text);

    free(text);
    return 0;
}

Produces the correct output always:

TMP: Hello, DATA!
TEXT: Hello, DATA!
chqrlie
  • 131,814
  • 10
  • 121
  • 189
  • 3
    Hello, please provide a [mcve] to help answerer to focus on the specific issue. – Amessihel Feb 05 '21 at 20:23
  • 2
    It's a bit silly to check the return value of `malloc` when you don't check `realloc`. If you're not checking both you may as well check neither. That aside, it looks like you're probably not allocating enough space for the string, and Allegro is stepping on the terminating zero-byte. I'd double-check that `realloc` loop, it seems fishy. – trent Feb 05 '21 at 21:30
  • Not your problem, but: you don't want this: `if (c != NULL)`. – Steve Summit Feb 05 '21 at 21:50
  • Also not your problem, but you don't want the cast in `tmp[tmpSize-1] = (char *) c`. Just use `tmp[tmpSize-1] = c;`. – Steve Summit Feb 05 '21 at 21:57
  • `File could not be opened for reading.` is the canonical example of a useless error message. Try: `file = fopen(fileName, "r"); if (file == NULL) {perror(fileName); exit(EXIT_FAILURE);`} – William Pursell Feb 05 '21 at 22:58

2 Answers2

4

When you write a loop that updates various things each time through, like you do with tmpSize in your loop here, it's important to have a handle on what the theoretical computer science types call your "loop invariants". That is, what is it that's true each time through the loop? It's important not only to maintain your loop invariants properly, but also to pick your loop invariants so that they're easy to maintain, and easy for a later reader to understand and to verify.

Since tmpSize starts out as 1, I'm guessing your loop invariant is trying to be, "tmpSize is always one more than the size of the string I've read so far". A reason for picking that slightly-strange loop invariant is, of course, that you'll need that extra byte for the terminating \0. The other clue is that you're setting tmp[tmpSize-1] = c;.

But here's the first problem. When we exit the loop, and if tmpSize is still one more than the size of the string you've read so far, let's see what happens. Suppose we read three characters. So tmpSize should be 4. So we'll set tmp[4] = 0;. But wait! Remember, arrays in C are 0-based. So the three characters we read are in tmp[0], tmp[1], and tmp[2], and we want the terminating \0 character to go into tmp[3], not tmp[4]. Something is wrong.

But actually, it's worse than that. I wasn't at all sure I understood the loop invariant, so I cheated, and inserted a few debugging printouts. Right before the realloc call, I added

printf("realloc %zu\n", tmpSize);

and at the end, right before the tmp[tmpSize] = 0; line, I added

printf("final %zu\n", tmpSize);

The last few lines it printed (while reading a game.DATA file containing "Hello, DATA!" just like yours) were:

...
realloc 10
realloc 11
realloc 12
final 13

But this is off by two! If the last reallocation gave the array a size of 12, the valid indices are from 0 to 11. But somehow we end up writing the \0 into cell 13.

It took me a while to figure it out, but the second problem is that you do the reallocation at the top of the loop, before you've incremented tmpLen.

To me, the loop invariant of "one more than the size of the string read so far" is just too hard to think about. I very much prefer to use a loop invariant where the "size" variable keeps track of the number of characters I have read, not +1 or -1 off of that. Let's see how that loop might look. (I've also cleaned up a few other things.)

size_t tmpSize = 0;
char *tmp = malloc(tmpSize+1);
if (tmp == NULL) {
    printf("malloc() failed.\n");
    exit(1);
}

for (int c = getc(file); c != EOF; c = getc(file)) {
    printf("realloc %zu\n", tmpSize+1+1);
    tmp = realloc(tmp, tmpSize+1+1);        /* +1 for c, +1 for \0 */
    if (tmp == NULL) {
        printf("realloc() failed.\n");
        exit(1);
    }
    tmp[tmpSize] = c;
    tmpSize++;
}

printf("final %zu\n", tmpSize);
tmp[tmpSize] = '\0';

There's still something fishy here -- I said I didn't like "fudge factors" like +1, and here I've got two -- but at least now the debugging printouts go

...
realloc 11
realloc 12
realloc 13
final 12

so it looks like I'm not overrunning the allocated memory any more.

To make this even better, I want to take a slightly different approach. You're not supposed to worry abut efficiency at first, but I can tell you that a loop that calls realloc to make the buffer bigger by 1, each time it reads a character, can end up being really inefficient. So let's make a few more changes:

size_t nchAllocated = 0;
size_t nchRead = 0;
char *tmp = NULL;

for (int c = getc(file); c != EOF; c = getc(file)) {
    if(nchAllocated <= nchRead) {
        nchAllocated += 10;
        printf("realloc %zu\n", nchAllocated);
        tmp = realloc(tmp, nchAllocated);
        if (tmp == NULL) {
            printf("realloc() failed.\n");
            exit(1);
        }
    }
    tmp[nchRead++] = c;
}

printf("final %zu\n", nchRead);
tmp[nchRead] = '\0';

Now there are two separate variables: nchAllocated keeps track of exactly how many characters I've allocated, and nchRead keeps track of exactly how many characters I've read. And although I've doubled the number of "counter" variables, in doing so I've simplified a lot of other things, so I think it's a net improvement.

First of all, notice that there are no +1 fudge factors any more, at all.

Second, this loop doesn't call realloc every time -- instead it allocates characters 10 at a time. And because there are separate variables for the number of characters allocated versus read, it can keep track of the fact that it may have allocated more characters than it has read so far. For this code, the debugging printouts are:

realloc 10
realloc 20
final 12

Another little improvement is that we don't have to "preallocate" the array -- there's no initial malloc call. One of our loop invariants is that nchAllocated is the number of characters allocated, and we start this out as 0, and if there are no characters allocated, then it's okay that tmp starts out as NULL. This relies on the fact that when you call realloc for the first time, with tmp equal to NULL, realloc is fine with that, and essentially acts like malloc.

But there's one question you might be asking: If I got rid of all my fudge factors, where do we arrange to allocate one extra byte to hold the terminating \0 character? It's there, but it's subtle: it's lurking in the test

if(nchAllocated <= nchRead)

The very first time through the loop, nchAllocated will be 0, and nchRead will be 0, but this test will be true, so we'll allocate our first chunk of 10 characters, and we're off and running. (If we didn't care about the \0 character, the test nchAllocated < nchRead would have sufficed.)

...But, actually, I've made a mistake! There's a subtle bug here!

What if the file being read is empty? tmp will start out NULL, and we'll never make any trips through the loop, so tmp will remain NULL, so when we assign tmp[nchRead] = 0 it'll blow up.

And actually, it's worse than that. If you trace through the logic very carefully, you'll find that any time the file size is an exact multiple of 10, not enough space gets allocated for the \0, after all.

And this indicates a significant drawback of the "allocate characters 10 at a time" scheme. The code is now harder to test, because the control flow is different for files whose size is a multiple of 10. If you never happen to test that case, you won't realize that this program has a bug in it.

The way I usually fix this is to notice that the \0 byte I have to add to terminate the string is sort of balanced by the EOF character I read that indicated the end of the file. Maybe, when I read the EOF, I can use it to remind me to allocate space for the \0. That's actually easy enough to do, and it looks like this:

int c;
while(1) {
    c = getc(file);
    if(nchAllocated <= nchRead) {
        nchAllocated += 10;
        printf("realloc %zu\n", nchAllocated);
        tmp = realloc(tmp, nchAllocated);
        if (tmp == NULL) {
            printf("realloc() failed.\n");
            exit(1);
        }
    }

    if(c == EOF)
        break;

    tmp[nchRead++] = c;
}

printf("final %zu\n", nchRead);
tmp[nchRead] = '\0';

The trick here is that we don't test for EOF until after we've checked that there's enough space in the buffer, and called realloc if necessary. It's as if we allocate space in the buffer for the EOF -- except then we use that space for the \0 instead. This is what I meant by "use it to remind me to allocate space for the \0".

Now, I have to admit that there's still a drawback here, in that the loop is now somewhat unconventional. A loop that has while(1) at the top looks like an infinite loop. This one has

if(c == EOF) break;

down in the middle of it, so it is literally a "break in the middle" loop. (This is by contrast to conventional for and while loops, which are "break at the top", or a do/while loop, which is "break at the bottom".) Personally, I find this to be a useful idiom, and I use it all the time. But some programmers, and perhaps your instructor, would frown on it, because it's "weird", it's "different", it's "unconventional". And to some extent they're right: unconventional programming is somewhat dangerous programming, and is bad if later maintenance programmers can't understand it because they don't recognize or don't understand the idioms in it. (It's sort of the programming equivalent of the English word "ain't", or a split infinitive.)

Finally, if you're still with me, I have one more point to make. (And if you are still with me, thank you. I realize this answer has gotten very long, but I hope you're learning something.)

Earlier I said that "a loop that calls realloc to make the buffer bigger by 1, each time it reads a character, can end up being really inefficient." It turns out that a loop that makes the buffer bigger by 10 isn't much better, and can still be significantly inefficient. You can do a little better by incrementing it by 50 or 100, but if you're dealing with input that might be really big (thousands of characters or more), you're usually better off increasing the buffer size by leaps and bounds, perhaps by multiplying it by some factor, rather than adding. So here's the final version of that part of the loop:

    if(nchAllocated <= nchRead) {
        if(nchAllocated == 0) nchAllocated = 10;
        else nchAllocated *= 2;
    printf("realloc %zu\n", nchAllocated);
    tmp = realloc(tmp, nchAllocated);

And even this improvement -- multiplying by 2, rather than adding something -- comes with a cost: we need an extra test, to special-case the first trip through the loop, because nchAllocated started out as 0, and 0 × 2 = 0.

Steve Summit
  • 45,437
  • 7
  • 70
  • 103
  • Thanks for the long response, realise I've got a lot to learn when it comes to memory management so will continue to study more on it. Learned a lot from the response though, thanks. – connorwmackay Feb 06 '21 at 00:19
  • 1
    @connorwmackay Glad you made it all the way through. And I hope you're not overwhelmed or dismayed by it. This reply might make it look like the problem of "reading a file into dynamically-allocated memory" is nearly impossible: But no, it's not, and indeed once you know what you have to be careful of it's perfectly straightforward. But you **do** need to know a few things to be careful of. That's pretty much the nature of C programming. – Steve Summit Feb 06 '21 at 00:35
4

Your reallocation scheme is incorrect: the array is always too short by one byte and the null terminator is written one position past the end of the string, instead of at the end of the string. This causes an extra byte to be printed, with whatever value happens to be in memory in the block returned by realloc(), which is uninitialized.

It is less confusing to use tmpLen as the length of the string read si far and allocate 2 extra bytes for the newly read character and the null terminator.

Furthermore the test c != NULL makes no sense: c is byte and NULL is a pointer. Similarly, tmp[tmpSize - 1] = (char *)c; is incorrect: you should just write

tmp[tmpSize - 1] = c;

Here is a corrected version:

char *readFile(const char *fileName) {
    FILE *file = fopen(fileName, "r");

    if (file == NULL) {
        printf("File could not be opened for reading.\n");
        return NULL;
    }

    size_t tmpLen = 0;
    char *tmp = (char *)malloc(tmpLen + 1);

    if (tmp == NULL) {
        printf("malloc() could not be called on tmp.\n");
        fclose(file);
        return NULL;
    }

    int c;
    while ((c = fgetc(file)) != EOF) {
        char *new_tmp = (char *)realloc(tmp, tmpLen + 2);
        if (new_tmp == NULL) {
            printf("realloc() failure for %zu bytes.\n", tmpLen + 2);
            free(tmp);
            fclose(file);
            return NULL;
        }
        tmp = new_tmp;
        tmp[tmpLen++] = c;
    }
    tmp[tmpLen] = '\0';

    fclose(file);
    printf("TMP: %s\n", tmp);
    return tmp;
}

It is usually better to reallocate in chunks or with a geometric size increment. Here is a simple implementation:

char *readFile(const char *fileName) {
    FILE *file = fopen(fileName, "r");

    if (file == NULL) {
        printf("File could not be opened for reading.\n");
        return NULL;
    }

    size_t tmpLen = 0;
    size_t tmpSize = 16;
    char *tmp = (char *)malloc(tmpSize);
    char *newTmp;

    if (tmp == NULL) {
        printf("malloc() could not be called on tmp.\n");
        fclose(file);
        return NULL;
    }

    int c;
    while ((c = fgetc(file)) != EOF) {
        if (tmpSize - tmpLen < 2) {
            size_t newSize = tmpSize + tmpSize / 2;
            newTmp = (char *)realloc(tmp, newSize);
            if (newTmp == NULL) {
                printf("realloc() failure for %zu bytes.\n", newSize);
                free(tmp);
                fclose(file);
                return NULL;
            }
            tmpSize = newSize;
            tmp = newTmp;
        }
        tmp[tmpLen++] = c;
    }
    tmp[tmpLen] = '\0';

    fclose(file);
    printf("TMP: %s\n", tmp);

    // try to shrink allocated block to the minimum size
    // if realloc() fails, return the current block
    // it seems impossible for this reallocation to fail
    // but the C Standard allows it.
    newTmp = (char *)realloc(tmp, tmpLen + 1);
    return newTmp ? newTmp : tmp;
}
chqrlie
  • 131,814
  • 10
  • 121
  • 189
  • Thanks for the response, definitely need to read into memory management more so will start doing that – connorwmackay Feb 06 '21 at 00:24
  • @connorwmackay: very good point. Understanding memory management and more generally memory usage is key to mastering pointers, data structures, complexity, avoiding bloat and leakage. This knowledge is a strong advantage when moving to other languages where data representation is abstracted from programmers and out of their control. – chqrlie Feb 06 '21 at 11:40