Why does printf make a segmentation fault if there is no \n?

Question

To learn multithreading, I am using code provided by my teacher. It works for my classmates, but not for me.

The error happens here:

/*
 * Excerpt of the coroutine body in which the error is reported.
 *
 * Prints a few messages, flushing stdout between some of them, then
 * terminates the whole process with exit(1) at the end of the first
 * iteration, so the while(1) loop never runs a second time.
 * Note that the first message says "coroutine1" although the function is
 * named fonction2.
 */
void fonction2(){
    
    int compteur1 = 1; // iteration counter, printed below
    while(1){
        fflush(stdout);
        printf("je suis la coroutine1\n");
        fflush(stdout);
        printf("blabla"); // variant WITHOUT a trailing newline
        fflush(stdout);
        printf("test\n");
        printf("test2\n");
        printf("compteur = %d\n", compteur1);
        fflush(stdout);
        compteur1 = compteur1 +1;
        
        //yield(); // disabled; original note: "calls coroutine2"
        exit(1); // ends the process after a single pass through the loop
    }
    
    

}

I'm not just looking for how to make my code work as intended, I'd like to understand what went wrong.

I compile using gcc 11.3.0 with the flags -static -g -O0 in an Ubuntu 22.04 VM.

Here is the entirety of my code:

#include<stdint.h>
#include<stdio.h>
#include<stdlib.h>
// Size in bytes of each coroutine stack.
#define STACK_SIZE 4096

// One statically allocated, 4096-byte-aligned stack per coroutine.
// main() passes pile1 to the scheduler coroutine and pile2..pile4 to
// thread2..thread4.
char pile1[STACK_SIZE] __attribute__((aligned(4096)));
char pile2[STACK_SIZE] __attribute__((aligned(4096)));
char pile3[STACK_SIZE] __attribute__((aligned(4096)));
char pile4[STACK_SIZE] __attribute__((aligned(4096)));

// Handle of a suspended coroutine: the value init_coroutine() returns and
// that the assembly routines load into / store from the stack pointer.
typedef void * coroutine_t;


// Implemented in the accompanying assembly file.
void enter_coroutine(coroutine_t coroutine);
void switch_coroutine(coroutine_t *ct1, coroutine_t ct2);
// Saved context of the scheduler coroutine (fonction_ordo).
coroutine_t coroutine_ordo;
// NOTE(review): coroutine4, coroutine2 and coroutine3 are not referenced by
// any of the code shown here.
coroutine_t coroutine4;
coroutine_t coroutine2;
coroutine_t coroutine3;

// A schedulable thread: its saved coroutine context plus a status flag.
struct thread{
    coroutine_t coroutine;
    int statut; // 0 = ready, 1 = blocked (the original comment wrote the letter "O", presumably meaning 0); not read by the code shown

};

struct thread thread2;
struct thread thread3;
struct thread thread4;

//struct liste_thread{
  //  thread* premier;
//};

// Thread currently selected by the scheduler; set in fonction_ordo() and
// read by yield() to know where to save the running context.
struct thread* thread_courant ;


// Acts like a series of pushes that prepares the frame enter_coroutine will pop.
//
// Writes seven pointer-sized slots at the top of the given stack: the entry
// address followed by zeroed values for rbp, rbx, r12, r13, r14 and r15, in
// the order in which enter_coroutine pops them (r15 first, then ret).
//
//   stack_begin  lowest address of the stack area
//   stack_size   size of the stack area in bytes
//   initial_pc   function the coroutine starts executing
//
// Returns the address of the lowest slot written, to be used as the
// coroutine's saved stack pointer.
//
// NOTE(review): seven 8-byte slots are written, so after enter_coroutine has
// popped six registers and executed ret, the stack pointer equals stack_end.
// With a 16-byte-aligned stack_end this leaves %rsp itself, rather than
// %rsp+8, on a 16-byte boundary when initial_pc starts, which is not what the
// System V x86-64 ABI expects at function entry.
coroutine_t init_coroutine(void *stack_begin, unsigned int stack_size, void (*initial_pc)(void)){
char *stack_end = ((char *)stack_begin) + stack_size;
void* *ptr = (void**) stack_end;
ptr--;
*ptr = initial_pc; // program counter, consumed by the final ret
ptr--;
*ptr = 0; //rbp
ptr--;
*ptr = 0; // rbx
ptr--;
*ptr = 0; //r12
ptr--;
*ptr = 0; //r13
ptr--;
*ptr = 0; //r14
ptr--;
*ptr = 0; //r15

return ptr;
}

// Suspends the calling thread and resumes the scheduler: the current context
// is saved into thread_courant->coroutine and execution continues in the
// context saved in coroutine_ordo.
void yield(void){
    switch_coroutine(&thread_courant->coroutine, coroutine_ordo);
}




// Scheduler coroutine: runs thread2, thread3 and thread4 in turn, forever.
// Before each switch it records the target in thread_courant so that yield()
// saves the thread's context in the right place. Each switch_coroutine call
// saves the scheduler's own context in coroutine_ordo and resumes the
// thread; control comes back here when that thread calls yield().
void fonction_ordo(){
    while(1){
        thread_courant = &thread2;
        switch_coroutine(&coroutine_ordo, thread2.coroutine);
        thread_courant = &thread3;
        switch_coroutine(&coroutine_ordo, thread3.coroutine);
        thread_courant = &thread4;
        switch_coroutine(&coroutine_ordo, thread4.coroutine);
    }
    

}

/*
 * Entry function of thread2 (registered in main with pile2).
 *
 * Prints a few messages, flushing stdout between some of them, then
 * terminates the whole process with exit(1) at the end of the first
 * iteration, so the while(1) loop never runs a second time and control is
 * never handed back to the scheduler.
 * Note that the first message says "coroutine1" although the function is
 * named fonction2.
 */
void fonction2(){
    
    int compteur1 = 1; // iteration counter, printed below
    while(1){
        fflush(stdout);
        printf("je suis la coroutine1\n");
        fflush(stdout);
        printf("blabla\n"); // variant WITH a trailing newline
        fflush(stdout);
        printf("test\n");
        printf("test2\n");
        printf("compteur = %d\n", compteur1);
        fflush(stdout);
        compteur1 = compteur1 +1;
        
        //yield(); // disabled; original note: "calls coroutine2"
        exit(1); // ends the process after a single pass through the loop
    }
    
    

}

// Entry function of thread3 (registered in main with pile3).
// Each time it is resumed it prints its banner and its counter, increments
// the counter, then yields.
void fonction3(){
    int compteur2 = 1; // number of times this coroutine has run, starting at 1
    
    while(1){

        printf("je suis la coroutine3\n");
        printf("compteur = %d\n",compteur2);
        ++compteur2;
        yield(); // hands control back to the scheduler (coroutine_ordo)
    }
    

}


// Entry function of thread4 (registered in main with pile4).
// Each time it is resumed it prints its banner and its counter, increments
// the counter, then yields.
void fonction4(){
   
    int compteur2 = 1; // number of times this coroutine has run, starting at 1
    while(1){
        printf("je suis la coroutine4\n");
        printf("compteur = %d\n",compteur2);
        ++compteur2;
        yield(); // hands control back to the scheduler (coroutine_ordo)
    }
    

}

// Program entry point: prepares the scheduler coroutine and the three thread
// coroutines on their static stacks, then jumps into the scheduler.
int main(){
    setbuf(stdout, NULL); // make stdout unbuffered
    
    coroutine_ordo = init_coroutine(pile1, STACK_SIZE, &fonction_ordo); // the scheduler
    printf("coroutine1 init\n");
    thread2.coroutine = init_coroutine(pile2, STACK_SIZE, &fonction2);
    printf("coroutine2 init\n");
    thread3.coroutine = init_coroutine(pile3, STACK_SIZE, &fonction3);
    printf("coroutine3 init\n");
    thread4.coroutine = init_coroutine(pile4, STACK_SIZE, &fonction4);
    printf("coroutine4 init\n");

    
    // Loads the scheduler's prepared context; the stack pointer is replaced,
    // and fonction_ordo loops forever, so the return below is not expected
    // to be reached.
    enter_coroutine(coroutine_ordo);
    
    return 0;
}

This is the assembly file that I also pass to the compiler; it manipulates the stack pointer directly to switch from one thread to another:

.global enter_coroutine, switch_coroutine

/*
 * void switch_coroutine(coroutine_t *ct1, coroutine_t ct2)
 *
 * Saves the current context and resumes another one.
 * Note: the code below reads the first argument from %rdi and the second
 * from %rsi (the original comment named ecx and edx, which does not match
 * the registers actually used).
 * Pushes rbp, rbx, r12, r13, r14 and r15 onto the current stack, stores the
 * resulting stack pointer into *ct1, then falls through into
 * enter_coroutine with ct2 as its argument.
 */
switch_coroutine:
        push %rbp
        push %rbx
        push %r12
        push %r13
        push %r14
        push %r15               
        mov %rsp,(%rdi)         /* Store the stack pointer into the coroutine pointer *ct1. */
        mov %rsi,%rdi           /* Fall through into enter_coroutine, but first move the second argument into %rdi. */
        
/*
 * void enter_coroutine(coroutine_t coroutine)
 *
 * Switches to the stack designated by the argument in %rdi, restores the six
 * registers in the reverse order of the pushes above, and returns to the
 * address stored just above them.
 */
enter_coroutine:
        mov %rdi,%rsp         /* Load the stack pointer from the coroutine pointer. */
        pop %r15
        pop %r14
        pop %r13
        pop %r12
        pop %rbx
        pop %rbp
        ret                   /* Pop the program counter. */

I tried tinkering with the code to understand the issue. There is always a segmentation fault somewhere in fonction2, but when "blabla" has no trailing "\n" the program displays (after other output from the rest of the code): je suis la coroutine1 Erreur de segmentation (core dumped)

However, if I add "\n" right after "blabla" then it displays (other stuff) je suis la coroutine1 blabla test test2 Erreur de segmentation (core dumped)

I thought using fflush would guarantee the same behavior regardless of what argument I give printf? Why does it not display the test prints in the first case?

GCC will compile a simple `printf` call (one that has a string literal argument with no conversion specifiers) differently depending on whether the string ends with a newline or not. `printf("blabla\n")` may be compiled to the equivalent of `puts("blabla");`. So you may be seeing the side effect of calling a simpler function `puts` compared to the more complex `printf` when you add the `\n` to the end of the string. You can compile with `-fno-builtin-printf` to disable the conversion of `printf` to `puts`. — Ian Abbott, Dec 05 '22 at 11:57
It may be a stack alignment issue. The stack pointer should be aligned on a 16-byte boundary when a function is called. The return address will be stored below the 16-byte boundary, so when the program counter is at the first instruction of the function, rsp will be an odd multiple of 8 bytes. The function may reduce rsp by another odd multiple of 8 bytes ready for the next function call. You probably need to change `init_coroutine`, `switch_coroutine` and `enter_coroutine` to add an additional 8 bytes to the stack. — Ian Abbott, Dec 05 '22 at 12:49
Re, "...works for my classmates, but not for me." So, what is different about the environment in which you run the code? You told us what compiler version you use. What compiler version, and what command line arguments do _they_ use? You told us what OS you use. How about them? I am not expert in such things, but I wonder if there is some subtle way in which the `switch_coroutine` machinery is almost-but-not-quite compatible with your compiler, but fully (or at least more) compatible with the compiler(s) that your classmates use. — Solomon Slow, Dec 05 '22 at 15:03

Ian Abbott · Answer 1 · 2022-12-05T17:06:42.053

The only thing needed to get the code to work is a small change to init_coroutine to add a dummy 8-byte value to the base of the coroutine's stack:

/*
 * Acts like a series of pushes that prepares the frame enter_coroutine pops.
 *
 * Builds the initial saved context of a coroutine at the top of its stack:
 * one dummy padding slot, the entry address, then zeroed values for rbp,
 * rbx, r12, r13, r14 and r15, laid out in the order in which
 * enter_coroutine pops them (r15 first, then ret).
 *
 *   stack_begin  lowest address of the stack area
 *   stack_size   size of the stack area in bytes; must leave room for the
 *                eight slots written here plus up to 15 bytes of alignment
 *   initial_pc   function the coroutine starts executing
 *
 * Returns the address of the lowest slot written, to be used as the
 * coroutine's saved stack pointer.
 */
coroutine_t init_coroutine(void *stack_begin, unsigned int stack_size, void (*initial_pc)(void)){
    char *stack_end = ((char *)stack_begin) + stack_size;

    /*
     * Round the top of the stack down to a 16-byte boundary so that the
     * alignment established below holds for any stack_begin / stack_size,
     * not only for callers that happen to pass an aligned area. This is a
     * no-op when stack_end is already aligned.
     */
    stack_end -= (uintptr_t)stack_end % 16;

    void* *ptr = (void**) stack_end;
    /* vvvvvvvvvvv */
    ptr--;
    *ptr = NULL; // dummy value: puts the entry address on a 16-byte boundary
    /* ^^^^^^^^^^^ */
    ptr--;
    /*
     * Program counter, consumed by the ret in enter_coroutine. After that
     * ret, %rsp+8 is 16-byte aligned, as the x86-64 System V ABI requires
     * at function entry. The explicit cast documents the function-pointer
     * to object-pointer conversion.
     */
    *ptr = (void *)initial_pc;
    ptr--;
    *ptr = NULL; //rbp
    ptr--;
    *ptr = NULL; // rbx
    ptr--;
    *ptr = NULL; //r12
    ptr--;
    *ptr = NULL; //r13
    ptr--;
    *ptr = NULL; //r14
    ptr--;
    *ptr = NULL; //r15

    return ptr;
}

As long as stack_end is on a 16-byte boundary (by suitable choice of the stack_begin and stack_size parameters), this will keep %RSP+8 aligned to a 16-byte boundary at the start of each function, as required by the System V ABI for x86-64.

The initial_pc value is stored on a 16-byte boundary. After enter_coroutine() executes the ret instruction for the first time, this initial_pc value will be in %RIP, and %RSP+8 will be at a 16-byte boundary as required by the ABI. (The coroutine function will reduce %RSP by an odd multiple of 8 bytes if it calls another function, so that %RSP is maintained on a 16-byte boundary before it calls the function.¹)

―
¹ Calling a function with arguments of type __m256 or __m512 on the stack requires %RSP to be aligned to a 32- or 64-byte boundary before the call instruction.

The reason why the original code goes wrong is that the GLIBC library functions assume that the stack pointer has been properly aligned when they are called. If that is not the case, bad things could happen.

The reason why adding a \n to the end of a string literal (with no % characters) passed to printf can make a difference is that GCC may convert printf("blabla\n"); to puts("blabla"); and a misaligned stack pointer in puts is probably not as bad as a misaligned stack pointer in printf. (This conversion from printf to puts can be disabled with the -fno-builtin-printf GCC compiler option.)

Why does printf make a segmentation fault if there is no \n?

1 Answer