0

I'm working on writing a small, proof of concept fiber library in C using the usual makecontext/swapcontext routines, however that's been giving me some trouble (My platform is OSX 10.9 Mavericks, using clang-503.0.40).

Here's the data structures I'm dealing with:

/// Lifecycle states a fiber can be in
typedef enum {
    /// Not started yet: the fiber is waiting for its first run
    FIBER_PENDING = 0,

    /// The fiber is part-way through executing
    FIBER_EXECUTING = 1,

    /// The fiber has run to completion
    FIBER_FINISHED = 2,

    /// The fiber is in the middle of handing control back (yielding)
    FIBER_YIELDING = 3
} fiber_state;

/// A single fiber: its stack memory, its lifecycle state and its execution context
typedef struct {
    /// Memory used as the fiber's stack (allocated in fiber_init, released in fiber_destroy)
    char *stack;
    /// Where the fiber currently is in its lifecycle
    fiber_state state;
    /// Context that is switched to when the fiber runs and saved into when it yields
    ucontext_t context;
} fiber;

Here's the mini-library (three functions: fiber_init, fiber_run, and fiber_yield):

#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <assert.h>
#include <stdio.h>

// Write message 'x' to stderr, prefixed with the source file name and line
// number of the call site
#define LOG(x) \
    fprintf(stderr, "%s:%d |-> %s\n", __FILE__, __LINE__, (x))

// The fiber currently executing in this thread (or NULL if none is executing).
// fiber_run points it at the fiber being run and restores the previous value
// before returning; fiber_yield reads it to find the fiber that is yielding.
// (TODO: make this TLS)
static fiber *current_fiber = NULL;

/// Prepare fiber 'f' so that a later fiber_run(f) starts executing 'fiber_func'.
///
/// Zeroes '*f', marks it FIBER_PENDING, allocates a SIGSTKSZ-byte stack and
/// builds the fiber's context on top of that stack. The stack is owned by 'f'
/// and must be released with fiber_destroy.
///
/// The function has no way to report failure to its caller, so it aborts the
/// process if the stack cannot be allocated or the context cannot be captured.
void fiber_init(fiber *f, void(* fiber_func)()) {
    // start from a clean slate
    memset(f, 0, sizeof *f);

    f->state = FIBER_PENDING;
    f->stack = malloc(SIGSTKSZ);
    if (f->stack == NULL) {
        perror("fiber_init: malloc");
        abort();
    }

    ucontext_t *f_context = &f->context;

    // makecontext only modifies a context previously filled in by getcontext
    if (getcontext(f_context) != 0) {
        perror("fiber_init: getcontext");
        abort();
    }

    // run the fiber on its own, freshly allocated stack
    f_context->uc_stack.ss_sp    = f->stack;
    f_context->uc_stack.ss_size  = SIGSTKSZ;
    f_context->uc_stack.ss_flags = 0;

    // NOTE(review): uc_link has not been assigned yet at this point; fiber_run
    // assigns it before every switch. POSIX asks for uc_link to be initialized
    // before makecontext(), so an implementation that captures it at
    // makecontext() time would ignore the later assignment -- TODO confirm on
    // every target platform.
    makecontext(f_context, fiber_func, 0);
}

/// Release the resources owned by 'f' (the stack allocated by fiber_init).
///
/// The stack pointer is cleared afterwards, so destroying the same fiber a
/// second time is a harmless no-op (free(NULL)) instead of a double free.
void fiber_destroy(fiber *f) {
    free(f->stack);
    f->stack = NULL;
}

/// Start or resume fiber 'f'; returns once the fiber yields or its function returns.
///
/// While the fiber is switched in, current_fiber points at 'f'; the previous
/// value of current_fiber is restored before this function returns.
///
/// On return, f->state is FIBER_EXECUTING if the fiber came back with its state
/// set to FIBER_YIELDING (i.e. through fiber_yield), and FIBER_FINISHED otherwise.
///
/// Passing NULL or a fiber that is already FIBER_FINISHED is a programming
/// error and trips an assertion.
void fiber_run(fiber *f) {
    assert(f && "fiber_run called with a NULL fiber");
    assert(f->state != FIBER_FINISHED && "Cannot run a finished fiber");

    // context to switch back to when yielding, or when the fiber returns.
    // It lives in this stack frame, so uc_link is only meaningful for the
    // duration of this call and is re-pointed on every run.
    ucontext_t return_context;
    f->context.uc_link = &return_context;

    // save the fiber being swapped away from (or NULL)
    fiber *old_fiber = current_fiber;
    current_fiber = f;

    LOG("Swapping into fiber context");

    // swapcontext stores the current context in return_context by itself,
    // so no separate getcontext call is needed beforehand
    int status = swapcontext(&return_context, &f->context);
    assert(status == 0 && "Failed to swap to fiber context");

    LOG("Back to parent context from swap");

    if(f->state == FIBER_YIELDING) {
        // the fiber handed control back voluntarily and can be resumed later
        f->state = FIBER_EXECUTING;
        LOG("Fiber yielded");
    }
    else {
        // we got here through uc_link: the fiber function returned
        LOG("Fiber done executing; marking as finished");
        f->state = FIBER_FINISHED;
    }

    // restore old fiber
    current_fiber = old_fiber;
}

/// from witin a fiber, yield control to the caller's context
void fiber_yield() {
    assert(current_fiber && "No fiber is currently running!");

    current_fiber->state = FIBER_YIELDING;

    LOG("Yielding back to caller context");
    int status = swapcontext(
        &(current_fiber->context),
        current_fiber->context.uc_link);
    assert(status == 0 && "Failed to swap to parent context");
    LOG("Swapped back into fiber context (post-yield)");
}

/// query fiber state
int fiber_is_pending(const fiber *const f) {
    return f->state == FIBER_PENDING;
}
int fiber_is_finished(const fiber *const f) {
    return f->state == FIBER_FINISHED;
}
int fiber_is_executing(const fiber *const f) {
    return f->state == FIBER_EXECUTING;
}

It seems though, that calling fiber_yield() within a fiber does not properly swap the context with the caller's context (a reference of which is stored in the fiber context's uc_link, see current_fiber->context.uc_link within fiber_yield)

A trace of running this program:

/// Example fiber body: logs a message, yields once, then logs again after
/// being resumed and returns.
void my_func() {
    LOG(" ------- I'm the fiber function! yielding");
    // hand control back to whoever called fiber_run; the line below runs on resume
    fiber_yield();
    LOG(" ------- End of my_func");
}

/// Test driver: runs a single fiber executing my_func until it reports that it
/// has finished, then releases it.
int main() {
    fiber f;
    fiber_init(&f, my_func);

    // keep switching into the fiber until it is marked as finished
    for (;;) {
        if (fiber_is_finished(&f)) {
            break;
        }

        fiber_run(&f);
        LOG("Back in main run loop");
    }

    fiber_destroy(&f);
    return 0;
}

yields the output:

fibers.c:70 |-> Swapping into fiber context
test_harness.c:5 |->  ------- I'm the fiber function! yielding
fibers.c:99 |-> Yielding back to caller context
Segmentation fault: 11

I've read that OSX has stack alignment restrictions (to 16 byte boundaries), but I'm using malloc to allocate the stacks, which returns a block that is aligned to a 16 byte boundary (or so I've read). That said, it seems that rearranging the order of declarations can sometimes make the segfault go away, but it's very sporadic and hard to reproduce.

Inspecting fiber_yield right before calling swapcontext shows that the current_fiber->context has a very large stack size; way larger than it should be. Perhaps this is a sign of corruption:

(lldb) p current_fiber->context
(ucontext_t) $3 = {
  uc_onstack = 0
  uc_sigmask = 0
  uc_stack = (ss_sp = 0x00007fff5fbff720, ss_size = 140734799804176, ss_flags = 0)
  uc_link = 0x00007fff5fbff780
  uc_mcsize = 0
  uc_mcontext = 0x00007fff5fbff828
}
(lldb) p *(current_fiber->context.uc_link)
(__darwin_ucontext) $4 = {
  uc_onstack = -541067328
  uc_sigmask = 0
  uc_stack = (ss_sp = 0x00007fff5fbff700, ss_size = 8388608, ss_flags = 0)
  uc_link = 0x0000000000000000
  uc_mcsize = 140734799804400
  uc_mcontext = 0x00007fff5fbff7b8
}

Any clue what might be going on? Thanks!

dymk
  • 887
  • 2
  • 10
  • 21
  • First, give more details about the crash. What address was it trying to access? In which stack or struct is that? What's with `_swapcontext()` (with underscore) in `fiber_yield()`? You shouldn't need to call `getcontext()` just before `swapcontext()` in `fiber_run()`. Although the `sigaltstack()` docs explicitly say that you don't need to figure the direction of stack growth when setting up a `stack_t` for that function, could it be that you do when configuring one for the `ucontext` functions? That is, maybe `ss_sp` should be the top of the stack. Maybe `SIGSTKSZ` isn't big enough. – Ken Thomases May 25 '14 at 07:20
  • Oops, `_swapcontext` was a typo; it was `swapcontext`. I have underscored versions that suppress deprecation warnings on OSX, which take up quite a bit of console space when compiling (they fall through directly to the real functions). lldb indicates that the failing call is the swapcontext in fiber_yield, I need to do a bit more research on what addresses it's trying to access though. – dymk May 25 '14 at 07:29
  • I'll update this with more information tomorrow, but in the meantime I've added some lldb logs that might indicate some memory corruption. The stack size should be fine; I've tried allocating four times as much and no change. – dymk May 25 '14 at 07:38
  • That bogus `ss_size` is `0x7fff5fbff710` which is suspiciously like the rest of the pointers. So, yes, some sort of corruption has happened. – Ken Thomases May 25 '14 at 07:41

2 Answers

1

I was able to reproduce the same problem using your code, compiled with Apple's gcc-4.2.1, on OS X 10.6.8.

I noticed that you don't include ucontext.h. Compiling with -Wall causes the compiler to warn about implicit declarations of the ucontext functions.

Adding #include <ucontext.h> caused an error:

In file included from foo.c:6:
/usr/include/ucontext.h:42:2: error: #error ucontext routines are deprecated, and require _XOPEN_SOURCE to be defined

Adding #define _XOPEN_SOURCE before all includes fixed that and also the behavior of the program. Evidently, that macro changes the layout of the relevant structures to match what those functions expect and require.

I'm not sure what to tell you about the fact that those functions are deprecated. I know of no supported replacements.

Ken Thomases
  • 88,520
  • 7
  • 116
  • 154
  • I'd like to stress the "Adding #define _XOPEN_SOURCE before *all includes* fixed that" part: I had a `#define _XOPEN_SOURCE` before ucontext.h, but yep, it turns out it needed to be before all the includes in the program. That fixed it, thanks! – dymk May 25 '14 at 20:43
0

Wow! I've been scratching my head the past couple days trying to figure out why the pointer in uc_link looked corrupted. Especially because a similar program had been working flawlessly earlier.

Indeed it was because I had refactored a header! #define _XOPEN_SOURCE at the top of all my files did the trick.

I guess the inclusion of that define changes the layout of critical structs throughout the codebase.