13

Here is the piece of code in which the segmentation fault occurs (perror is not being called):

/* Allocate room for one task_t. perror() only prints a diagnostic when
   malloc returns NULL; it does not stop execution. */
job = malloc(sizeof(task_t));
if(job == NULL)
    perror("malloc");

To be more precise, gdb says that the segfault happens inside a __int_malloc call, which is a sub-routine call made by malloc.

Since the malloc function is called in parallel with other threads, initially I thought that it could be the problem. I was using version 2.19 of glibc.

The data structures:

/* Forward declaration so task_t's callback type can mention the per-thread
   wrapper before struct rv_thread itself is defined. */
typedef struct rv_thread thread_wrapper_t;

/* Completion flag for one task, plus the synchronisation objects used to
   wait on it. */
typedef struct future
{
  pthread_cond_t wait;    /* broadcast by complete(); waited on in wait_future() */
  pthread_mutex_t mutex;  /* held while `completed` is written in complete() and tested in wait_future() */
  long completed;         /* 0 after create_future(); set to 1 by complete() */
} future_t;

/* One unit of work placed on the pool queue. */
typedef struct task
{
  future_t * f;   /* future passed to complete() once `fun` has returned */
  void * data;    /* opaque argument forwarded to `fun` */
  /* Work function: called with the wrapper of the executing thread and `data`. */
  void *
  (*fun)(thread_wrapper_t *, void *);
} task_t;

/* The thread pool: currently just the queue that tasks are submitted to. */
typedef struct
{
  queue_t * queue;  /* obtained from create_queue(); fed by submit_job(), drained by try_get_new_task() */
} pool_worker_t;

/* Handle returned by async(); mywait() uses it to reach the task's future. */
typedef struct
{
  task_t * t;  /* the task that was submitted for this handle */
} sfuture_t;

/* Per-thread context handed to every task function. */
struct rv_thread
{
  pool_worker_t * pool;  /* pool whose queue this thread pulls tasks from */
};

Now the future implementation:

/*
 * Allocate a future in the "not completed" state and initialise its mutex
 * and condition variable with default attributes.
 *
 * Returns the new future, or NULL when the allocation or either pthread
 * initialisation fails (a diagnostic is written to stderr). Nothing is
 * leaked on the failure paths. The caller owns the returned object.
 */
future_t *
create_future()
{
  future_t * new_f = malloc(sizeof *new_f);
  if(new_f == NULL)
    {
      /* perror() only reports; bail out instead of dereferencing NULL. */
      perror("malloc");
      return NULL;
    }
  new_f->completed = 0;

  /* pthread_*_init report failure through their return value, not errno. */
  int rc = pthread_mutex_init(&(new_f->mutex), NULL);
  if(rc != 0)
    {
      fprintf(stderr, "pthread_mutex_init failed: %d\n", rc);
      free(new_f);
      return NULL;
    }
  rc = pthread_cond_init(&(new_f->wait), NULL);
  if(rc != 0)
    {
      fprintf(stderr, "pthread_cond_init failed: %d\n", rc);
      pthread_mutex_destroy(&(new_f->mutex));
      free(new_f);
      return NULL;
    }
  return new_f;
}

/*
 * Block the calling thread until `f` has been marked completed.
 * Always returns 0.
 */
int
wait_future(future_t * f)
{
  pthread_mutex_t * lock = &f->mutex;
  pthread_cond_t * done = &f->wait;

  pthread_mutex_lock(lock);
  /* The flag is re-tested after every wake-up, so a wake-up that arrives
     before the flag is set simply goes back to waiting. */
  for (;;)
    {
      if (f->completed)
        break;
      pthread_cond_wait(done, lock);
    }
  pthread_mutex_unlock(lock);

  return 0;
}

/*
 * Mark `f` as completed and wake every thread blocked in wait_future().
 * The flag is written with the mutex held; the broadcast is issued after
 * the mutex has been released.
 */
void
complete(future_t * f)
{
  pthread_mutex_t * lock = &f->mutex;

  pthread_mutex_lock(lock);
  f->completed = 1;
  pthread_mutex_unlock(lock);

  pthread_cond_broadcast(&f->wait);
}

The thread pool itself:

/*
 * Create a pool with its task queue and start the worker thread(s), each
 * running work() with its own thread_wrapper_t.
 *
 * threads: requested number of workers (currently overridden, see below).
 *
 * Returns the pool, or NULL when the pool or its queue cannot be allocated
 * or when not a single worker could be started. If only some workers fail
 * to start, the pool is returned with the workers that did start.
 */
pool_worker_t *
create_work_pool(int threads)
{
  pool_worker_t * new_p = malloc(sizeof *new_p);
  if(new_p == NULL)
    {
      /* perror() only reports; bail out instead of dereferencing NULL. */
      perror("malloc");
      return NULL;
    }
  /* NOTE(review): the argument is overwritten here, so exactly one worker is
     started whatever the caller asked for. Kept as in the original; confirm
     whether this is a leftover from debugging. */
  threads = 1;
  new_p->queue = create_queue();
  if(new_p->queue == NULL)
    {
      free(new_p);
      return NULL;
    }
  int started = 0;
  int i;
  for (i = 0; i < threads; i++){
    thread_wrapper_t * w = malloc(sizeof *w);
    if(w == NULL)
      {
        perror("malloc");
        break;
      }
    w->pool = new_p;
    pthread_t n;
    /* pthread_create reports failure through its return value, not errno. */
    int rc = pthread_create(&n, NULL, work, w);
    if(rc != 0)
      {
        fprintf(stderr, "pthread_create failed: %d\n", rc);
        free(w);  /* no thread took ownership of the wrapper */
        break;
      }
    started++;
  }
  if(started == 0)
    {
      /* NOTE(review): the queue is not released here because no matching
         destroy routine is visible in this file. */
      free(new_p);
      return NULL;
    }
  return new_p;
}

/*
 * Make one non-blocking attempt to take a task from the queue of the pool
 * that `thr` belongs to.
 *
 * Returns whatever try_dequeue stored in the local pointer. The macro takes
 * the address of its second argument itself, which is why the variable and
 * not `&variable` is passed. The pointer starts out NULL.
 * NOTE(review): presumably it is still NULL when the queue was empty;
 * confirm against the liblfds dequeue contract.
 */
task_t *
try_get_new_task(thread_wrapper_t * thr)
{
  queue_t * source = thr->pool->queue;
  task_t * next = NULL;

  try_dequeue(source, next);
  return next;
}

/* Put task `t` on the queue of pool `p` via the enqueue macro. */
void
submit_job(pool_worker_t * p, task_t * t)
{
  queue_t * target = p->queue;

  enqueue(target, t);
}

/*
 * Thread entry point for a pool worker. `data` is the thread_wrapper_t that
 * create_work_pool() allocated for this thread.
 *
 * Loops forever: spins until a task is available, runs it, then completes
 * the task's future. The trailing pthread_exit() is never reached.
 */
void *
work(void * data)
{
  thread_wrapper_t * self = data;

  for (;;)
    {
      task_t * job;

      /* Busy-wait: poll the queue until a task shows up. */
      do
        {
          job = try_get_new_task(self);
        }
      while (job == NULL);

      /* Fetch the future before running the task, as the original does. */
      future_t * done = job->f;
      job->fun(self, job->data);
      complete(done);
    }
  pthread_exit(NULL);
}

And finally the task.c:

/* Convenience constructor: asks create_work_pool() for a pool of 8 threads
   and returns its result unchanged. */
pool_worker_t *
create_tpool()
{
  pool_worker_t * pool = create_work_pool(8);

  return pool;
}

/*
 * Package `fun(…, data)` as a task, submit it to pool `p`, and return a
 * handle that mywait() can wait on.
 *
 * p:    pool whose queue receives the task.
 * thr:  wrapper of the calling thread (not used by this function).
 * fun:  work function; invoked with the executing thread's wrapper and `data`.
 * data: opaque argument forwarded to `fun`.
 *
 * Returns the handle, or NULL if any allocation fails. mywait() returns
 * immediately for a NULL handle. Every allocation is made before the task is
 * enqueued, so a failure never leaves a half-built task visible to a worker.
 */
sfuture_t *
async(pool_worker_t * p, thread_wrapper_t * thr, void *
(*fun)(thread_wrapper_t *, void *), void * data)
{
  (void) thr;

  task_t * job = malloc(sizeof *job);
  if(job == NULL)
    {
      /* perror() only reports; bail out instead of dereferencing NULL. */
      perror("malloc");
      return NULL;
    }
  sfuture_t * new_t = malloc(sizeof *new_t);
  if(new_t == NULL)
    {
      perror("malloc");
      free(job);
      return NULL;
    }
  job->data = data;
  job->fun = fun;
  job->f = create_future();
  if(job->f == NULL)
    {
      free(new_t);
      free(job);
      return NULL;
    }
  new_t->t = job;
  /* Publish the task last: once it is on the queue a worker may run it. */
  submit_job(p, job);
  return (new_t);
}

void
mywait(thread_wrapper_t * thr, sfuture_t * sf)
{
  if (sf == NULL)
    return;
  if (thr != NULL)
    {
      while (!sf->t->f->completed)
        {
          task_t * t_n = try_get_new_task(thr);
          if (t_n != NULL)
            {
          future_t * f = t_n->f;
          (*(t_n->fun))(thr,t_n->data);
          complete(f);
            }
        }
      return;
    }
  wait_future(sf->t->f);
  return ;
}

The queue is the lfds lock-free queue.

/*
 * enqueue(q, t): add element `t` to the lfds queue held in `q->lq`.
 * Tries lfds611_queue_enqueue first and falls back to
 * lfds611_queue_guaranteed_enqueue when that returns 0.
 *
 * Wrapped in do { } while (0) so that `enqueue(q, t);` is a single statement
 * and is safe inside an unbraced if/else. Both arguments are parenthesised.
 * `q` and `t` may be evaluated twice, so pass side-effect-free expressions.
 * NOTE(review): the result of the guaranteed enqueue is still ignored.
 */
#define enqueue(q,t) do {                                  \
    if(!lfds611_queue_enqueue((q)->lq, (t)))               \
      {                                                     \
        lfds611_queue_guaranteed_enqueue((q)->lq, (t));    \
      }                                                     \
  } while (0)

/*
 * try_dequeue(q, t): attempt to take one element from the lfds queue held in
 * `q->lq`, storing it in the pointer variable `t`.
 *
 * `t` must be an lvalue: the macro takes its address itself, so callers pass
 * the variable, not `&variable`. Wrapped in do { } while (0) so that
 * `try_dequeue(q, t);` is a single statement and is safe inside an unbraced
 * if/else.
 */
#define try_dequeue(q,t) do {                          \
    lfds611_queue_dequeue((q)->lq, &(t));              \
  } while (0)

The problem happens whenever the number of calls to async is very high.

Valgrind output:

Process terminating with default action of signal 11 (SIGSEGV)
==12022==  Bad permissions for mapped region at address 0x5AF9FF8
==12022==    at 0x4C28737: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
guilhermemtr
  • 528
  • 1
  • 4
  • 15
  • 1
    Is it possible something else messes up the bookkeeping for `malloc` ? – cnicutar Feb 26 '14 at 19:30
  • 1
    It sounds like memory gets corrupted somewhere else. – imreal Feb 26 '14 at 19:31
  • It's the only explanation, i'll post the whole code. (It's really a minimal model, with memory leaks, etc). – guilhermemtr Feb 26 '14 at 19:32
  • 1
    "if needed i can put here the full source code" - yes, this is probably what you SHOULD do, because the piece of code above by itself cannot imply the source of the segfault. – barak manos Feb 26 '14 at 19:33
  • @barakmanos ok, done (took some time putting the 4 spaces everywhere, but then i found out the ctrl+k shortcut :) ) – guilhermemtr Feb 26 '14 at 19:38
  • @barakmanos is this code enough? Because i can put all the code, including the example im running, and the queue i use. The queue i use is from lfds lib, and the example is a simple fibonaci that spawns two tasks and waits for them). – guilhermemtr Feb 26 '14 at 19:45
  • I don't know what `try_deque` is but it seems you should be sending it a pointer of a pointer `try_dequeue(thr->pool->queue, &t);` like this. – imreal Feb 26 '14 at 19:45
  • the try_dequeue returns a task. the code i have put works, for small computations, but for big ones is where the problems start – guilhermemtr Feb 26 '14 at 19:46
  • It doesn't return anything, it is probably meant to point `t` to a task. It can't do that if you send it a copy of the pointer. – imreal Feb 26 '14 at 19:48
  • @Nick I've updated the post, in order to answer you. I know that it's weird the way im currently doing it. – guilhermemtr Feb 26 '14 at 19:51
  • @Nick (It's in the macro where i put the &t, althought i shouldn't) – guilhermemtr Feb 26 '14 at 19:56
  • @guilhermemtr Alright, I see. – imreal Feb 26 '14 at 19:58
  • Others have mentioned heap corruption - that is, write-past-end errors, etc. You might consider using the [glibc facilities](http://www.gnu.org/software/libc/manual/html_node/Unconstrained-Allocation.html#Unconstrained-Allocation) such as heap-checking with `mcheck / mprobe`. – Brett Hale Feb 26 '14 at 20:01
  • 1
    Any chance of running the program under valgrind? If memory corruption is going on, valgrind might be able to show you where and when. – Jeremy Friesner Feb 26 '14 at 20:14
  • @JeremyFriesner Sorry, i added the output in the questoin. – guilhermemtr Feb 26 '14 at 20:18
  • Could this be a bug in malloc? – guilhermemtr Feb 26 '14 at 20:26
  • @cnicutar I think not, but it's more probable than a bug in malloc – guilhermemtr Feb 26 '14 at 20:36
  • Sorry just to be sure we are testing the same code.. can you comment out enqueue(p->queue, t); try_dequeue(thr->pool->queue, t); e new_p->queue = create_queue() ? Does it still crash? – Jekyll Feb 26 '14 at 20:40
  • @Jekyll i can't coment it out without changing the goal of this program. But if you want, i can send you the full source, including the main, the Makefile, and so on, so you can fully test it. (Putting it all here is just way too much, and i would probably introduce much confusion i whoever else reads this post). What the enqueue and try_dequeue do is, they put a task into the global queue of the pool and de try_dequeue gets a task from the global queue of the pool. – guilhermemtr Feb 26 '14 at 20:44
  • @guilhermemtr I will past here code for you before. Try this. – Jekyll Feb 26 '14 at 20:46
  • @guilhermemtr I pasted the code down here, try this first, if it doesn't help you I can help you debugging your code. – Jekyll Feb 26 '14 at 20:52
  • @Jekyll Thank you, however that doesn't help me much, since the problem only happens after a lot of calls to the malloc operation – guilhermemtr Feb 26 '14 at 20:55
  • I saw similar issue during process termination when main thread is already finished but some worker threads are still continue execution - this happening because heap is already destroyed – AlexT Feb 27 '14 at 10:58

2 Answers2

20

I've figured out what the problem is: a stack overflow.

First, let me explain why the stack overflow occurs inside malloc (which is probably why you are reading this). When my program ran, the stack kept growing each time it started executing (recursively) another task (because of the way I had programmed it). Each time that happened, I also had to allocate a new task using malloc. However, malloc makes further sub-routine calls of its own, which grow the stack even more than a simple call to execute another task does. So even without malloc I would eventually have hit a stack overflow; but because malloc was there, the stack overflowed inside malloc, before it could overflow on the next recursive call. The illustration below shows what was happening:

Initial stack state:

-------------------------
| recursive call n - 3  |
-------------------------
| recursive call n - 2  |
-------------------------
| recursive call n - 1  |
-------------------------
|        garbage        |
-------------------------
|        garbage        | <- If the stack passes this point, the stack overflows.
-------------------------

stack during malloc call:

-------------------------
| recursive call n - 3  |
-------------------------
| recursive call n - 2  |
-------------------------
| recursive call n - 1  |
-------------------------
|        malloc         |
-------------------------
|     __int_malloc      | <- If the stack passes this point, the stack overflows.
-------------------------

Then the stack shrank again, and my code entered a new recursive call:

-------------------------
| recursive call n - 3  |
-------------------------
| recursive call n - 2  |
-------------------------
| recursive call n - 1  |
-------------------------
| recursive call n      |
-------------------------
|        garbage        | <- If the stack passes this point, the stack overflows.
-------------------------

Then, it invoked malloc again inside this new recursive call. However, this time it overflowed:

-------------------------
| recursive call n - 3  |
-------------------------
| recursive call n - 2  |
-------------------------
| recursive call n - 1  |
-------------------------
| recursive call n      |
-------------------------
|        malloc         | <- If the stack passes this point, the stack overflows.
-------------------------
|     __int_malloc      | <- This is when the stack overflow occurs.
-------------------------

[The rest of the answer is more focused around why I had this problem in my code in particular.]

Usually, when computing Fibonacci recursively for a certain number n, for example, the stack size grows linearly with that number. However, in this case I'm creating tasks, using a queue to store them, and dequeuing a (fib) task for execution. If you draw this on paper, you'll see that the number of tasks grows exponentially with n, rather than linearly (also note that if I had used a stack to store the tasks as they were created, the number of tasks allocated as well as the stack size would only grow linearly with n). So what happens is that the stack grows exponentially with n, leading to a stack overflow... Now comes the part about why this overflow occurs inside the call to malloc. Basically, as I explained above, the stack overflow happened inside the malloc call because that was where the stack was deepest. The stack was already almost full, and since malloc calls other functions internally, the stack grows more there than it does for the calls to mywait and fib alone.

Thank you all! If it weren't for your help, I wouldn't have been able to figure it out!

guilhermemtr
  • 528
  • 1
  • 4
  • 15
  • 1
    That was what I was guessing as I couldn't find any problem. But to ensure this is the problem can you dump the 'top' output on a file and check how the memory usage increase? +1 for both answer and question. – Jekyll Feb 27 '14 at 12:03
  • when i removed all the threads, valgrind said this can be a stack overflow, althought it's unlikely. I've set the ulimit bigger and i could then run bigger fib nums. when i duplicate the stack size, i can only add 1 to the previous number. But i'll do as you said, just to confirm – guilhermemtr Feb 27 '14 at 12:11
16

A SIGSEGV (segmentation fault) firing inside malloc is usually caused by heap corruption. Heap corruption does not cause a segmentation fault at the moment it happens, so you only see the fault later, when malloc tries to access the corrupted memory. The problem is that the code that corrupts the heap could be anywhere, even far away from where malloc is called. It is usually the next-block pointer inside malloc's bookkeeping that your heap corruption changes to an invalid address, so that when you call malloc an invalid pointer gets dereferenced and you get a segmentation fault.

I think you may try portions of your code isolated from the rest of the program to reduce the visibility of the bug.

Moreover I see that you never free the memory here and there can be a possible memory leak.

In order to check a memory leak you can run the top command top -b -n 1 and check:

RPRVT - resident private address space size
RSHRD - resident shared address space size
RSIZE - resident memory size
VPRVT - private address space size
VSIZE - total memory size
Jekyll
  • 1,434
  • 11
  • 13
  • 1
    The problem is that the segmentation fault only happens after a lot of calls. – guilhermemtr Feb 26 '14 at 20:52
  • did you see if there is a memory leakage? I didn't see any free here.... do you free memory sometime? – Jekyll Feb 26 '14 at 20:53
  • I will run into a problem if I don't free memory sooner or later... as this program only allocates here... – Jekyll Feb 26 '14 at 20:54
  • This is just a minimal model, in the original version it doesn't have memory leaks. in this one i just wanted to check the why of the error, by eliminating as much code as possible. So in this version i only want to find the malloc problem. – guilhermemtr Feb 26 '14 at 20:56
  • If you guess it's a malloc problem you should just allocate and free (to avoid going out of heap) memory tousands of time (malloc doesn't know your structure), so you don't need the full program, but this is very unlikely to happen @guilhermemtr – Jekyll Feb 26 '14 at 21:04
  • I am going to remove the code and post a real answer before someone starts with -1 :) – Jekyll Feb 26 '14 at 21:12
  • Yes, i agree that's very unlikely, and that's why i have put it here. It's most likely a buffer overflow, but i don't have a clue of where. – guilhermemtr Feb 26 '14 at 21:16
  • buffer overflow is a nasty beast – Jekyll Feb 26 '14 at 21:17
  • I would vote up your answer since it helps, however i don't have 15 points yet :( (and i still don't know where the error is :( ) But thanks for the help you are giving :) – guilhermemtr Feb 26 '14 at 21:18
  • I think exactly the same. – guilhermemtr Feb 26 '14 at 21:18
  • Try to change compiler option it may crash quicker... try with -O3 @guilhermemtr – Jekyll Feb 26 '14 at 21:19
  • I've tried that, and now the malloc fails inside the lfds611_queue_guaranteed_enqueue (But exactly the same error) :( – guilhermemtr Feb 26 '14 at 21:26
  • I've edited the question with electric fence + mcheck output. (I already had tested with electric fence, but i did not knew the mcheck call) – guilhermemtr Feb 26 '14 at 21:32
  • it's almost just this plus a main (+- 40 lines of code (simple code)) and plus the headers – guilhermemtr Feb 26 '14 at 21:34
  • send on, I will have a look later. I think that all your test prove a memory corruption somewhere. – Jekyll Feb 26 '14 at 21:36