-3

I have a server application that crashes with SIGSEGV during startup. The code involved in the crash uses pthread_cond_wait, pthread_setcancelstate, pthread_cond_timedwait, pthread_cleanup_push and pthread_cleanup_pop.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Mutex passed to pthread_cond_wait() together with `cond`; it is
   initialized at run time inside thr(), not statically.  */
pthread_mutex_t mutex;
/* Condition variable the thread running thr() blocks on; it is also
   initialized at run time inside thr().  */
pthread_cond_t cond;

/*
 * Report a failed call and bail out of the current function.
 *
 * RET must be a modifiable int lvalue holding the call's return value
 * (an errno-style code, 0 meaning success); STR names the call for the
 * diagnostic.  On failure the message is printed to stdout, RET is
 * overwritten with 1 and control jumps to the label `out`, which the
 * calling function has to provide.
 *
 * The body is wrapped in do { } while (0) rather than a GNU statement
 * expression so that it is standard C and behaves as a single statement.
 */
#define CHECK_RETURN_VAL_OR_FAIL(ret,str) \
  do \
    { \
      if ((ret) != 0) \
        { \
          printf ("%s failed: %s\n", (str), strerror (ret)); \
          (ret) = 1; \
          goto out; \
        } \
    } \
  while (0)


/*
 * Cleanup handler registered through pthread_cleanup_push().
 *
 * ARG is interpreted as a pointer to the pthread_mutex_t to release.
 * A progress line is written to stdout before and after the unlock.
 */
void
clean (void *arg)
{
  pthread_mutex_t *held = arg;

  puts ("clean: Unlocking mutex...");
  (void) pthread_mutex_unlock (held);
  puts ("clean: Mutex unlocked...");
}

/*
 * Thread entry point.
 *
 * Initializes the file-scope `mutex` (with the PTHREAD_PRIO_INHERIT
 * protocol) and `cond`, locks the mutex and then waits on the condition
 * variable in a loop.  The loop only ends if pthread_cond_wait() reports
 * an error; otherwise the thread stays blocked until it is cancelled, in
 * which case clean() releases the mutex.
 *
 * ARG is unused.
 *
 * Returns (void *) 0 if it ever returns without a failure, or (void *) 1
 * after printing a diagnostic when any pthread call fails.
 *
 * Because this function initializes the shared mutex and condition
 * variable itself, it must not be run by more than one thread.
 */
void *
thr (void *arg)
{
  int ret = 0;
  int protocol_err = 0;
  int init_err = 0;
  int wait_err = 0;
  pthread_mutexattr_t mutexAttr;

  (void) arg;

  ret = pthread_mutexattr_init (&mutexAttr);
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_mutexattr_init");

  /* Run both steps that need the attribute object, then destroy it before
     any error exit so it is released on every path.  */
  protocol_err = pthread_mutexattr_setprotocol (&mutexAttr,
                                                PTHREAD_PRIO_INHERIT);
  if (protocol_err == 0)
    init_err = pthread_mutex_init (&mutex, &mutexAttr);
  (void) pthread_mutexattr_destroy (&mutexAttr);

  ret = protocol_err;
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_mutexattr_setprotocol");

  ret = init_err;
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_mutex_init");

  ret = pthread_cond_init (&cond, NULL);
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_cond_init");

  puts ("th: Init done, entering wait...");

  /* Lock before registering the handler, so clean() can only ever run
     while this thread actually owns the mutex.  */
  ret = pthread_mutex_lock (&mutex);
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_mutex_lock");

  /* pthread_cleanup_push/pop form one lexical block.  Leaving it with
     goto or return has undefined results, so errors inside it only break
     out of the loop and fall through to the matching pop.  */
  pthread_cleanup_push (clean, (void *) &mutex);
  for (;;)
    {
      wait_err = pthread_cond_wait (&cond, &mutex);
      if (wait_err != 0)
        {
          printf ("%s failed: %s\n", "pthread_cond_wait",
                  strerror (wait_err));
          break;
        }
    }
  pthread_cleanup_pop (1);	/* Runs clean(), which unlocks the mutex.  */

  /* Only reachable when pthread_cond_wait() failed.  */
  ret = 1;

out:
  /* Go through intptr_t so the int -> pointer conversion is well-sized
     on targets where pointers are wider than int.  */
  return (void *) (intptr_t) ret;
}

/*
 * Starts one thread running thr(), lets it run for two seconds, cancels
 * it and joins it.
 *
 * Returns 0 when the thread was created, cancelled and joined and its
 * exit status is PTHREAD_CANCELED; returns 1 if any pthread call fails or
 * if the thread ended in some other way (for example because thr() hit an
 * initialization error and returned on its own).
 */
int
main (void)
{
  pthread_t thread;
  int ret = 0;
  void *thr_ret = NULL;

  /* thr() ignores its argument, so nothing is passed.  */
  ret = pthread_create (&thread, NULL, thr, NULL);
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_create");

  puts ("main: Thread created, waiting a bit...");
  sleep (2);

  puts ("main: Cancelling thread...");
  ret = pthread_cancel (thread);
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_cancel");

  puts ("main: Joining th...");
  /* Collect the exit status through pthread_join(); that is the only
     way the thread's result reaches this function.  */
  ret = pthread_join (thread, &thr_ret);
  CHECK_RETURN_VAL_OR_FAIL (ret, "pthread_join");

  /* A thread that was successfully cancelled exits with PTHREAD_CANCELED;
     anything else means it returned by itself.  */
  if (thr_ret != PTHREAD_CANCELED)
    {
      puts ("main: Thread ended without being cancelled");
      ret = 1;
      goto out;
    }

  puts ("main: Joined thread, done!");

out:
  return ret;
}

I tried gdb and it gave me this back trace

(gdb) bt
#0  0xf7318387 in ?? () from /lib/libgcc_s.so.1
#1  0xf7318820 in _Unwind_Resume () from /lib/libgcc_s.so.1
#2  0xf7dfbb52 in _Unwind_Resume () from /lib/libpthread.so.0
#3  0xf7df6d95 in __condvar_w_cleanup () from /lib/libpthread.so.0
#4  0x08048b01 in thr ()
#5  0xf7df2338 in start_thread () from /lib/libpthread.so.0
#6  0xf7d30aee in clone () from /lib/libc.so.6

I also ran strace -f on this process and its children; it turns out that the SIGSEGV is hit in the child.

    [pid 18192] open("./tls/i686/sse2/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("./tls/i686/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("./tls/sse2/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("./tls/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("./i686/sse2/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("./i686/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("./sse2/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("./libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
    [pid 18192] open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
    [pid 18192] fstat64(3, {st_mode=S_IFREG|0644, st_size=94009, ...}) = 0
    [pid 18192] mmap2(NULL, 94009, PROT_READ, MAP_PRIVATE, 3, 0) = 0xf75de000
    [pid 18192] close(3)                    = 0
    [pid 18192] open("/lib/libgcc_s.so.1", O_RDONLY|O_CLOEXEC) = 3
    [pid 18192] read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\240%\0\0004\0\0\0"..., 512) = 512
    [pid 18192] fstat64(3, {st_mode=S_IFREG|0755, st_size=116284, ...}) = 0
    [pid 18192] mmap2(NULL, 119400, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xf6ab2000
    [pid 18192] mmap2(0xf6ace000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b000) = 0xf6ace000
    [pid 18192] close(3)                    = 0
    [pid 18192] mprotect(0xf6ace000, 4096, PROT_READ) = 0
    [pid 18192] munmap(0xf75de000, 94009)   = 0
    [pid 18192] tgkill(18192, 18193, SIGRTMIN) = 0
    [pid 18192] futex(0xf7310ba8, FUTEX_WAIT, 18193, NULL <unfinished ...>
    [pid 18193] <... futex resumed> )       = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
    [pid 18193] --- SIGRTMIN {si_signo=SIGRTMIN, si_code=SI_TKILL, si_pid=18192, si_uid=3535917} ---
    [pid 18193] futex(0xf6acf10c, FUTEX_WAKE_PRIVATE, 2147483647) = 0
    [pid 18193] futex(0x804b0a4, FUTEX_WAKE_PRIVATE, 2147483647) = 0
    [pid 18193] --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} ---
    [pid 18193] +++ killed by SIGSEGV (core dumped) +++
    +++ killed by SIGSEGV (core dumped) +++

Operating System : SUSE 12.4
Kernel : 4.12.14-95.83-default 
gcc : 4.8.5 
glibc : 2.22

I'm still not able to understand why this application is hitting SIGSEGV.

Compilation :

gcc sapl.c -m32 -pthread

Works for

gcc sapl.c -pthread

Any ideas ?

Edit 1: The original source code was different and used more threads, and the strace output corresponds to that version. When user sonicwave asked for a minimal reproducible example, I changed the code and the gdb trace but forgot to update the strace output, which is why there was an anomaly.

Also, in this code, for reproducing this issue, only one thread is enough.

Again, this issue occurs only with the specified gcc and glibc versions. If I use an older compiler such as 4.3.7, the issue does not reproduce.

  • Try putting together a minimal example that illustrates the issue (see https://stackoverflow.com/help/minimal-reproducible-example). As it is, there's simply too much missing to be able to really say what's wrong. For instance, how does the `comnev` struct look? How is it initialized? – sonicwave Jan 18 '22 at 08:07
  • @sonicwave I have updated code with minimal reproducible example now. – Digvijaysingh Gour Jan 18 '22 at 10:03
  • 1
    The first rule of `pthread_cancel()` is "do not call `pthread_cancel()`". – John Bollinger Jan 18 '22 at 23:07
  • The code presented shows a program in which the main thread creates only one additional thread, but the `strace` output appears to correspond to a program with at least 11 threads. Does the code presented actually reproduce the issue for you? It does not reproduce the issue for me. – John Bollinger Jan 19 '22 at 16:36
  • Why does each thread [re]initialize the mutex? That in and of itself is a disaster waiting to happen. – Andrew Henle Jan 19 '22 at 19:37

1 Answer

0

Function thr() initializes the mutex and condition variable itself, so these are initialized separately by every thread running that function. When there is more than one such thread, that results in re-initializations of already-initialized mutex and cv objects (if we suppose that the init functions implement appropriate synchronization) or data races (if we do not assume synchronization), which produces undefined behavior either way. Although no further explanation of the effects is required, in this case you might think about what could happen when one thread modifies the mutex out from under another thread that is already using it.

That is just as true when the program is compiled as a 64-bit executable as when it is compiled as a 32-bit executable, but there is no particular reason for the UB to manifest the same way in those two cases. Indeed, it is not guaranteed even to manifest the same way in multiple runs of the very same executable -- that's part of what "undefined" means.

At the time I write this, the example program presented in the question launches only one thread running thr(), so the above would not explain the reported errors. On the other hand, the program in fact does not manifest any errors for me, whether built as a 32-bit or a 64-bit executable. Additionally, the strace output presented appears to correspond to a program with at least 11 threads, so I am inclined to guess that the program in which the errors were actually observed indeed did have multiple threads behaving analogously to the presented thr() function.

The best solution is to initialize the mutex and condition variable before any of the threads that want to use it start. If that's not possible then you must ensure that only one thread performs the initialization, and that it does so before any of the other threads try to use those objects, in the sense of formal "happens before" relationships. That would probably require introducing an additional synchronization object to use in support of the initialization, so it's unclear whether there is actually anything to gain that way.

John Bollinger
  • 160,171
  • 8
  • 81
  • 157