1

I am normalizing a 3D vector, and the clang-9 generated code throws a SIGFPE on the sqrtf() even though I do a test before calling it.

Note that I run with FP exceptions enabled.

                        const float lensq = dx*dx + dy*dy + dz*dz;
                        float invlen = 1.0f;
                        if (lensq > FLT_EPSILON)
                        {
                                const float leng = sqrtf(lensq);
                                invlen = 1.0f / leng;
                        }

Which clang-9 creates this assembly for:

                        const float lensq = dx*dx + dy*dy + dz*dz;
     a11:       44 0f bf f1             movswl %cx,%r14d
     a15:       44 89 f0                mov    %r14d,%eax
     a18:       41 0f af c6             imul   %r14d,%eax
     a1c:       44 0f bf ee             movswl %si,%r13d
     a20:       44 89 e9                mov    %r13d,%ecx
     a23:       41 0f af cd             imul   %r13d,%ecx
     a27:       01 c1                   add    %eax,%ecx
     a29:       44 0f bf fa             movswl %dx,%r15d
     a2d:       44 89 f8                mov    %r15d,%eax
     a30:       41 0f af c7             imul   %r15d,%eax
     a34:       c5 f8 28 c7             vmovaps %xmm7,%xmm0
                        if (lensq > FLT_EPSILON)
     a38:       01 c8                   add    %ecx,%eax
     a3a:       74 2d                   je     a69 <surface_extract_cases+0x7f9>
                        const float lensq = dx*dx + dy*dy + dz*dz;
     a3c:       c5 a2 2a c0             vcvtsi2ss %eax,%xmm11,%xmm0
     a40:       c4 c1 78 2e c0          vucomiss %xmm8,%xmm0
     a45:       72 09                   jb     a50 <surface_extract_cases+0x7e0>
     a47:       c5 fa 51 c0             vsqrtss %xmm0,%xmm0,%xmm0
     a4b:       eb 18                   jmp    a65 <surface_extract_cases+0x7f5>
     a4d:       0f 1f 00                nopl   (%rax)
     a50:       c5 f8 77                vzeroupper
     a53:       e8 00 00 00 00          callq  a58 <surface_extract_cases+0x7e8>
     a58:       c4 41 39 ef c0          vpxor  %xmm8,%xmm8,%xmm8
     a5d:       c5 fa 10 3d 00 00 00    vmovss 0x0(%rip),%xmm7        # a65 <surface_extract_cases+0x7f5>
     a64:       00
                                invlen = 1.0f / leng;

The SIGFPE shows this callstack:

#0  __kernel_standard (x=-1952511232, y=-1952511232, type=126) at ../sysdeps/ieee754/k_standard.c:584
#1  0x00007ffff5ea18e1 in __kernel_standard_f (x=<optimized out>, y=<optimized out>, type=<optimized out>) at ../sysdeps/ieee754/k_standardf.c:32
#2  0x000000000042a458 in mc_process_case_instances (caseidx=<optimized out>, numcases=31, fielddensity=0x3c171110, fieldtype=0x3c232510, isoval=<optimized out>, outputv=0x9c0d194 <scratch_surface_v+96744708>, outputn=0x12c0d194 <scratch_surface_n+96744708>, outputm=0x17e8ae3c <scratch_surface_m+32248236>, cases=<optimized out>) at ../../src/osino/src/surface.c:535
#3  surface_extract_cases (fielddensity=0x3c171110, fieldtype=0x3c232510, cases=<optimized out>, isoval=43, gridoff=<optimized out>, xlo=1120, xhi=48, ylo=32, yhi=48, zlo=16, zhi=32, outputv=0x9c09c90 <scratch_surface_v+96731136>, outputn=0x12c09c90 <scratch_surface_n+96731136>, outputm=0x17e89c90 <scratch_surface_m+32243712>, maxtria=65536, threadnr=1) at ../../src/osino/src/surface.c:745

The SIGFPE seems to be inside the __kernel_standard() call, and caused by dividing 0 by 0.

If I look up type '126' in __kernel_standard() then I see it is: sqrtf(negative) and this code is invoked:

case 126:
        /* sqrt(x<0) */
        exc.type = DOMAIN;
        exc.name = type < 100 ? "sqrt" : "sqrtf";
        if (_LIB_VERSION == _SVID_)
          exc.retval = zero;
        else
          exc.retval = zero/zero;
        if (_LIB_VERSION == _POSIX_)
          errno = EDOM;
        else if (!matherr(&exc)) {
          /* if (_LIB_VERSION == _SVID_) {
            (void) WRITE2("sqrt: DOMAIN error\n", 19);
              } */
          errno = EDOM;
        }
        break;

...causing this callstack in the debugger:

Thread 8 "noisetuner" received signal SIGFPE, Arithmetic exception.
[Switching to Thread 0x7fffccbdd700 (LWP 5838)]
─── Assembly ─────────────────────────────────────────────────────────────────────────────────────────
0x00007ffff5ea038c __kernel_standard+8236 nopl   0x0(%rax)
0x00007ffff5ea0390 __kernel_standard+8240 pxor   %xmm0,%xmm0
0x00007ffff5ea0394 __kernel_standard+8244 cmp    $0x2,%eax
0x00007ffff5ea0397 __kernel_standard+8247 divsd  %xmm0,%xmm0
0x00007ffff5ea039b __kernel_standard+8251 movsd  %xmm0,0x30(%rsp)
0x00007ffff5ea03a1 __kernel_standard+8257 jne    0x7ffff5e9f13b <__kernel_standard+3547>
0x00007ffff5ea03a7 __kernel_standard+8263 mov    0x38bc0a(%rip),%rax        # 0x7ffff622bfb8
─── Expressions ──────────────────────────────────────────────────────────────────────────────────────
─── History ──────────────────────────────────────────────────────────────────────────────────────────
─── Memory ───────────────────────────────────────────────────────────────────────────────────────────
─── Registers ────────────────────────────────────────────────────────────────────────────────────────
   rax 0x0000000000000002            rbx 0x00007ffff622c148            rcx 0x0000000058e94a00        
   rdx 0x00007ffff5f5a3c7            rsi 0x0000000000008d90            rdi 0x000000000000007e        
   rbp 0x00007fffccbdca30            rsp 0x00007fffccbdc640             r8 0x0000000000000006        
    r9 0x0000000000000005            r10 0x0000000000000038            r11 0x00007ffff5ea1460        
   r12 0x0000000000000001            r13 0x00000000ffff8d90            r14 0x00000000ffff9db0        
   r15 0x00000000ffff8e10            rip 0x00007ffff5ea0397         eflags [ PF ZF IF RF ]           
    cs 0x00000033                     ss 0x0000002b                     ds 0x00000000                
    es 0x00000000                     fs 0x00000000                     gs 0x00000000                
─── Source ───────────────────────────────────────────────────────────────────────────────────────────
Cannot display "/build/glibc-2ORdQG/glibc-2.27/math/../sysdeps/ieee754/k_standard.c" ([Errno 2] No such file or directory: '/build/glibc-2ORdQG/glibc-2.27/math/../sysdeps/ieee754/k_standard.c')
─── Stack ────────────────────────────────────────────────────────────────────────────────────────────
[0] from 0x00007ffff5ea0397 in __kernel_standard+8247 at ../sysdeps/ieee754/k_standard.c:584
arg x = -1952511232
arg y = -1952511232
arg type = 126
[1] from 0x00007ffff5ea18e1 in __kernel_standard_f+17 at ../sysdeps/ieee754/k_standardf.c:32
arg x = <optimized out>
arg y = <optimized out>
arg type = <optimized out>
[+]

The issue occurs with clang-9 -O3, but does not occur with clang-9 -O0 argument.

The full command line I use:

clang-9 -D_GNU_SOURCE -DAPPVER=1.00 -DUSECOREPROFILE -DNOUSESTEAM -DXWIN -DLANDSCAPE -DBLKMAG=6 -USTORECHARS -USTOREFP16 -DSTORESHORTS -I/home/bram/src/stb/ -I../GBase/src -Isrc -I../../src/dutch-blunt/src -I../../src/osino/src -I../../src/osino/src/../externals/enoki/include -I/usr/local/cuda/include -I/home/bram/src/zstd/lib -I../../src/ThreadTracer -IModels.game/geom `/opt/ode-master/bin/ode-config  --cflags` `/usr/bin/sdl2-config --cflags` -g -Wall -pedantic -Wno-missing-braces -mavx2 -mfma -mf16c -MMD -MP -O3 -DDEBUG   -c -o ../../src/osino/src/surface.o ../../src/osino/src/surface.c

Why is clang computing the sqrt of a negative number? Is it trying to do speculative execution, and blend the results based on the lensq > FLT_EPSILON test? Is that even valid?

Steve Friedl
  • 3,929
  • 1
  • 23
  • 30
Bram
  • 7,440
  • 3
  • 52
  • 94
  • 1
    Are dx, dy and dz integers? Are you sure they all have legitimate values? Could squaring them cause integer overflow? – rici Aug 14 '20 at 00:21
  • 1
    @rici They are short integers, and indeed overflow, causing lensq to be negative. But the test should stop it from being executed! – Bram Aug 14 '20 at 00:32
  • Show the assembly code generated by the compiler. – Eric Postpischil Aug 14 '20 at 00:35
  • 1
    "But the test should stop it from being executed!" --> No. It optimized UB. Since `lensq` "can't" be negative, calling `sqrtf(lensq)` should not be a problem. Get rid of UB, then proceed. – chux - Reinstate Monica Aug 14 '20 at 00:35
  • @Bram: That is invalid reasoning. The fact that the C standard does not define the behavior upon integer overflow means both that the compiler can assume that, if any of `dx`, `dy`, or `dz` is known to be non-zero, that `lensq > FLT_EPSILON` will be true and that the compiler can generate instructions that produce a negative value and pass it to `sqrtf`. – Eric Postpischil Aug 14 '20 at 00:38
  • 1
    If “short integers” means `short`, how do they overflow? They will be promoted to `int`. If `short` is 16-bit and `int` is 32, there can be overflow only if all three are near 2^15. That is mathematically possible, but it suggests an unusual case, provoking curiosity about the enveloping code. I appreciate this may be embedded in some code so that it is thorny to extract, but you may need to provide an [mre]. – Eric Postpischil Aug 14 '20 at 00:44
  • @EricPostpischil Good observation. Note it is "only if all three are near 2^15" _in magnitude_. Or 2 of 3 are `USHRT_MIN`. – chux - Reinstate Monica Aug 14 '20 at 00:48

1 Answers1

3

throws a domain error, even thoug I guard against negative numbers

But if (lensq > FLT_EPSILON) is too late as earlier dx*dx + dy*dy + dz*dz caused int overflow. "and indeed overflow, causing lensq to be negative" - which is undefined behavior UB.

Compiler can take advantage that sqrtf(lensq) can always work because it can assume dx*dx + dy*dy + dz*dz >= 0 and so lensq >= 0.0f.

Get rid of UB.

// const float lensq = dx*dx + dy*dy + dz*dz;
const float lensq = 1LL*dx*dx + 1LL*dy*dy + 1LL*dz*dz;
chux - Reinstate Monica
  • 143,097
  • 13
  • 135
  • 256
  • I am confused. Yes, I made the mistake of the overflow. Which caused a negative lensq (its actual value is -1952511232) but does that mean that the compiler can't deal with integer overflow? As soon as I hit overflow, I am in UB territory? I was not aware of that. So the compiler assumes integers never overflow? – Bram Aug 14 '20 at 00:49
  • 2
    @Bram _signed_ integer overflow is UB. Yes code is in UB-land. Compiler can assume overflow does not occur and `sqrt(lensq)` is _always_ OK in this code. – chux - Reinstate Monica Aug 14 '20 at 00:51
  • The assembly is weird, though. The `je a69` is a test just for zero, suggesting the compiler ruled out a negative result and we have undefined behavior. But the `vucomiss %xmm8,%xmm0` is probably comparing to `FLT_EPSILON` (the contents of `%xmm8` are not shown in the assembly clip), and then the following branch avoids the square root. – Eric Postpischil Aug 14 '20 at 00:58
  • @Bram: Did you test this solution before accepting it? – Eric Postpischil Aug 14 '20 at 00:58
  • @EricPostpischil tested, and fixed! Strangely, I first tried casting the shorts to ints, then multiply, but it still SIGFPE'd. But with the 1LL it works fine. Thanks. – Bram Aug 14 '20 at 01:16
  • 3
    @bram: Casting the shorts to ints does nothing because integer promotion has already done that. The overflow was from the addition of three large `int`s. (The original numbers would have to be in the vicinity of 27,000 or more to cause that overflow, but that's certainly not inconceivable.) – rici Aug 14 '20 at 01:39
  • 2
    @Bram If `int` is 32-bit and interested in some micro-optimization: `0u + (dx*dx) + (dy*dy) + (dz*dz)` to avoid the wide `*`, but still do a slightly wider `+`. – chux - Reinstate Monica Aug 14 '20 at 01:53
  • 1
    though a clever compiler would note that wide multiplication is not needed... – Antti Haapala -- Слава Україні Aug 14 '20 at 04:34