Crazy behaviour during execution

Question

I have been doing some inline-asm with gcc. Everything is ALMOST working, up to some behaviour that is just baffling me. I am evaluating a rational polynomial, but need to use 80-bit constants. The generated code seems to be perfect, but on execution, one of the 80-bit coefficients, when loaded into the fpu, is loaded as 0, even though in memory the byte values are not zero (and I think it is a valid 80-bit real, as the exact same constant loads fine when run with code generated by masm). Here is the output from a gdb session:

(gdb) disassemble
Dump of assembler code for function poly4(double):
0x00402d7c <+0>:     push   %ebp
0x00402d7d <+1>:     mov    %esp,%ebp
0x00402d7f <+3>:     sub    $0x8,%esp
0x00402d82 <+6>:     mov    0x8(%ebp),%eax
0x00402d85 <+9>:     mov    %eax,-0x8(%ebp)
0x00402d88 <+12>:    mov    0xc(%ebp),%eax
0x00402d8b <+15>:    mov    %eax,-0x4(%ebp)
0x00402d8e <+18>:    fld1
0x00402d90 <+20>:    fldl   -0x8(%ebp)
0x00402d93 <+23>:    fmul   %st(0),%st
0x00402d95 <+25>:    fdivrp %st,%st(1)
0x00402d97 <+27>:    fldt   0x40470e
0x00402d9d <+33>:    fadd   %st(1),%st
0x00402d9f <+35>:    fmul   %st(1),%st
0x00402da1 <+37>:    fldt   0x404704
0x00402da7 <+43>:    faddp  %st,%st(1)
0x00402da9 <+45>:    fmul   %st(1),%st
0x00402dab <+47>:    fldt   0x4046fa
0x00402db1 <+53>:    faddp  %st,%st(1)
0x00402db3 <+55>:    fmul   %st(1),%st
0x00402db5 <+57>:    fldt   0x4046f0
0x00402dbb <+63>:    faddp  %st,%st(1)
0x00402dbd <+65>:    fmul   %st(1),%st
=>  0x00402dbf <+67>:    fldt   0x4046e6
0x00402dc5 <+73>:    faddp  %st,%st(1)
...snip....
End of assembler dump.
(gdb) info registers st0 st1 st2 st3 st4 st5
st0            2.7412088761933612e-006  (raw 0x3fecb7f59c22579f9f60)
st1            0.00071574511983807409   (raw 0x3ff4bba0d78724c01468)
st2            <invalid float value>    (raw 0x00077c81cc3b0002021e)
st3            <invalid float value>    (raw 0x00020098007c00f8f0c0)
st4            0        (raw 0x000013af076300003654)
st5            <invalid float value>    (raw 0x0762000000000002021e)
(gdb) x/5xh 0x4046e6
0x4046e6 <_ZL11s_NORMAL_q5>:    0x8996  0xa5d6  0x3d00  0x990a  0x3ff1
(gdb) stepi
0x00402dc5      1577            );
(gdb) info registers st0 st1 st2 st3 st4 st5
st0            0        (raw 0x00000000000000000000)
st1            2.7412088761933612e-006  (raw 0x3fecb7f59c22579f9f60)
st2            0.00071574511983807409   (raw 0x3ff4bba0d78724c01468)
st3            <invalid float value>    (raw 0x00077c81cc3b0002021e)
st4            <invalid float value>    (raw 0x00020098007c00f8f0c0)
st5            0        (raw 0x000013af076300003654)
(gdb) disassemble
Dump of assembler code for function poly4(double):
0x00402d7c <+0>:     push   %ebp
0x00402d7d <+1>:     mov    %esp,%ebp
0x00402d7f <+3>:     sub    $0x8,%esp
0x00402d82 <+6>:     mov    0x8(%ebp),%eax
0x00402d85 <+9>:     mov    %eax,-0x8(%ebp)
0x00402d88 <+12>:    mov    0xc(%ebp),%eax
0x00402d8b <+15>:    mov    %eax,-0x4(%ebp)
0x00402d8e <+18>:    fld1
0x00402d90 <+20>:    fldl   -0x8(%ebp)
0x00402d93 <+23>:    fmul   %st(0),%st
0x00402d95 <+25>:    fdivrp %st,%st(1)
0x00402d97 <+27>:    fldt   0x40470e
0x00402d9d <+33>:    fadd   %st(1),%st
0x00402d9f <+35>:    fmul   %st(1),%st
0x00402da1 <+37>:    fldt   0x404704
0x00402da7 <+43>:    faddp  %st,%st(1)
0x00402da9 <+45>:    fmul   %st(1),%st
0x00402dab <+47>:    fldt   0x4046fa
0x00402db1 <+53>:    faddp  %st,%st(1)
0x00402db3 <+55>:    fmul   %st(1),%st
0x00402db5 <+57>:    fldt   0x4046f0
0x00402dbb <+63>:    faddp  %st,%st(1)
0x00402dbd <+65>:    fmul   %st(1),%st
0x00402dbf <+67>:    fldt   0x4046e6
=>  0x00402dc5 <+73>:    faddp  %st,%st(1)
...snip...
End of assembler dump.
(gdb)

So note: before the stepi, we are about to execute the "fldt 0x4046e6", and a dump of memory at 0x4046e6 clearly shows it is not zero. Yet the "fldt 0x4046e6" results in zero being loaded into st0. All the previous fldt instructions worked fine; the constants are good (the identical code in masm with the same constants works flawlessly). For those interested, here is the source for the constants:

Double80 s_NORMAL_p5 = { 0xE0, 0x14, 0x24, 0x6E, 0x43, 0x6C, 0x37, 0xF4, 0xEF, 0x3F}; // 0x3FEFF4376C436E2414E0, 2.9112874951168791857936318084879e-5
Double80 s_NORMAL_p4 = { 0x74, 0x5B, 0x7C, 0x72, 0xE2, 0x9F, 0x55, 0xBA, 0xF5, 0x3F}; // 0x3FF5BA559FE2727C5B74, 0.0014216191932278934659235163911273
Double80 s_NORMAL_p3 = { 0x3B, 0xD1, 0x83, 0xB3, 0xE8, 0xC1, 0x26, 0xB6, 0xF9, 0x3F}; // 0x3FF9B626C1E8B383D13B, 0.022235277870649807464320442391811
Double80 s_NORMAL_p2 = { 0x4B, 0xA2, 0x6C, 0x9F, 0x32, 0x73, 0x75, 0x82, 0xFC, 0x3F}; // 0x3FFC827573329F6CA24B, 0.1274011611602473638801278160334
Double80 s_NORMAL_p1 = { 0x49, 0xDC, 0x10, 0x22, 0x5C, 0x81, 0x14, 0xDD, 0xFC, 0x3F}; // 0x3FFCDD14815C2210DC49, 0.2158985340579569904693315913281
Double80 s_NORMAL_p0 = { 0x3E, 0xCE, 0xA6, 0x2B, 0xB9, 0x83, 0x04, 0xBD, 0xF9, 0x3F}; // 0x3FF9BD0483B92BA6CE3E, 0.023073441764940173030448369674463

Double80 s_NORMAL_q5 = { 0x96, 0x89, 0xD6, 0xA5, 0x00, 0x3D, 0x0A, 0x99, 0xF1, 0x3F}; // 0x3FF1990A3D00A5D68996, 0.0000729751555083966204509375
Double80 s_NORMAL_q4 = { 0xF8, 0x37, 0xEF, 0xEB, 0x8B, 0x14, 0xE2, 0xF7, 0xF6, 0x3F}; // 0x3FF6F7E2148BEBEF37F8, 0.0037823963320275824448625
Double80 s_NORMAL_q3 = { 0x35, 0xC5, 0x61, 0x91, 0xF0, 0xC9, 0x24, 0x87, 0xFB, 0x3F}; // 0x3FFB8724C9F09161C535, 0.065988137868928551531
Double80 s_NORMAL_q2 = { 0xCC, 0x68, 0x85, 0xAF, 0x42, 0xEB, 0xBC, 0xEF, 0xFD, 0x3F}; // 0x3FFDEFBCEB42AF8568CC, 0.4682382124808651180225
Double80 s_NORMAL_q1 = { 0xF3, 0xDB, 0x06, 0x40, 0x84, 0xA2, 0x62, 0xA4, 0xFF, 0x3F}; // 0x3FFFA462A2844006DBF3, 1.28426009614491121036

and here is the source for the poly4 function:

inline long double poly4(double y)
{
    __asm__(
    "\n\t" "fld1"                       //1
    "\n\t" "fldl %[y]"                      //y, 1
    "\n\t" "fmul %%st(0), %%st(0)"          //y^2, 1
    "\n\t" "fdivp %%st(0), %%st(1)"     //1/y^2=xsq
    //den
    "\n\t" "fldt  %[s_NORMAL_q1]"       //q1, xsq
    "\n\t" "fadd %%st(1), %%st(0)"          //q1+xsq, xsq
    "\n\t" "fmul %%st(1), %%st(0)"          //(q1+xsq)*xsq, xsq
    "\n\t" "fldt  %[s_NORMAL_q2]"           //q2, (q1+xsq)*xsq, xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //q2+(q1+xsq)*xsq, xsq
    "\n\t" "fmul %%st(1), %%st(0)"          //(q2+(q1+xsq)*xsq)*xsq, xsq
    "\n\t" "fldt  %[s_NORMAL_q3]"           //q3, (q2+(q1+xsq)*xsq)*xsq, xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //q3+(q2+(q1+xsq)*xsq)*xsq, xsq
    "\n\t" "fmul %%st(1), %%st(0)"          //(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq, xsq
    "\n\t" "fldt  %[s_NORMAL_q4]"       //q4, (q3+(q2+(q1+xsq)*xsq)*xsq)*xsq, xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq, xsq
    "\n\t" "fmul %%st(1), %%st(0)"          //(q4+(q3+(q2+(q1+                xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fldt  %[s_NORMAL_q5]"           //q5, (q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    //num
    "\n\t" "fldt  %[s_NORMAL_p0]"       //p0, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fmul %%st(2), %%st(0)"          //p0*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fldt  %[s_NORMAL_p1]"           //p1, p0*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //p1+p0*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fmul %%st(2), %%st(0)"          //(p1+p0*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fldt  %[s_NORMAL_p2]"           //p2, (p1+p0*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //p2+(p1+p0*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fmul %%st(2), %%st(0)"          //(p2+(p1+p0*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fldt  %[s_NORMAL_p3]"       //p3, (p2+(p1+p0*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //p3+(p2+(p1+p0*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fmul %%st(2), %%st(0)"          //(p3+(p2+(p1+p0*xsq)*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fldt  %[s_NORMAL_p4]"       //p4, (p3+(p2+(p1+p0*xsq)*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //p4+(p3+(p2+(p1+p0*xsq)*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fmul %%st(2), %%st(0)"          //(p4+(p3+(p2+(p1+p0*xsq)*xsq)*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fldt  %[s_NORMAL_p5]"       //p5, (p4+(p3+(p2+(p1+p0*xsq)*xsq)*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "faddp %%st(0), %%st(1)"     //p5+(p4+(p3+(p2+(p1+p0*xsq)*xsq)*xsq)*xsq)*xsq, q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq), xsq
    "\n\t" "fmulp %%st(0), %%st(2)"     //q5+(q4+(q3+(q2+(q1+xsq)*xsq)*xsq)*xsq)=den, (p5+(p4+(p3+(p2+(p1+p0*xsq)*xsq)*xsq)*xsq)*xsq)*xsq=num
    // num/den
    "\n\t" "fdivp %%st(0), %%st(1)"     //num/den
    "\n\t" "fldt  %[s_oneOverRootTwoPi]"    // oneOverRootTwoPi, num/den
    "\n\t" "fsubrp %%st(0), %%st(1)"        //oneOverRootTwoPi - num/den
    "\n\t" "fldl %[y]"                      //y, (oneOverRootTwoPi - num/den)
    "\n\t" "fdiv %%st(0), %%st(1)"          //y, (oneOverRootTwoPi - num/den)/y

    "\n\t" "sub $8, %%esp" 
    "\n\t" "fstpl  (%%esp)"             //(oneOverRootTwoPi - num/den)/y
    "\n\t" "call (%P[exp_X2_2])" 
    "\n\t" "add $8, %%esp"
    "\n\t" "fmulp %%st(0), %%st(1)" 

    "\n\t" "leave"
    "\n\t" "ret"
    : 
    : [y] "m" (y)
    , [s_oneOverRootTwoPi] "m" (*s_oneOverRootTwoPi)
    , [s_NORMAL_p0] "m" (*s_NORMAL_p0)
    , [s_NORMAL_p1] "m" (*s_NORMAL_p1)
    , [s_NORMAL_p2] "m" (*s_NORMAL_p2)
    , [s_NORMAL_p3] "m" (*s_NORMAL_p3)
    , [s_NORMAL_p4] "m" (*s_NORMAL_p4)
    , [s_NORMAL_p5] "m" (*s_NORMAL_p5)
    , [s_NORMAL_q1] "m" (*s_NORMAL_q1)
    , [s_NORMAL_q2] "m" (*s_NORMAL_q2)
    , [s_NORMAL_q3] "m" (*s_NORMAL_q3)
    , [s_NORMAL_q4] "m" (*s_NORMAL_q4)
    , [s_NORMAL_q5] "m" (*s_NORMAL_q5)
    , [exp_X2_2] "i" (exp_X2_2)
    : 
    );
}

This is the state of the FPU immediately prior to the ineffective load:

(gdb) info float
  R7: Valid   0x3ff4bba0d78724c01468 +0.00071574511983807409
=>R6: Valid   0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
  R5: Empty   0x3ff6f7e2148bebef37f8
  R4: Empty   0x000000020a0d00000007
  R3: Empty   0xf1be000000000002021e
  R2: Empty   0x00001697f1bf00003654
  R1: Empty   0x00020098007c00f8f0c0
  R0: Empty   0x00077c81cc3b0002021e

Status Word:         0xffff3320                  PE             C0 C1
                       TOP: 6
Control Word:        0xffff037f   IM DM ZM OM UM PM
                       PC: Extended Precision (64-bits)
                       RC: Round to nearest
Tag Word:            0xffff0fff
Instruction Pointer: 0x1b:0x00402dbd
Operand Pointer:     0xffff0023:0x004046f0
Opcode:              0xd8c9

I am wondering what is the meaning of the "C1" flag in the status word above - I cannot find documentation on this. This is the state immediately after the failed fldt (executed by the stepi):

(gdb) stepi
0x00402dc5      1485            );
(gdb) info float
  R7: Valid   0x3ff4bba0d78724c01468 +0.00071574511983807409
  R6: Valid   0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
=>R5: Zero    0x00000000000000000000 +0
  R4: Empty   0x000000020a0d00000007
  R3: Empty   0xf1be000000000002021e
  R2: Empty   0x00001697f1bf00003654
  R1: Empty   0x00020098007c00f8f0c0
  R0: Empty   0x00077c81cc3b0002021e

Status Word:         0xffff2920                  PE             C0
                       TOP: 5
Control Word:        0xffff037f   IM DM ZM OM UM PM
                       PC: Extended Precision (64-bits)
                       RC: Round to nearest
Tag Word:            0xffff07ff
Instruction Pointer: 0x1b:0x00402dbf
Operand Pointer:     0xffff0023:0x0040cce6
Opcode:              0xdb2d

OK, I have now modified the code so that the instructions at 0x00402db5 and 0x00402dbf are identical. The first succeeds, the second fails. Here is a gdb session showing the disassembled code, and the fpu state immediately before execution of the two identical instructions. The only significant difference in the state is the presence of the C1 flag in the status prior to the execution of the second fldt instruction:

(gdb) disassemble
Dump of assembler code for function poly4(double):
   0x00402d7c <+0>:     push   %ebp
   0x00402d7d <+1>:     mov    %esp,%ebp
   0x00402d7f <+3>:     sub    $0x8,%esp
   0x00402d82 <+6>:     mov    0x8(%ebp),%eax
   0x00402d85 <+9>:     mov    %eax,-0x8(%ebp)
   0x00402d88 <+12>:    mov    0xc(%ebp),%eax
   0x00402d8b <+15>:    mov    %eax,-0x4(%ebp)
=> 0x00402d8e <+18>:    fld1
   0x00402d90 <+20>:    fldl   -0x8(%ebp)
   0x00402d93 <+23>:    fmul   %st(0),%st
   0x00402d95 <+25>:    fdivrp %st,%st(1)
   0x00402d97 <+27>:    fldt   0x40470e
   0x00402d9d <+33>:    fadd   %st(1),%st
   0x00402d9f <+35>:    fmul   %st(1),%st
   0x00402da1 <+37>:    fldt   0x404704
   0x00402da7 <+43>:    faddp  %st,%st(1)
   0x00402da9 <+45>:    fmul   %st(1),%st
   0x00402dab <+47>:    fldt   0x4046fa
   0x00402db1 <+53>:    faddp  %st,%st(1)
   0x00402db3 <+55>:    fmul   %st(1),%st
   0x00402db5 <+57>:    fldt   0x4046f0
   0x00402dbb <+63>:    faddp  %st,%st(1)
   0x00402dbd <+65>:    fmul   %st(1),%st
   0x00402dbf <+67>:    fldt   0x4046f0
   0x00402dc5 <+73>:    faddp  %st,%st(1)
   0x00402dc7 <+75>:    fldt   0x4046dc
   0x00402dcd <+81>:    fmul   %st(2),%st
   0x00402dcf <+83>:    fldt   0x4046d2
   0x00402dd5 <+89>:    faddp  %st,%st(1)
   0x00402dd7 <+91>:    fmul   %st(2),%st
   0x00402dd9 <+93>:    fldt   0x4046c8
   0x00402ddf <+99>:    faddp  %st,%st(1)
   0x00402de1 <+101>:   fmul   %st(2),%st
   0x00402de3 <+103>:   fldt   0x4046be
   0x00402de9 <+109>:   faddp  %st,%st(1)
   0x00402deb <+111>:   fmul   %st(2),%st
   0x00402ded <+113>:   fldt   0x4046b4
   0x00402df3 <+119>:   faddp  %st,%st(1)
   0x00402df5 <+121>:   fmul   %st(2),%st
   0x00402df7 <+123>:   fldt   0x4046aa
   0x00402dfd <+129>:   faddp  %st,%st(1)
   0x00402dff <+131>:   fmulp  %st,%st(2)
   0x00402e01 <+133>:   fdivrp %st,%st(1)
   0x00402e03 <+135>:   fldt   0x40408e
   0x00402e09 <+141>:   fsubrp %st,%st(1)
   0x00402e0b <+143>:   fldl   -0x8(%ebp)
   0x00402e0e <+146>:   fdivr  %st,%st(1)
   0x00402e10 <+148>:   sub    $0x8,%esp
   0x00402e13 <+151>:   fstpl  (%esp)
   0x00402e16 <+154>:   fwait
   0x00402e17 <+155>:   call   0x4013c0 <exp_X2_2(double)>
   0x00402e1c <+160>:   add    $0x8,%esp
   0x00402e1f <+163>:   fmulp  %st,%st(1)
   0x00402e21 <+165>:   fstl   0x406020
   0x00402e27 <+171>:   fld    %st(0)
   0x00402e29 <+173>:   fsubl  0x406020
   0x00402e2f <+179>:   fildll 0x403020
   0x00402e35 <+185>:   fmulp  %st,%st(1)
   0x00402e37 <+187>:   fstpl  0x406020
   0x00402e3d <+193>:   leave
   0x00402e3e <+194>:   ret
   0x00402e3f <+195>:   flds   0x40472c
   0x00402e45 <+201>:   leave
   0x00402e46 <+202>:   ret
End of assembler dump.
(gdb) tbreak *0x00402db5
Temporary breakpoint 61 at 0x402db5: file cody2.cpp, line 1489.
(gdb) continue
Continuing.

Temporary breakpoint 61, 0x00402db5 in poly4 (y=37.37840817302294) at cody2.cpp:1489
1489            );
(gdb) info float
  R7: Valid   0x3ff4bba0d78724c01468 +0.00071574511983807409
=>R6: Valid   0x3ff0c71ba235b8f6a603 +4.7471033066735141e-005
  R5: Empty   0x3ffb8724c9f09161c535
  R4: Empty   0xf13d00000a0d00000007
  R3: Empty   0x07ec000000000002021e
  R2: Empty   0x000016cbc40900003654
  R1: Empty   0x00020098007c00f8f0c0
  R0: Empty   0x00077c81cc3b0002021e

Status Word:         0xffff3120                  PE             C0
                       TOP: 6
Control Word:        0xffff037f   IM DM ZM OM UM PM
                       PC: Extended Precision (64-bits)
                       RC: Round to nearest
Tag Word:            0xffff0fff
Instruction Pointer: 0x1b:0x00402db3
Operand Pointer:     0xffff0023:0x004046fa
Opcode:              0xd8c9
(gdb) stepi
0x00402dbb      1489            );
(gdb) info float
  R7: Valid   0x3ff4bba0d78724c01468 +0.00071574511983807409
  R6: Valid   0x3ff0c71ba235b8f6a603 +4.7471033066735141e-005
=>R5: Valid   0x3ff6f7e2148bebef37f8 +0.0037823963320275824
  R4: Empty   0xf13d00000a0d00000007
  R3: Empty   0x07ec000000000002021e
  R2: Empty   0x000016cbc40900003654
  R1: Empty   0x00020098007c00f8f0c0
  R0: Empty   0x00077c81cc3b0002021e

Status Word:         0xffff2920                  PE             C0
                       TOP: 5
Control Word:        0xffff037f   IM DM ZM OM UM PM
                       PC: Extended Precision (64-bits)
                       RC: Round to nearest
Tag Word:            0xffff03ff
Instruction Pointer: 0x1b:0x00402db5
Operand Pointer:     0xffff0023:0x004046f0
Opcode:              0xdb2d
(gdb) stepi
0x00402dbd      1489            );
(gdb) stepi
0x00402dbf      1489            );
(gdb) info float
  R7: Valid   0x3ff4bba0d78724c01468 +0.00071574511983807409
=>R6: Valid   0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
  R5: Empty   0x3ff6f7e2148bebef37f8
  R4: Empty   0xf13d00000a0d00000007
  R3: Empty   0x07ec000000000002021e
  R2: Empty   0x000016cbc40900003654
  R1: Empty   0x00020098007c00f8f0c0
  R0: Empty   0x00077c81cc3b0002021e

Status Word:         0xffff3320                  PE             C0 C1
                       TOP: 6
Control Word:        0xffff037f   IM DM ZM OM UM PM
                       PC: Extended Precision (64-bits)
                       RC: Round to nearest
Tag Word:            0xffff0fff
Instruction Pointer: 0x1b:0x00402dbd
Operand Pointer:     0xffff0023:0x004046f0
Opcode:              0xd8c9
(gdb) stepi
0x00402dc5      1489            );
(gdb) info float
  R7: Valid   0x3ff4bba0d78724c01468 +0.00071574511983807409
  R6: Valid   0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
=>R5: Zero    0x00000000000000000000 +0
  R4: Empty   0xf13d00000a0d00000007
  R3: Empty   0x07ec000000000002021e
  R2: Empty   0x000016cbc40900003654
  R1: Empty   0x00020098007c00f8f0c0
  R0: Empty   0x00077c81cc3b0002021e

Status Word:         0xffff2920                  PE             C0
                       TOP: 5
Control Word:        0xffff037f   IM DM ZM OM UM PM
                       PC: Extended Precision (64-bits)
                       RC: Round to nearest
Tag Word:            0xffff07ff
Instruction Pointer: 0x1b:0x00402dbf
Operand Pointer:     0xffff0023:0x0040ccf0
Opcode:              0xdb2d
(gdb)

OK, I am so completely baffled. The code generated by masm and gcc is identical (except address of course) masm: db 2d 24 69 41 00, gdb: 0xdb 0x2d 0xe6 0x46 0x40 0x00 - same op code, 0x2ddb. And data at the target address is identical, yet the masm code behaves as expected, the gdb code loads zero. — David I. McIntosh, May 09 '11 at 02:45
Data at the target addresses - masm: 0x00416924 96 89 d6 a5 00 3d 0a 99 f1 3f, gdb: 0x4046e6 <_ZL11s_NORMAL_q5>: 0x8996 0xa5d6 0x3d00 0x990a 0x3ff1 — David I. McIntosh, May 09 '11 at 02:54
OK, I have manually looked at the byte-codes generated by MASM and by the GNU inline assembler, and for the inline assembly part, the byte codes are identical, and the state of the CPU coming in to the routine seems identical. Yet when executing the code under gdb, the second load fails, and in the Microsoft debugger, all is well. This makes no sense. — David I. McIntosh, May 09 '11 at 18:57
Well, it turns out the code executes just fine when executed outside of GDB, so GDB is somehow screwing up the code so that it does not execute as it would outside GDB. Anybody care to look? Its a rather small, self-contained problem, only two files. — David I. McIntosh, May 09 '11 at 19:02
Since this is looking suspiciously like a GDB bug, I have to ask: what version of GDB are you using? — SamB, May 09 '11 at 21:04
You have dozens of lines of code but your problem is that a constant is not loading correctly. You should simplify your repro to the minimum amount of code needed to show the problem. This shows respect for the time of the people you are asking for help, makes it more likely that they can/will help, and often results in you finding more about the problem. — Bruce Dawson, Mar 04 '15 at 16:39

score 1 · Answer 1 · answered May 09 '11 at 01:58

1

I note that in the debugger, the entries on the stack below the valid entries are marked invalid float value:

st2  <invalid float value>    (raw 0x00077c81cc3b0002021e)
st3  <invalid float value>    (raw 0x00020098007c00f8f0c0)
st4  0                        (raw 0x000013af076300003654)
st5  <invalid float value>    (raw 0x0762000000000002021e)

This leads me to believe that your x87 stack may be corrupted before this routine begins executing. This could be either because you have overflowed the stack, or another routine has used MMX instructions without issuing the requiredemms instruction to restore the floating-point state. Break at entry to your routine and put info float in the gdb, and report the results.

answered May 09 '11 at 01:58

Stephen Canon

103,815
19
183
269

Hi Stephen. Thanks for your comments. Yes, there must be something bizzare about the state of the FPU, but I cannot figure out what the issue might be. The state of the FPU at the start of the call: (gdb) info float R7: Empty 0x400495837d6e9eb0c800 R6: Empty 0x4001b504f333f9de6800 R5: Empty 0xefd07c81cbfe00f8f08c R4: Empty 0xf13d00000a0d00000007 R3: Empty 0x07ec000000000002021e R2: Empty 0x00001687528700003654 R1: Empty 0x00020098007c00f8f0c0 =>R0: Empty 0x00077c81cc3b0002021e – David I. McIntosh May 09 '11 at 14:07
Status Word: 0xffff0100 C0 TOP: 0 Control Word: 0xffff037f IM DM ZM OM UM PM PC: Extended Precision (64-bits) RC: Round to nearest Tag Word: 0xffffffff Instruction Pointer: 0x1b:0x00401580 Operand Pointer: 0xffff0023:0x0022fd80 Opcode: 0xdae9 – David I. McIntosh May 09 '11 at 14:10
Note that the fpu stack is completely empty. Even more baffling: if I change the code above so that the "fldt %[s_NORMAL_q5]" changes to "fldt %[s_NORMAL_q4]", i.e. is _identical_ to the previous fldt command, the second load fails. And note that at the time of both fldt instructions, st(7) contains the discarded value from the previous fmulp instruction, so there should be no issue with a non-empty stack element. – David I. McIntosh May 09 '11 at 14:19
Arrrrggggg! If I put a simple "fnop" before the second (not-working) fldt instruction, it completely goes hay-wire. FPU state after fldt is now :(gdb) info float R7: Valid 0x3ff4bba0d78724c01468 +0.00071574511983807409 =>R6: Special 0xffffc000000000000000 Real Indefinite (QNaN) R5: Empty 0x3ff6f7e2148bebef37f8 R4: Empty 0xf13d00000a0d00000007 R3: Empty 0x07ec000000000002021e R2: Empty 0x000016d76df400003654 R1: Empty 0x00020098007c00f8f0c0 R0: Empty 0x00077c81cc3b0002021e – David I. McIntosh May 09 '11 at 15:36
Status Word: 0xffff3161 IE PE SF C0 TOP: 6 Control Word: 0xffff037f IM DM ZM OM UM PM PC: Extended Precision (64-bits) RC: Round to nearest Tag Word: 0xffff2fff Instruction Pointer: 0x1b:0x00402dc1 Operand Pointer: 0xffff0023:0x004046f0 Opcode: 0xdbcc – David I. McIntosh May 09 '11 at 15:36
Arrgggg. This seems to be a timing issue. If I move the "fwait" in the (last) code snippet above, things get really screwy - if I move the "fwait" from after the "sub $0x8,%esp" "fstpl (%esp)" to before these two instructions, the second fldt instruction now loads absolute garbage, and gdb seems to become very confused (e.g. it cannot handle/display the garbage-but-valid value, 0x2210dc493ffc82757332, loaded into the fpu register._ – David I. McIntosh May 09 '11 at 15:56
If I place a "fclex" before the fldt that fails, it suddenly starts working, yet the only exception status that is set (and thus the only one that is cleared by the fclex instruction) is the precision exception (PE). Why would PE=1 in this case cause the load to fail? In most of the other fldt instructions, the PE is also set at the time they are executed, yet they succeed just fine. I am still baffled. – David I. McIntosh May 09 '11 at 17:12
That is bad. Do I need to put fclex before each fldt? Why does the IDENTICAL code generated by masm (identical except for addresses) work flawlessly? I cannot see anything wrong with the code being generated by the gnu inline assembler. – David I. McIntosh May 09 '11 at 17:15

Crazy behaviour during execution

1 Answers1