6

Modulus of short integers not correct. This is really weird, and already cost me two days. I've narrowed the problematic code as follows (simplified as far as possible):

#include <stdio.h>
#include <stdlib.h>

int foo(short Width, short Height, short MSize) 
{
    short i = 0, k = 0, pos = 0;
    short j = 0;

    for(j = 1; j < Width - 1; j = j + 1)
    {/* a blank loop */}

    for(i = 1; i < Height - 1; i = i + 1) {
        for(j = 1; j < Width - 1; j = j + 1) {
            if((j % MSize) == 0) {
                k = k + 1;
            }
            printf("i=%d, k=%d, j=%d, MSize=%d, j mod MSize=%d\n", (int)i, (int)k, (int)j, (int)MSize, (int)(j % MSize));
            if (pos >= 1024) {
                fprintf(stderr, "pos = %d, over 1024\n", (int)pos);
            }
            pos = pos + 1;
        }
    }
    return 0;
}

int main(int argc, char* argv[])
{
    foo(32, 32, 8);
    return 0;
}

When compiled in Debug mode, the above codes work fine, the result of j%MSize is correct, however, when compiled in Release mode, the result of j%MSize will always be 7, which is nonsense (tested under Visual Studio 2005/2012/2013). There is no memory operation, so it should not be caused by stack corruption. Anybody has a clue?

The output I'm seeing is (slightly edited):

j=10, MSize=8, j mod MSize=7
j=11, MSize=8, j mod MSize=7
j=12, MSize=8, j mod MSize=7
j=13, MSize=8, j mod MSize=7
j=14, MSize=8, j mod MSize=7
j=15, MSize=8, j mod MSize=7
j=16, MSize=8, j mod MSize=7
j=17, MSize=8, j mod MSize=7
j=18, MSize=8, j mod MSize=7
j=19, MSize=8, j mod MSize=7
j=20, MSize=8, j mod MSize=7
j=21, MSize=8, j mod MSize=7
j=22, MSize=8, j mod MSize=7
j=23, MSize=8, j mod MSize=7
j=24, MSize=8, j mod MSize=7
j=25, MSize=8, j mod MSize=7
j=26, MSize=8, j mod MSize=7
j=27, MSize=8, j mod MSize=7

The following is the build log:

 1>Project "E:\Code\workspace\C\GeneralC\SNDFeatureExtract\SNDFeatureExtract.vcxproj" on node 2 (Build target(s)).

 1>ClCompile:

     D:\Program Files\Microsoft Visual Studio 11.0\VC\bin\CL.exe /c /Zi /nologo /W3 /WX- /sdl /O2 /Oi /Oy- /GL /D WIN32 /D NDEBUG /D _CONSOLE /D _MBCS /Gm- /EHsc /MT /GS /Gy /fp:precise /Zc:wchar_t /Zc:forScope /Fo"Release\\" /Fd"Release\vc110.pdb" /Gd /TP /analyze- /errorReport:prompt WeirdBug.cpp

     WeirdBug.cpp

   Link:

     D:\Program Files\Microsoft Visual Studio 11.0\VC\bin\link.exe /ERRORREPORT:PROMPT /OUT:"E:\Code\workspace\C\GeneralC\Release\SNDFeatureExtract.exe" /INCREMENTAL:NO /NOLOGO kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /MANIFEST /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /manifest:embed /DEBUG /PDB:"E:\Code\workspace\C\GeneralC\Release\SNDFeatureExtract.pdb" /SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /LTCG /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"E:\Code\workspace\C\GeneralC\Release\SNDFeatureExtract.lib" /MACHINE:X86 /SAFESEH Release\WeirdBug.obj

     Generating code

     Finished generating code

     SNDFeatureExtract.vcxproj -> E:\Code\workspace\C\GeneralC\Release\SNDFeatureExtract.exe

 1>Done Building Project "E:\Code\workspace\C\GeneralC\SNDFeatureExtract\SNDFeatureExtract.vcxproj" (Build target(s)).

The following is the Disassembly result from VS:

    short i = 0, k = 0, pos = 0;
    short j = 0;

    for(j = 1; j < Width - 1; j = j + 1)
00801014  mov         edi,1FF983C8h  
00801019  jl          foo+12h (0801012h)  
    {/* a blank loop */}

    for(i = 1; i < Height - 1; i = i + 1) {
0080101B  mov         edx,1  
00801020  mov         dword ptr [ebp-4],1  
00801027  mov         dword ptr [ebp-8],edx  
0080102A  and         ecx,80000007h  
00801030  jns         foo+37h (0801037h)  
00801032  dec         ecx  
00801033  or          ecx,0FFFFFFF8h  
00801036  inc         ecx  
00801037  mov         dword ptr [ebp-0Ch],ecx  
0080103A  lea         ebx,[ebx]  
00801040  mov         eax,1  
        for(j = 1; j < Width - 1; j = j + 1) {
00801045  mov         ebx,eax  
            if((j % MSize) == 0) {
00801047  test        ecx,ecx  
00801049  jne         foo+4Ch (080104Ch)  
                k = k + 1;
0080104B  inc         edi  
            }
            printf_s("i=%d, k=%d, j=%d, MSize=%d, j mod MSize=%d\n", (int)i, (int)k, (int)j, (int)MSize, (int)(j % MSize));
0080104C  push        ecx  
0080104D  push        8  
0080104F  push        eax  
00801050  movsx       eax,di  
00801053  push        eax  
00801054  push        edx  
00801055  push        80CD30h  
0080105A  call        printf_s (0801266h)  
            if (pos >= 1024) {
0080105F  mov         eax,400h  
00801064  add         esp,18h  
00801067  cmp         si,ax  
0080106A  jl          foo+86h (0801086h)  
                fprintf_s(stderr, "pos = %d, over 1024\n", (int)pos);
0080106C  movsx       eax,si  
                fprintf_s(stderr, "pos = %d, over 1024\n", (int)pos);
0080106F  push        eax  
00801070  push        80CD5Ch  
00801075  call        __iob_func (0801175h)  
0080107A  add         eax,40h  
0080107D  push        eax  
0080107E  call        fprintf_s (080127Ch)  
00801083  add         esp,0Ch  
        for(j = 1; j < Width - 1; j = j + 1) {
00801086  mov         ecx,dword ptr [ebp-0Ch]  
00801089  mov         edx,dword ptr [ebp-8]  
            }
            pos = pos + 1;
0080108C  inc         ebx  
0080108D  movsx       eax,bx  
00801090  inc         esi  
00801091  cmp         eax,1Fh  
00801094  jl          foo+47h (0801047h)  
    {/* a blank loop */}

    for(i = 1; i < Height - 1; i = i + 1) {
00801096  mov         eax,dword ptr [ebp-4]  
00801099  inc         eax  
0080109A  movsx       edx,ax  
0080109D  mov         dword ptr [ebp-4],eax  
008010A0  mov         dword ptr [ebp-8],edx  
008010A3  cmp         edx,1Fh  
008010A6  jl          foo+40h (0801040h)  
        }
    }
    return 0;
008010A8  pop         edi  
008010A9  pop         esi  
008010AA  xor         eax,eax  
008010AC  pop         ebx  
}
008010AD  mov         esp,ebp  
008010AF  pop         ebp  
008010B0  ret  
Jedi
  • 882
  • 1
  • 8
  • 24
  • 3
    Can you present a more concise example. Is `{/* a blank loop */}` really necessary. – this Apr 02 '14 at 05:22
  • i, j, k, pos intialize these variables first then use them – Ali Kazmi Apr 02 '14 at 05:23
  • 2
    @AliKazmi: read the core more carefully - they are properly initialized – lisyarus Apr 02 '14 at 05:24
  • Problem remains even all variables are initialized to 0 – Jedi Apr 02 '14 at 05:27
  • Can you paste in the output from the print lines? – Tony Delroy Apr 02 '14 at 05:31
  • The blank loop is functionally unnecessary, but necessary to repeat the problem. If the blank loop is removed, the above codes work fine, even in release mode. So, it's really weird – Jedi Apr 02 '14 at 05:31
  • 1
    [Cannot reproduce](http://rextester.com/ZTCJ97180). – n. m. could be an AI Apr 02 '14 at 05:41
  • This could be a compiler bug in VS caused by optimization (like loop unrolling). In general it's good practice to declare your iterators (`i`, `j`, ...) inside the loops scope. I guess that the problem will go away if you declare `j` in the head of the for-loop. Can't verify this at the moment though. – Excelcius Apr 02 '14 at 05:42
  • @n.m.: the difference between Debug/Release suggests that compiler flags could be relevant - do the flags in your test match those used by the compiler in Release Mode? – DCoder Apr 02 '14 at 05:51
  • @DCode I have tried the default flags used by the online compiler, and also triwd wit /Ox. No idea what flags the VS IDE uses. – n. m. could be an AI Apr 02 '14 at 05:55
  • @Jedi: I cannot reproduce this issue, just like n.m. couldn't. Can you find a log file in the `Release/` directory and amend your post with its contents? It should contain a record of when the project was built and what compiler/linker flags were used. That way we can more accurately reproduce your environment and maybe recreate the problem. – DCoder Apr 02 '14 at 06:20
  • @DCoder: I've added the build log. The codes are built with VS's default settings. You can test with your environment. – Jedi Apr 02 '14 at 06:31
  • The culprit is probably the whole program optimization flag (/GL). If you turn it off this should fix the problem. – n. m. could be an AI Apr 02 '14 at 06:33
  • @n.m.: yes, when whole program optimization is turned off, the modulus result is correct. So what is this? a compiler bug? – Jedi Apr 02 '14 at 06:43
  • Looks like a compiler bug to me. – n. m. could be an AI Apr 02 '14 at 06:48
  • 3
    I've opened a bug on Microsoft Connect: https://connect.microsoft.com/VisualStudio/feedback/details/845052/optimization-bug-with-short-data-types-loops-and-whole-program-optimzation-gl – Michael Burr Apr 02 '14 at 07:28
  • @Jedi: your mission, if you accept it, is to try and reduce the hell out of this program. That is, try to remove other parts of the function, reduce the value of the inputs, remove some `printf`, etc... and, of course, still get the bug reproduced. In general, the smaller the program gets the easier it will be to understand the issue (yes, even the compiler issues). – Matthieu M. Apr 02 '14 at 07:34
  • 1
    @MatthieuM.: I've posted an attachment to the Connect bug that is slightly smaller; it has about 10 lines and 4 variables removed. It seems that any small change from that form causes the problem to disappear. – Michael Burr Apr 02 '14 at 07:47
  • @MichaelBurr: that's the issue with compiler bugs, they are most often triggered by such a weird set of conditions that manual reduction is painful. In general, the smallest example is only achieved *after* having understood what within the compiler triggered the bug :( – Matthieu M. Apr 02 '14 at 07:52
  • 1
    @MatthieuM.: I've reduced the codes as far as possible. The post already re-editted. – Jedi Apr 02 '14 at 08:01
  • Does the problem go away when you use `((int)j % MSize)` instead of `(int)(j % MSize))` in both lines where you use this? If the problem is caused by `j` being `short`, this may well solve it. As far as efficiency, it should be slightly faster (!) to do the `mod` operation on an integer. – Jongware Apr 02 '14 at 08:38
  • @Jongware: No. The point is what's wrong here, not a temporary solution. – Jedi Apr 02 '14 at 09:07
  • Sure, but it could have narrowed the problem down to the implementation of `mod` with `int` versus `short`. Are you capable of producing assembler code with VS? – Jongware Apr 02 '14 at 10:15
  • @Jongware: don't the rules of C implicitly cast any "small" integral to `int` before performing operations on it anyway ? – Matthieu M. Apr 02 '14 at 12:05
  • @MatthieuM.: I don't *think* the C rules says it should. 'Modulus' is a basic operation, just like '+' and '-', and when doing maths with shorts or chars, it's the programmer's responsibility to watch out for overflows; the compiler doesn't care. Perhaps there is something concrete in the C standard about this, but I doubt it. – Jongware Apr 02 '14 at 13:01
  • @Jongware: I believe it does (quick Internet search brings the opaque reference 6.3.1.8), all basic operations on small integers (`char`, `signed char`, `unsigned char`, `short`, `unsigned short`) start by first converting the arguments to `int` before operating. Therefore `char a, b; static_assert(sizeof(a + b) == sizeof(int), "");` will compile. The only question, thus, is whether this promotion to `int` also applies for *modulus* or not, and I don't have a C Standard at hand :( – Matthieu M. Apr 02 '14 at 13:27
  • @Jongware: Disassembly result from VS is added. – Jedi Apr 03 '14 at 02:11

3 Answers3

8

it is because the compiler's optimization, and this is something todo with your blank loop. But I'm not quite sure where the problem is.

To simply solve the question, declare j as:

  volatile short j;

and it will works fine. Cause program will fetch j from memory instead of registers every time.

I debugged the assembly code, and find out the program calculate the j % MSize and stored it into memory just after the blank loop, and every time before doing printf, it just fetch the value from memory instead of re-calculating it.

mov         ecx,dword ptr [ebp-10h] // j % MSize    @ memory
push        ecx  // j % MSize
mov         ecx,dword ptr [ebp-0Ch]  
push        8  // MSize
push        eax  // j
movsx       eax,word ptr [IdxY]  
movsx       esi,di  
push        esi  // k
push        eax  // IdxY
push        ecx  // i
// push static string and calling printf

But adding a volatile, it will be act like:

mov         dx,word ptr [j]  
movsx       eax,dx  // j
and         eax,80000007h  // j % 8
push        eax 
// push other vars and calling printf

That's re-calculating the MOD, and push it into stack for printf. So it's likely a compiler's bug, because it should fetch j from memory even if there is no volatile add.

Since I can't add comments again now :(.. I found out it's the /Oxxx and /GL flag's fault. It will choose one from below:

/O1 /O2 /Ox

It has to choose one of the above options along with /GL to see the issue.

My IDE is Visual Studio 2010 10.0.40219.1 SP1Rel

Jim Yang
  • 479
  • 2
  • 12
  • 3
    If you're not sure where the problem is, why do you think the blank loop is causing it? And adding `volatile` is a kludge, plain and simple, not a solution :-) – paxdiablo Apr 02 '14 at 05:48
  • It worked as charm! So, a further question: when should I declare a variable this way to prevent such kind of problem? – Jedi Apr 02 '14 at 05:50
  • 1
    @paxdiablo The author of the question said in one of the comments that the problem doesn't show up when the empty loop is removed, so that assumption is probably correct. However `volatile` is certainly not the way to go, an empty loop doesn't make any sense and if you fill it with useful code the bug will probably vanish as well. – Excelcius Apr 02 '14 at 05:51
  • @paxdiablo : Because when I removed the loop, it worked fine. And i can't add comment before he took my answer lol.. – Jim Yang Apr 02 '14 at 05:52
  • 1
    @Jedi It is usually used in multi-threading programs to avoid 'unexpected' change of a var. But it is weird that happens here. Just like *paxdiablo* says, it's not the final solution, it's just a tricky way to solve the problem. – Jim Yang Apr 02 '14 at 05:55
  • `volatile` inhibits optimisations. Not really surprising. – n. m. could be an AI Apr 02 '14 at 05:57
  • @n.m. that's right, but we still not found out why. – Jim Yang Apr 02 '14 at 05:59
  • @Jim Yang: yes you're right, so I think it's better label the question unanswered for more discussions here – Jedi Apr 02 '14 at 06:01
  • @Jedi It should be :P – Jim Yang Apr 02 '14 at 06:03
  • Another couple data points: I have to use "Whole Program Optimization" (`/GL`) along with `/O2` (or `/Ox`) to get the problem to occur. Also, it occurs for both x86 and x64 targets, which surprised me a little. – Michael Burr Apr 02 '14 at 06:37
  • @Jedi I think this is a nice answer which clearly shows what the program is doing wrong, but I don't consider this to be a real solution in your simple case. I have added a new answer which solves the problem as well without the use of `volatile` or a change of optimization settings. – Excelcius Apr 02 '14 at 07:33
  • @JimYang: I don't see the problem with just `/O1`, `/O2`, or `/Ox`. I have to use one of those options along with `/GL` to repro the problem. – Michael Burr Apr 02 '14 at 07:52
  • @MichaelBurr No(i'm not sure i should say yes or no here, because my English is poor.). I can only see the issue when use one of those options along with /GL. – Jim Yang Apr 02 '14 at 07:55
  • 2
    Looking at the necessary conditions so far, I assume that the error is caused by a combination of Common Subexpression Elimination and code movement. "Reloading a value" is the correct optimization for a true Common Subexpression, but that of course requires first calculating and storing that CSE. And the optimizer needs to find a place for that. It looks like the CSE calculation is incorrectly hoisted out of the loop **iff** there's an empty loop before it. – MSalters Apr 02 '14 at 11:05
2

I don't see any issue

$ gcc modulus.c 
$ ./a.out 
Width = 32, Height = 32, MSize = 8, Dim =16, sizeof(short)=2
i=1, IdxY=0, k=0, j=1, MSize=8, j mod MSize=1
i=1, IdxY=0, k=0, j=2, MSize=8, j mod MSize=2
i=1, IdxY=0, k=0, j=3, MSize=8, j mod MSize=3
i=1, IdxY=0, k=0, j=4, MSize=8, j mod MSize=4
i=1, IdxY=0, k=0, j=5, MSize=8, j mod MSize=5
i=1, IdxY=0, k=0, j=6, MSize=8, j mod MSize=6
i=1, IdxY=0, k=0, j=7, MSize=8, j mod MSize=7
i=1, IdxY=0, k=1, j=8, MSize=8, j mod MSize=0

Am i missing something?

user207064
  • 665
  • 5
  • 14
  • 5
    If this is a compiler bug it probably won't happen in gcc. OP is using VS. – Excelcius Apr 02 '14 at 05:39
  • @Excelcius: on the other hand, at least we know there is a chance the program is correct since the output is as expected with another compiler :) – Matthieu M. Apr 02 '14 at 07:32
  • @MatthieuM. You're absolutely right, I didn't mean to make this answer look wrong, it just doesn't help much in terms of providing a solution. It helps others in finding the solution though :) – Excelcius Apr 02 '14 at 07:36
1

In addition to the other answers already provided I want to point out that you can prevent such erros with better scoping (although it's not your fault in this case, it's probably a compiler bug).

Prefer to declare the iteration variables inside the loop-scope. Or even more general: Declare variables only in the scope where they are used.

If you change the second for-loop to this:

for(short j = 1; j < Width - 1; j = j + 1) {

so that j is declared in the scope of the for-loop, the compiler must treat j as a new variable which has nothing to do with the previous empty loop. Therefore it is less likely to over-optimize by reusing previous memory locations. This small change fixes the bug in VS2013 and I consider it much cleaner than using volatile.

Excelcius
  • 1,680
  • 1
  • 14
  • 31
  • Yes, I agree with you on this point, when in C++ style. But this won't work in C style – Jedi Apr 02 '14 at 07:40
  • @Jedi Sorry I haven't noticed the C-tag. You can still declare two separate variables for the first and second loop although I understand that this is probably not what you want. – Excelcius Apr 02 '14 at 07:55