4

I try to compile this code and use loop-specific pragmas to tell the compiler how many times to unroll a counted loop.

#include <vector>
int main() {
  std::vector<int> v(8192);
#pragma GCC unroll 8 // 16
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < 512; j++) {
      v[i*512+j] = i*j;
    }
  }
  return 0;
}

When I place the #pragma GCC unroll 8 before the outer for loop, the compiler doesn't unroll.

.L3:
        movd    xmm7, ecx
        mov     rax, rsi
        movdqa  xmm2, xmm6
        pshufd  xmm3, xmm7, 0
        movdqa  xmm4, xmm3
        psrlq   xmm4, 32
.L4:
        movdqa  xmm0, xmm2
        movdqa  xmm1, xmm3
        paddd   xmm2, xmm5
        add     rax, 16
        pmuludq xmm1, xmm0
        psrlq   xmm0, 32
        pmuludq xmm0, xmm4
        pshufd  xmm1, xmm1, 8
        pshufd  xmm0, xmm0, 8
        punpckldq       xmm1, xmm0
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L4
        add     ecx, 1
        add     rsi, 2048
        lea     rdx, [rax+2048]
        cmp     ecx, 16
        jne     .L3
        mov     rdi, rbp
        mov     esi, 16384
        call    _ZdlPvm
        xor     eax, eax
        pop     rbp
        ret

But when I place the #pragma GCC unroll 16 before the outer for loop, the compiler unrolls the outer loop successfully.

.L2:
        lea     rdi, [rbp+8]
        mov     rcx, rbp
        movdqa  xmm2, XMMWORD PTR .LC0[rip]
        xor     eax, eax
        and     rdi, -8
        movdqa  xmm0, XMMWORD PTR .LC1[rip]
        mov     QWORD PTR [rbp+0], 0
        lea     rdx, [rbp+4096]
        sub     rcx, rdi
        movdqa  xmm1, xmm2
        mov     QWORD PTR [rbp+2040], 0
        add     ecx, 2048
        shr     ecx, 3
        rep stosq
        lea     rax, [rbp+2048]
.L3:
        movdqa  xmm3, xmm1
        add     rax, 16
        paddd   xmm1, xmm0
        movups  XMMWORD PTR [rax-16], xmm3
        cmp     rax, rdx
        jne     .L3
        lea     rdx, [rbp+6144]
        movdqa  xmm3, xmm2
.L4:
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L4
        lea     rdx, [rbp+8192]
        movdqa  xmm3, xmm2
.L5:
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L5
        mov     rax, rdx
        movdqa  xmm3, xmm2
        lea     rdx, [rbp+10240]
.L6:
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 2
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L6
        mov     rdx, rax
        movdqa  xmm3, xmm2
        lea     rax, [rbp+12288]
.L7:
        movdqa  xmm4, xmm3
        add     rdx, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 2
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rdx-16], xmm1
        cmp     rax, rdx
        jne     .L7
        lea     rdx, [rbp+14336]
        movdqa  xmm3, xmm2
.L8:
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L8
        movdqa  xmm3, xmm2
.L9:
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        psubd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rbx
        jne     .L9
        lea     rdx, [rbp+18432]
        movdqa  xmm3, xmm2
.L10:
        movdqa  xmm1, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        pslld   xmm1, 3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L10
        lea     rdx, [rbp+20480]
        movdqa  xmm3, xmm2
.L11:
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        paddd   xmm1, xmm4
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L11
        lea     rax, [rbp+22528]
        movdqa  xmm3, xmm2
.L12:
        movdqa  xmm4, xmm3
        add     rdx, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 2
        paddd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rdx-16], xmm1
        cmp     rax, rdx
        jne     .L12
        lea     rdx, [rbp+24576]
        movdqa  xmm4, xmm2
.L13:
        movdqa  xmm3, xmm4
        add     rax, 16
        paddd   xmm4, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 1
        paddd   xmm1, xmm3
        pslld   xmm1, 2
        psubd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L13
        lea     rdx, [rbp+26624]
        movdqa  xmm3, xmm2
.L14:
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 1
        paddd   xmm1, xmm4
        pslld   xmm1, 2
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L14
        lea     rdx, [rbp+28672]
        movdqa  xmm4, xmm2
.L15:
        movdqa  xmm3, xmm4
        add     rax, 16
        paddd   xmm4, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 1
        paddd   xmm1, xmm3
        pslld   xmm1, 2
        paddd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rax, rdx
        jne     .L15
        lea     rdx, [rbp+30720]
        movdqa  xmm3, xmm2
.L16:
        movdqa  xmm4, xmm3
        add     rax, 16
        paddd   xmm3, xmm0
        movdqa  xmm1, xmm4
        pslld   xmm1, 3
        psubd   xmm1, xmm4
        pslld   xmm1, 1
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L16
        mov     rax, rdx
        lea     rdx, [rbp+32768]
.L17:
        movdqa  xmm3, xmm2
        add     rax, 16
        paddd   xmm2, xmm0
        movdqa  xmm1, xmm3
        pslld   xmm1, 4
        psubd   xmm1, xmm3
        movups  XMMWORD PTR [rax-16], xmm1
        cmp     rdx, rax
        jne     .L17
        mov     rdi, rbp
        mov     esi, 16384
        call    _ZdlPvm
        add     rsp, 8
        xor     eax, eax
        pop     rbx
        pop     rbp
        ret

So does the compiler only unroll the outer loop when it can unroll it completely?

GCC version: g++ (Compiler-Explorer-Build-gcc-b8ef019ab938471f7f877a1eee3a6374fd8a6ae9-binutils-2.36.1) 12.0.0 20211029 (experimental)

Option: -O2

godbolt: https://godbolt.org/z/zq7TWesY9

Cache
  • 45
  • 7
  • Documentation for this is somewhat skimpy, but it seems your unroll factor must be greater than or equal to an explicit loop count. – 500 - Internal Server Error Nov 15 '21 at 13:23
  • I think the unroll factor should be less than or equal to an explicit loop count. And the values of 0 and 1 will block any unrolling of the loop. If it is greater, the loop will be unrolled completely. – Cache Nov 15 '21 at 13:29

1 Answer

2

https://godbolt.org/z/PT6T1691W it seems that -O2 -funroll-loops does the trick, apparently that option needs to be on for the pragma to tell GCC how much to unroll. (Update: Or at least makes it have some effect. See comments, this doesn't seem to be a complete answer yet.)

(-funroll-loops is not on by default unless you use -fprofile-use, after doing a -fprofile-generate run and running the program with representative input. It used to be on by default at -O3 a long time ago, but code bloat I-cache pressure usually made that worse for loops that aren't hot. This leads to bass-ackwards situations where the loop where GCC spends most of its time is a few instructions long with SIMD, but the fully-unrolled scalar prologue / epilogue are 10x the number of instructions, especially with wider vectors. Even with AVX-512, GCC usually just uses scalar for odd numbers of elements, not creating a mask. :/)


Fully unrolling loops is something GCC will do even at -O2, at least for very small trip-counts. (e.g. up to 3 for an int array p[i] += 1;, with -O2 -fno-tree-vectorize). https://godbolt.org/z/P5rvjYj1b

Fully-unrolling larger loops or higher trip counts (when the static code size would increase from doing so, perhaps) is not on by default at -O2 it seems. (GCC calls this peeling a loop in their tuning options/parameters, i.e. peeling all the iterations out of the loop so it goes away. -fpeel-loops is on with -O3, but not -O2. Since GCC11, -fverbose-asm no longer prints a list of optimization options enabled as asm comments.)

And BTW, it seems auto-vectorization is on by default at -O2 now in GCC trunk. Previously it was only on at -O3, so that's interesting.

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
  • In your Godbolt, https://godbolt.org/z/PT6T1691W, it seems it only unrolls the inner loop 4 times in label `.L4`, and doesn't unroll the outer loop. Apparently the compiler doesn't follow our indication to unroll the loop. – Cache Nov 16 '21 at 08:25
  • @Cache: Hmm, I wrote this answer quickly without looking super carefully. There's some kind of interaction with SIMD auto-vectorization, which is effectively rolling a loop back up by doing 4 iterations (in this case) of the C source with one sequence of asm instructions. https://godbolt.org/z/eaavYvTeW with `-fno-tree-vectorize -funroll-loops` unrolls by 8. But that's the same unroll factor it chooses without the pragma. Only `#pragma GCC unroll 16` has any impact on the generated asm in that case; 2 and 4 don't reduce it, and 1 doesn't disable it. 16 goes crazy, perhaps doing 8 inner? – Peter Cordes Nov 16 '21 at 08:49
  • Thanks for your reply. So does the compiler seldom choose to unroll the outer loop? I haven't seen this situation except peeling the outer loop :) – Cache Nov 16 '21 at 08:57
  • Just like inline, we can only suggest that the compiler unroll rather than force it to unroll with a specified factor. – Cache Nov 16 '21 at 09:02
  • @Cache: IDK, I mean literally just unrolling an outer loop would mean doing 4 separate loops in sequence. But yeah I guess you'd still call it unrolling the outer loop when you actually do 4 or 8 outer iterations per inner loop. It does change the order of operations, unlike simple unrolls of the inner loop. In this case, that would create four or eight output streams at strides of 512 * sizeof(int) = 4k from each other (so aliasing in L1d is guaranteed). That many output streams would usually be a bad thing, and would probably take more registers, so I can see why GCC wouldn't. – Peter Cordes Nov 16 '21 at 09:10
  • I agree with you that the second situation can cause strided memory references when the inner loop trip count is large. And the first situation, which does 4 separate loops in sequence, doesn't seem to make sense either. – Cache Nov 16 '21 at 09:31