How GCC allocates memory for a variable-length array

Question

I have a question about how GCC allocates memory for a variable-length array. Consider the C code below.

/*
 * Build a variable-length array of n pointers and return the long that
 * element idx points to.
 *
 * idx: index into p; it is not range-checked here.
 * n:   number of elements in the variable-length array p.
 * q:   pointer stored in every element from index 1 up to n - 1.
 *
 * p[0] points at the local i, which also serves as the loop counter, so
 * idx == 0 yields whatever value i holds once the loop has finished.
 */
long vframe(long idx, long n, long *q) {
        long i = 1;
        long *p[n];             /* VLA: its size is only known at run time */
        p[0] = & i;             /* element 0 aliases the loop counter below */
        for (i = 1; i < n; i++) {
                p[i] = q;       /* every remaining element points at q's target */
        }
        return *p[idx];
}

With gcc and objdump I can get its assembly code, like below:

0000000000001139 <vframe>:
    1139:   55                      push   %rbp
    113a:   48 89 e5                mov    %rsp,%rbp
    113d:   53                      push   %rbx
    113e:   48 83 ec 48             sub    $0x48,%rsp
    1142:   48 89 7d c8             mov    %rdi,-0x38(%rbp)
    1146:   48 89 75 c0             mov    %rsi,-0x40(%rbp)
    114a:   48 89 55 b8             mov    %rdx,-0x48(%rbp)
    114e:   48 89 e0                mov    %rsp,%rax
    1151:   48 89 c6                mov    %rax,%rsi
    1154:   48 c7 45 d8 01 00 00    movq   $0x1,-0x28(%rbp)
    115b:   00 
    115c:   48 8b 45 c0             mov    -0x40(%rbp),%rax
    1160:   48 8d 50 ff             lea    -0x1(%rax),%rdx
    1164:   48 89 55 e8             mov    %rdx,-0x18(%rbp)
    1168:   48 89 c2                mov    %rax,%rdx
    116b:   49 89 d0                mov    %rdx,%r8
    116e:   41 b9 00 00 00 00       mov    $0x0,%r9d
    1174:   48 89 c2                mov    %rax,%rdx
    1177:   48 89 d1                mov    %rdx,%rcx
    117a:   bb 00 00 00 00          mov    $0x0,%ebx
    117f:   48 8d 14 c5 00 00 00    lea    0x0(,%rax,8),%rdx
    1186:   00 
    1187:   b8 10 00 00 00          mov    $0x10,%eax
    118c:   48 83 e8 01             sub    $0x1,%rax
    1190:   48 01 d0                add    %rdx,%rax
    1193:   bb 10 00 00 00          mov    $0x10,%ebx
    1198:   ba 00 00 00 00          mov    $0x0,%edx
    119d:   48 f7 f3                div    %rbx
    11a0:   48 6b c0 10             imul   $0x10,%rax,%rax
    11a4:   48 29 c4                sub    %rax,%rsp
    11a7:   48 89 e0                mov    %rsp,%rax
    11aa:   48 83 c0 07             add    $0x7,%rax
    11ae:   48 c1 e8 03             shr    $0x3,%rax
    11b2:   48 c1 e0 03             shl    $0x3,%rax
    11b6:   48 89 45 e0             mov    %rax,-0x20(%rbp)
    11ba:   48 8b 45 e0             mov    -0x20(%rbp),%rax
    11be:   48 8d 55 d8             lea    -0x28(%rbp),%rdx
    11c2:   48 89 10                mov    %rdx,(%rax)
    11c5:   48 c7 45 d8 01 00 00    movq   $0x1,-0x28(%rbp)
    11cc:   00 
    11cd:   eb 1c                   jmp    11eb <vframe+0xb2>
    11cf:   48 8b 55 d8             mov    -0x28(%rbp),%rdx
    11d3:   48 8b 45 e0             mov    -0x20(%rbp),%rax
    11d7:   48 8b 4d b8             mov    -0x48(%rbp),%rcx
    11db:   48 89 0c d0             mov    %rcx,(%rax,%rdx,8)
    11df:   48 8b 45 d8             mov    -0x28(%rbp),%rax
    11e3:   48 83 c0 01             add    $0x1,%rax
    11e7:   48 89 45 d8             mov    %rax,-0x28(%rbp)
    11eb:   48 8b 45 d8             mov    -0x28(%rbp),%rax
    11ef:   48 39 45 c0             cmp    %rax,-0x40(%rbp)
    11f3:   7f da                   jg     11cf <vframe+0x96>
    11f5:   48 8b 45 e0             mov    -0x20(%rbp),%rax
    11f9:   48 8b 55 c8             mov    -0x38(%rbp),%rdx
    11fd:   48 8b 04 d0             mov    (%rax,%rdx,8),%rax
    1201:   48 8b 00                mov    (%rax),%rax
    1204:   48 89 f4                mov    %rsi,%rsp
    1207:   48 8b 5d f8             mov    -0x8(%rbp),%rbx
    120b:   c9                      leave  
    120c:   c3                      ret

Here is the part I do not understand: why do we still need to clear the lower 3 bits of %rax, even though %rax was just copied from %rsp after a multiple of 16 was subtracted from it? Does that mean %rsp may hold a value which is not a multiple of 8? If so, in which case does that happen?

    11a4:   48 29 c4                sub    %rax,%rsp
    11a7:   48 89 e0                mov    %rsp,%rax
    11aa:   48 83 c0 07             add    $0x7,%rax
    11ae:   48 c1 e8 03             shr    $0x3,%rax
    11b2:   48 c1 e0 03             shl    $0x3,%rax

You compiled with optimization disabled, so of course it's inefficient. (Hilariously so, even using `div` and `imul` instead of `and $-16, %reg`.) Presumably the GCC logic that maintains stack alignment is separate from the part that makes sure the actual array allocation itself is aligned by 16 (as x86-64 SysV says all local/global arrays must be, unless they're constant-size and smaller than 16B). e.g. in case you compiled with `-mpreferred-stack-boundary=2` or something might be the reason, or it might just be redundant stuff not optimized away. — Peter Cordes, Nov 04 '21 at 10:01
You told the compiler to turn off its brain and now you expect every instruction it generates to be meaningful? Compile with optimisation enabled and you'll get a much better result. — fuz, Nov 04 '21 at 10:09
After fixing the return type and enabling optimizations, I was actually kind of surprised that none of the compilers was able to see through this whole charade and compile `return (idx == 0) ? 1 : *q;` — Nate Eldredge, Nov 04 '21 at 14:23
@Nate Eldredge a typo, i compile it with `long vframe(long idx, long n, long *q)` — marcocr xu, Nov 05 '21 at 06:45

How GCC allocates memory for a variable-length array

0 Answers0