0

In the code below I saw that clang fails to perform better optimisation without implicit restrict pointer specifier:

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

typedef struct {
    uint32_t        event_type;
    uintptr_t       param;
} event_t;

typedef struct
{
    event_t                     *queue;
    size_t                      size;
    uint16_t                    num_of_items;
    uint8_t                     rd_idx;
    uint8_t                     wr_idx;
} queue_t;

static bool queue_is_full(const queue_t *const queue_ptr)
{
    return queue_ptr->num_of_items == queue_ptr->size;
}

static size_t queue_get_size_mask(const queue_t *const queue_ptr)
{
    return queue_ptr->size - 1;
}

int queue_enqueue(queue_t *const queue_ptr, const event_t *const event_ptr)
{
    if(queue_is_full(queue_ptr))
    {
        return 1;
    }

    queue_ptr->queue[queue_ptr->wr_idx++] = *event_ptr;
    queue_ptr->num_of_items++;
    queue_ptr->wr_idx &= queue_get_size_mask(queue_ptr);

    return 0;
}

I compiled this code with clang version 11.0.0 (clang-1100.0.32.5)

clang -O2 -arch armv7m -S test.c -o test.s

In the disassembled file I saw that the generated code re-reads the memory:

_queue_enqueue:
        .cfi_startproc
@ %bb.0:
        ldrh    r2, [r0, #8]            ---> reads the queue_ptr->num_of_items
        ldr     r3, [r0, #4]            ---> reads the queue_ptr->size
        cmp     r3, r2
        itt     eq
        moveq   r0, #1
        bxeq    lr
        ldrb    r2, [r0, #11]           ---> reads the queue_ptr->wr_idx
        adds    r3, r2, #1
        strb    r3, [r0, #11]           ---> stores the queue_ptr->wr_idx + 1
        ldr.w   r12, [r1]
        ldr     r3, [r0]
        ldr     r1, [r1, #4]
        str.w   r12, [r3, r2, lsl #3]
        add.w   r2, r3, r2, lsl #3
        str     r1, [r2, #4]
        ldrh    r1, [r0, #8]            ---> !!! re-reads the queue_ptr->num_of_items
        adds    r1, #1
        strh    r1, [r0, #8]
        ldrb    r1, [r0, #4]            ---> !!! re-reads the queue_ptr->size (only the first byte)
        ldrb    r2, [r0, #11]           ---> !!! re-reads the queue_ptr->wr_idx
        subs    r1, #1
        ands    r1, r2
        strb    r1, [r0, #11]           ---> !!! stores the updated queue_ptr->wr_idx once again after applying the mask
        movs    r0, #0
        bx      lr
        .cfi_endproc
                                        @ -- End function

After adding the restrict keyword to the pointers, these unneeded re-reads just vanished:

int queue_enqueue(queue_t * restrict const queue_ptr, const event_t * restrict const event_ptr)

I know that in clang, by default strict aliasing is disabled. But in this case, event_ptr pointer is defined as const so its object's content cannot be modified by this pointer, thus it cannot affect the content to which queue_ptr points (assuming the case when the objects overlap in the memory), right?

So is this a compiler optimisation bug or there is indeed some weird case when the object pointed by queue_ptr can be affected by event_ptr assuming this declaration:

int queue_enqueue(queue_t *const queue_ptr, const event_t *const event_ptr)

By the way, I tried to compile the same code for x86 target and inspected similar optimisation issue.


The generated assembly with the restrict keyword, doesn't contain the re-reads:

_queue_enqueue:
        .cfi_startproc
@ %bb.0:
        ldr     r3, [r0, #4]
        ldrh    r2, [r0, #8]
        cmp     r3, r2
        itt     eq
        moveq   r0, #1
        bxeq    lr
        push    {r4, r6, r7, lr}
        .cfi_def_cfa_offset 16
        .cfi_offset lr, -4
        .cfi_offset r7, -8
        .cfi_offset r6, -12
        .cfi_offset r4, -16
        add     r7, sp, #8
        .cfi_def_cfa r7, 8
        ldr.w   r12, [r1]
        ldr.w   lr, [r1, #4]
        ldrb    r1, [r0, #11]
        ldr     r4, [r0]
        subs    r3, #1
        str.w   r12, [r4, r1, lsl #3]
        add.w   r4, r4, r1, lsl #3
        adds    r1, #1
        ands    r1, r3
        str.w   lr, [r4, #4]
        strb    r1, [r0, #11]
        adds    r1, r2, #1
        strh    r1, [r0, #8]
        movs    r0, #0
        pop     {r4, r6, r7, pc}
        .cfi_endproc
                                        @ -- End function

Addition:

After some discussion with Lundin in the comments to his answer, I got the impression that the re-reads could be caused because the compiler would assume that queue_ptr->queue might potentially point to *queue_ptr itself. So I changed the queue_t struct to contain array instead of the pointer:

typedef struct
{
    event_t                     queue[256]; // changed from pointer to array with max size
    size_t                      size;
    uint16_t                    num_of_items;
    uint8_t                     rd_idx;
    uint8_t                     wr_idx;
} queue_t;

However the re-reads remained as previously. I still can't understand what could make the compiler think that queue_t fields may be modified and thus require re-reads... The following declaration eliminates the re-reads:

int queue_enqueue(queue_t * restrict const queue_ptr, const event_t *const event_ptr)

But why queue_ptr has to be declared as restrict pointer to prevent the re-reads I don't understand (unless it is a compiler optimization "bug").

P.S.

I also couldn't find any link to file/report an issue on clang that doesn't cause the compiler to crash...

Alex Lop.
  • 6,810
  • 1
  • 26
  • 45
  • 3
    `const` doesn't mean the value can't change; it means the value can't be changed using the `const` identifier. `int foo; int *a = &foo; int const *b = &foo; *a = 42 /*ok*/; *b = -1 /*nope*/;` – pmg Oct 16 '19 at 08:20
  • @pmg I meant that it cannot be changed by the `const event_t *` pointer. I'll add this clarification – Alex Lop. Oct 16 '19 at 08:35
  • @StaceyGirl Why did you remove your answer? – Alex Lop. Oct 25 '19 at 08:16
  • @AlexLop. It wasn't the register pressure. This is visible from assembly for other architectures. I think you should examine the IR to figure out what is going on since it has explicit aliasing annotations (`!tbaa X`). I checked it and saw that Clang does event copying without TBAA annotation which can explain the value flush, but at the same time I saw different compiler configurations generate might `llvm.memcpy` calls WITH `!tbaa` metadata. I might try to check this later, can't write an answer from that. –  Oct 25 '19 at 17:40
  • @StaceyGirl Thanks! I appreciate it. Will wait for your input. Also note that it is possible to reproduce similar behavior on x86 architecture and it looks weird... https://godbolt.org/z/5OVBEy – Alex Lop. Oct 25 '19 at 19:34

4 Answers4

2

The event_t member of queue_ptr could point at the same memory as event_ptr. Compilers tend to produce less efficient code when they can't rule out that two pointers point at the same memory. So there's nothing strange with restrict leading to better code.

Const qualifiers don't really matter, because these were added by the function and the original type could be modifiable elsewhere. In particular, the * const doesn't add anything because the pointer is already a local copy of the original pointer, so nobody including the caller cares if the function modifies that local copy or not.

"Strict aliasing" rather refers to the cases where the compiler can cut corners like when assuming that a uint16_t* can't point at a uint8_t* etc. But in your case you have two completely compatible types, one of them is merely wrapped in an outer struct.

Lundin
  • 195,001
  • 40
  • 254
  • 396
  • But even if they point to the same location, the only action here is copy dword by dword, so how does it "confuse" the compiler? – Alex Lop. Oct 16 '19 at 09:00
  • @AlexLop. Because you can't usually copy an object to the address where it is already stored, or copy objects that overlap. – Lundin Oct 16 '19 at 09:08
  • ok, I can understand that but I still don't get how it may impact the fields of completely different struct like `queue_t`? Why re-reading those fields? – Alex Lop. Oct 16 '19 at 09:27
  • "Const qualifiers don't really matter, because these were added by the function and the original type could be modifiable elsewhere." - This is true for many other cases even when a function gets a single pointer. But still the produced assembly doesn't always read, modify and immediately write back the value to the memory (unless it is `volatile`) – Alex Lop. Oct 16 '19 at 09:52
  • @AlexLop. The `*const` qualifier is coding style for the programmer, not to compiler. The compiler should be able to deduct if a pointer variable is modified or not inside the function, regardless of qualifiers. – Lundin Oct 16 '19 at 10:35
  • I meant `const event_t *`, not `const *` – Alex Lop. Oct 16 '19 at 10:41
  • @AlexLop. At the line `queue_ptr->queue[queue_ptr->wr_idx++] = *event_ptr;` the compiler can't know if `queue_ptr->queue` is the same address as `event_ptr`. The const qualifiers are irrelevant. – Lundin Oct 16 '19 at 10:48
  • And you want to say that this causes the compiler to reread everything after that point? Even it is irrelevant to `queue_ptr->queue` and `event_ptr`, like it is kind of a barrier? – Alex Lop. Oct 16 '19 at 10:52
  • @AlexLop. Since the compiler can't assume that it didn't change the original data, it has to re-read it. I can't reproduce your specific system but on x86 clang the `restict` made the difference that all variables could be kept in registers during the loop. – Lundin Oct 16 '19 at 11:02
  • Right, I know that `restrict` fixes it. However, in what scenario assigning `event_t` to `queue_ptr->queue[...]` could affect the original values? In case `queue_ptr->queue` points to some location inside `*queue_ptr`? If yes, the *clang* is too protective because the same code without `restrict` compiled with *gcc* doesn't perform the re-reads... – Alex Lop. Oct 16 '19 at 11:24
  • @AlexLop. It can't assume that writing to `queue_ptr->queue` won't change the contents of `event_ptr`. But yeah well it does seem like clang assumes that other unrelated members in the queue struct could be changed too because of writes to `queue_ptr->queue`. That seems a bit strange indeed. – Lundin Oct 16 '19 at 11:30
  • Ohhh, now I see what you meant... I was talking about the fields of `queue_t` because they were re-read after the assignment. I agree that the compiler may assume that the content of `*event_ptr` could be changed but it rereads the fields of `*queue_ptr`, that's why I was confused... – Alex Lop. Oct 16 '19 at 11:42
2

[talking about the original program]

This is caused by deficiency in the TBAA metadata, generated by Clang.

If you emit LLVM IR with -S -emit-llvm you'll see (snipped for brevity):

...

  %9 = load i8, i8* %wr_idx, align 1, !tbaa !12
  %10 = trunc i32 %8 to i8
  %11 = add i8 %10, -1
  %conv4 = and i8 %11, %9
  store i8 %conv4, i8* %wr_idx, align 1, !tbaa !12
  br label %return

...

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"min_enum_size", i32 4}
!2 = !{!"clang version 10.0.0 (/home/chill/src/llvm-project 07da145039e1a6a688fb2ac2035b7c062cc9d47d)"}
!3 = !{!4, !9, i64 8}
!4 = !{!"queue", !5, i64 0, !8, i64 4, !9, i64 8, !6, i64 10, !6, i64 11}
!5 = !{!"any pointer", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C/C++ TBAA"}
!8 = !{!"int", !6, i64 0}
!9 = !{!"short", !6, i64 0}
!10 = !{!4, !8, i64 4}
!11 = !{!4, !5, i64 0}
!12 = !{!4, !6, i64 11}

See the TBAA metadata !4: this is the type descriptor for queue_t (btw, I added names to structs, e.g. typedef struct queue ...) you may see empty string there). Each element in the description corresponds to struct fields and look at !5, which is the event_t *queue field: it's "any pointer"! At this point we've lost all the information about the actual type of the pointer, which tells me the compiler would assume writes through this pointer can modify any memory object.

That said, there's a new form for TBAA metadata, which is more precise (still has deficiencies, but for that later ...)

Compile the original program with -Xclang -new-struct-path-tbaa. My exact command line was (and I've included stddef.h instead of stdlib.h since that development build with no libc):

./bin/clang -I lib/clang/10.0.0/include/ -target armv7m-eabi -O2 -Xclang -new-struct-path-tbaa  -S queue.c

The resulting assembly is (again, some fluff snipped):

queue_enqueue:
    push    {r4, r5, r6, r7, lr}
    add r7, sp, #12
    str r11, [sp, #-4]!
    ldrh    r3, [r0, #8]
    ldr.w   r12, [r0, #4]
    cmp r12, r3
    bne .LBB0_2
    movs    r0, #1
    ldr r11, [sp], #4
    pop {r4, r5, r6, r7, pc}
.LBB0_2:
    ldrb    r2, [r0, #11]                   ; load `wr_idx`
    ldr.w   lr, [r0]                        ; load `queue` member
    ldrd    r6, r1, [r1]                    ; load data pointed to by `event_ptr`
    add.w   r5, lr, r2, lsl #3              ; compute address to store the event
    str r1, [r5, #4]                        ; store `param`
    adds    r1, r3, #1                      ; increment `num_of_items`
    adds    r4, r2, #1                      ; increment `wr_idx`
    str.w   r6, [lr, r2, lsl #3]            ; store `event_type`
    strh    r1, [r0, #8]                    ; store new value for `num_of_items`
    sub.w   r1, r12, #1                     ; compute size mask
    ands    r1, r4                          ; bitwise and size mask with `wr_idx`
    strb    r1, [r0, #11]                   ; store new value for `wr_idx`
    movs    r0, #0
    ldr r11, [sp], #4
    pop {r4, r5, r6, r7, pc}

Looks good, isn't it! :D

I mentioned earlier there are deficiencies with the "new struct path", but for that: on the mailing list.

PS. I'm afraid there's no general lesson to be learned in this case. In principle, the more information one is able to give the compiler - the better: things like restrict, strong typing (not gratuitous casts, type punning, etc), relevant function and variable attributes ... but not in this case, the original program already contained all the necessary information. It is just a compiler deficiency and the best way to tackle those is to raise awareness: ask on the mailing list and/or file bug reports.

chill
  • 16,470
  • 2
  • 40
  • 44
  • Thanks! This looks much more satisfying answer that everything that was posted here previously. So why not always use `-Xclang -new-struct-path-tbaa` option? What are the advantages/disadvantages of it (or perhaps it should be a separate question here...)? – Alex Lop. Oct 26 '19 at 21:08
  • @AlexLop. Don't forget to accept the answer if it answers the question (I think it does). Otherwise with lower number of upvotes on answers the bounty will expire and won't be granted to anyone. –  Oct 29 '19 at 08:23
  • @StaceyGirl Thanks Stacy, I agree this answers my question... not fully thought. I still don't see what I can learn from this for my programming/optimisations skills so that I could implement the most optimised code for infrastructure layers. Unless it is indeed can be categorised as a compiler "issue" which will be fixed in future releases and not related to my coding habits. – Alex Lop. Oct 29 '19 at 08:52
  • @chill could you please refer (or just mention) in your answer the following topics, so I could mark your answer as "accepted" : 1. How to prevent such re-reads in the future 2. What could be the downside of the flag you used to eliminate those re-reads. – Alex Lop. Oct 29 '19 at 09:01
  • @AlexLop. I'm afraid there's no general lesson to be learned in this case. In principle, the more information one is able to give the compiler - the better: things like `restrict`, strong typing (not gratuitous casts, type punning, etc), relevant function and variable attributes ... but not in this case, the original program already contained all the necessary information. It is just a compiler deficiency and the best way to tackle those is to raise awareness: ask on the mailing list and/or file bug reports. – chill Oct 29 '19 at 09:20
  • Thanks @chill. Just add what you wrote in the latest comment to the answer itself. I'll accept it and switch to the email thread for more details. – Alex Lop. Oct 29 '19 at 09:26
  • @AlexLop. I think there is something to learn here: load early and store later. You should prefer loading all required member fields at the start and store them at the end. In you case you should have load `wr_ptr`, `num_of_items` in the beginning, modify them, them store them in the end. Compiler has much more guarantees about local variables (unless their address is taken). This is something that helps even when compiler would have to reload values. –  Oct 29 '19 at 09:38
  • @StaceyGirl right but that would look like an assembly C. `local_a = global_b; local_a++; global_b = local_a` instead of just `global_b++;`... – Alex Lop. Oct 29 '19 at 10:04
1

As far as I can tell, yes, in your code queue_ptr pointee's contents cannot be modified. Is is it an optimization bug? It's a missed optimization opportunity, but I wouldn't call it a bug. It doesn't "misunderstand" const, it just doesn't have/doesn't do the necessary analyses to determine it cannot be modified for this specific scenario.

As a side note: queue_is_full(queue_ptr) can modify the contents of *queue_ptr even if it has const queue_t *const param because it can legally cast away the constness since the original object is not const. That being said, the definition of quueue_is_full is visible and available to the compiler so it can ascertain it indeed does not.

bolov
  • 72,283
  • 15
  • 145
  • 224
  • "That being said, the definition of quueue_is_full is visible and available to the compiler so it can ascertain it indeed does not." - not only that it is *just* visible, clang also inlines its content. – Alex Lop. Oct 16 '19 at 07:39
  • 4 out of 25 assembly instructions are unneeded, also ldr/str instruction can take ~3 cycles on arm cortex-m3, compared to regular mov/add/and which take 1 cycle. Taking into consideration that `-O2` is provided, I wouldn't call it just "missed opportunity" – Alex Lop. Oct 16 '19 at 07:44
1

As you know, your code appears to modify the data, invalidating the const state:

queue_ptr->num_of_items++; // this stores data in the memory

Without the restrict keyword, the compiler must assume that the two types might share the same memory space.

This is required in the updated example because event_t is a member of queue_t and strict aliasing applies on:

... an aggregate or union type that includes one of the aforementioned types among its members (including, recursively, a member of a subaggregate or contained union), or...

On the original example there are a number of reasons the types might be considered to alias, leading to the same result (i.e., the use of a char pointer and the fact that types might be considered compatible "enough" on some architectures if not all).

Hence, the compiler is required to reload the memory after it was mutated to avoid possible conflicts.

The const keyword doesn't really enter into this, because a mutation happens through a pointer that might point to the same memory address.

(EDIT) For your convenience, here's the full rule regarding access to a variable:

An object shall have its stored value accessed only by an lvalue expression that has one of the following types (88):

— a type compatible with the effective type of the object,

— a qualified version of a type compatible with the effective type of the object,

— a type that is the signed or unsigned type corresponding to the effective type of the object,

— a type that is the signed or unsigned type corresponding to a qualified version of the effective type of the object,

— an aggregate or union type that includes one of the aforementioned types among its members (including, recursively, a member of a subaggregate or contained union), or

— a character type.

(88) The intent of this list is to specify those circumstances in which an object may or may not be aliased.

P.S.

The _t suffix is reserved by POSIX. You might consider using a different suffix.

It's common practice to use _s for structs and _u for unions.

Myst
  • 18,516
  • 2
  • 45
  • 67
  • But `event_t` is not a member of `queue_t` (in the original example). –  Oct 24 '19 at 22:50
  • @StaceyGirl - true, but there are other reasons that cause this. I don't want to "lawyer" this, so I attached the list to the answer... One might argue that the `event_t` type is compatible enough to be considered for type punning (the `uint32_t` is padded, leaving the first two members aligned)... Others might note that any `char` pointer (or `uint8_t *`) is always considered a possible memory alias (in our case, `queue_ptr->wr_idx` points to an `unsigned char`, raising an "aliasing" flag)... who knows. I'm not really getting into "why they might alias" as much as into the effects. – Myst Oct 24 '19 at 23:22
  • I don't think this is the case here... for example take a look at this reproduction of the issue with various cases: https://godbolt.org/z/5OVBEy - `f5` has no re-read while `f6` does – Alex Lop. Oct 25 '19 at 08:15
  • @AlexLop. the examples you provided might not be relevant because the `char *` in `f5` isn't mutated (only the `int * ` is mutated and it doesn't suffer from potential aliasing), while `f6` has an obvious strict aliasing concern. – Myst Oct 26 '19 at 00:37