I am attempting to use a 64-bits integral as a bitmap, and acquire/release ownership of individual bits, atomically.
To this end, I have written the following lock-less code:
#include <cstdint>
#include <atomic>
static constexpr std::uint64_t NO_INDEX = ~std::uint64_t(0);
class AtomicBitMap {
public:
static constexpr std::uint64_t occupied() noexcept {
return ~std::uint64_t(0);
}
std::uint64_t acquire() noexcept {
while (true) {
auto map = mData.load(std::memory_order_relaxed);
if (map == occupied()) {
return NO_INDEX;
}
std::uint64_t index = __builtin_ctzl(~map);
auto previous =
mData.fetch_or(bit(index), std::memory_order_relaxed);
if ((previous & bit(index)) == 0) {
return index;
}
}
}
private:
static constexpr std::uint64_t bit(std::uint64_t index) noexcept {
return std::uint64_t(1) << index;
}
std::atomic_uint64_t mData{ 0 };
};
int main() {
AtomicBitMap map;
return map.acquire();
}
Which, on godbolt, yields the following assembly in isolation:
main:
mov QWORD PTR [rsp-8], 0
jmp .L3
.L10:
not rax
rep bsf rax, rax
mov edx, eax
mov eax, eax
lock bts QWORD PTR [rsp-8], rax
jnc .L9
.L3:
mov rax, QWORD PTR [rsp-8]
cmp rax, -1
jne .L10
ret
.L9:
movsx rax, edx
ret
Which is exactly what I expected1.
@Jester has heroically managed to reduce my 97 lines reproducer to a much simpler 44 lines reproducer which I further reduced to 35 lines:
using u64 = unsigned long long;
struct Bucket {
u64 mLeaves[16] = {};
};
struct BucketMap {
u64 acquire() noexcept {
while (true) {
u64 map = mData;
u64 index = (map & 1) ? 1 : 0;
auto mask = u64(1) << index;
auto previous =
__atomic_fetch_or(&mData, mask, __ATOMIC_SEQ_CST);
if ((previous & mask) == 0) {
return index;
}
}
}
__attribute__((noinline)) Bucket acquireBucket() noexcept {
acquire();
return Bucket();
}
volatile u64 mData = 1;
};
int main() {
BucketMap map;
map.acquireBucket();
return 0;
}
Which generates the following assembly:
BucketMap::acquireBucket():
mov r8, rdi
mov rdx, rsi
.L2:
mov rax, QWORD PTR [rsi]
xor eax, eax
lock bts QWORD PTR [rdx], rax
setc al
jc .L2
mov rdi, r8
mov ecx, 16
rep stosq
mov rax, r8
ret
main:
sub rsp, 152
lea rsi, [rsp+8]
lea rdi, [rsp+16]
mov QWORD PTR [rsp+8], 1
call BucketMap::acquireBucket()
xor eax, eax
add rsp, 152
ret
The xor eax,eax
means that the assembly here always attempts to obtain index 0... resulting in an infinite loop.
I can only see two explanations for this assembly:
- I have somehow triggered Undefined Behavior.
- There is a code-generation bug in gcc.
And I have exhausted all my ideas as to what could trigger UB.
Can anyone explain why gcc would generate this xor eax,eax
?
Note: tentatively reported to gcc as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86314.
Compiler version used:
$ gcc --version
gcc (GCC) 7.3.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is
NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE.
Compiler flags:
-Wall -Wextra -Werror -Wduplicated-cond -Wnon-virtual-dtor -Wvla
-rdynamic -Wno-deprecated-declarations -Wno-type-limits
-Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unused-value
-Wno-aligned-new -Wno-implicit-fallthrough -Wno-deprecated
-Wno-noexcept-type -Wno-register -ggdb -fno-strict-aliasing
-std=c++17 -Wl,--no-undefined -Wno-sign-compare
-g -O3 -mpopcnt
1 Actually, it's better than I expected, the compiler understanding that the fetch_or(bit(index))
followed by previous & bit(index)
is the equivalent of using bts
and checking the CF flag is pure gold.