I've been experimenting with the following and have noticed that the branchless “if” defined here (now with &-!!
replacing *!!
) can speed up certain bottleneck code by as much as (almost) 2x on 64-bit Intel targets with clang:
// Produces x if f is true, else 0 if f is false.
#define BRANCHLESS_IF(f,x) ((x) & -((typeof(x))!!(f)))
// Produces x if f is true, else y if f is false.
#define BRANCHLESS_IF_ELSE(f,x,y) (((x) & -((typeof(x))!!(f))) | \
((y) & -((typeof(y)) !(f))))
Note that f
should be a reasonably simple expression with no side-effects, so that the compiler is able to do its best optimizations.
Performance is highly dependent on CPU and compiler. The branchless ‘if’ performance is excellent with clang; I haven't found any cases yet where the branchless ‘if/else’ is faster, though.
My question is: are these safe and portable as written (meaning guaranteed to give correct results on all targets), and can they be made faster?
Example usage of branchless if/else
These compute 64-bit minimum and maximum.
inline uint64_t uint64_min(uint64_t a, uint64_t b)
{
return BRANCHLESS_IF_ELSE((a <= b), a, b);
}
inline uint64_t uint64_max(uint64_t a, uint64_t b)
{
return BRANCHLESS_IF_ELSE((a >= b), a, b);
}
Example usage of branchless if
This is 64-bit modular addition — it computes (a + b) % n
. The branching version (not shown) suffers terribly from branch prediction failures, but the branchless version is very fast (at least with clang).
inline uint64_t uint64_add_mod(uint64_t a, uint64_t b, uint64_t n)
{
assert(n > 1); assert(a < n); assert(b < n);
uint64_t c = a + b - BRANCHLESS_IF((a >= n - b), n);
assert(c < n);
return c;
}
Update: Full concrete working example of branchless if
Below is a full working C11 program that demonstrates the speed difference between branching and a branchless versions of a simple if
conditional, if you would like to try it on your system. The program computes modular exponentiation, that is (a ** b) % n
, for extremely large values.
To compile, use the following on the command line:
-O3
(or whatever high optimization level you prefer)-DNDEBUG
(to disable assertions, for speed)- Either
-DBRANCHLESS=0
or-DBRANCHLESS=1
to specify branching or branchless behavior, respectively
On my system, here's what happens:
$ cc -DBRANCHLESS=0 -DNDEBUG -O3 -o powmod powmod.c && ./powmod
BRANCHLESS = 0
CPU time: 21.83 seconds
foo = 10585369126512366091
$ cc -DBRANCHLESS=1 -DNDEBUG -O3 -o powmod powmod.c && ./powmod
BRANCHLESS = 1
CPU time: 11.76 seconds
foo = 10585369126512366091
$ cc --version
Apple LLVM version 6.0 (clang-600.0.57) (based on LLVM 3.5svn)
Target: x86_64-apple-darwin14.1.0
Thread model: posix
So, the branchless version is almost twice as fast as the branching version on my system (3.4 GHz. Intel Core i7).
// SPEED TEST OF MODULAR MULTIPLICATION WITH BRANCHLESS CONDITIONALS
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <time.h>
#include <assert.h>
typedef uint64_t uint64;
//------------------------------------------------------------------------------
#if BRANCHLESS
// Actually branchless.
#define BRANCHLESS_IF(f,x) ((x) & -((typeof(x))!!(f)))
#define BRANCHLESS_IF_ELSE(f,x,y) (((x) & -((typeof(x))!!(f))) | \
((y) & -((typeof(y)) !(f))))
#else
// Not actually branchless, but used for comparison.
#define BRANCHLESS_IF(f,x) ((f)? (x) : 0)
#define BRANCHLESS_IF_ELSE(f,x,y) ((f)? (x) : (y))
#endif
//------------------------------------------------------------------------------
// 64-bit modular multiplication. Computes (a * b) % n without division.
static uint64 uint64_mul_mod(uint64 a, uint64 b, const uint64 n)
{
assert(n > 1); assert(a < n); assert(b < n);
if (a < b) { uint64 t = a; a = b; b = t; } // Ensure that b <= a.
uint64 c = 0;
for (; b != 0; b /= 2)
{
// This computes c = (c + a) % n if (b & 1).
c += BRANCHLESS_IF((b & 1), a - BRANCHLESS_IF((c >= n - a), n));
assert(c < n);
// This computes a = (a + a) % n.
a += a - BRANCHLESS_IF((a >= n - a), n);
assert(a < n);
}
assert(c < n);
return c;
}
//------------------------------------------------------------------------------
// 64-bit modular exponentiation. Computes (a ** b) % n using modular
// multiplication.
static
uint64 uint64_pow_mod(uint64 a, uint64 b, const uint64 n)
{
assert(n > 1); assert(a < n);
uint64 c = 1;
for (; b > 0; b /= 2)
{
if (b & 1)
c = uint64_mul_mod(c, a, n);
a = uint64_mul_mod(a, a, n);
}
assert(c < n);
return c;
}
//------------------------------------------------------------------------------
int main(const int argc, const char *const argv[const])
{
printf("BRANCHLESS = %d\n", BRANCHLESS);
clock_t clock_start = clock();
#define SHOW_RESULTS 0
uint64 foo = 0; // Used in forcing compiler not to throw away results.
uint64 n = 3, a = 1, b = 1;
const uint64 iterations = 1000000;
for (uint64 iteration = 0; iteration < iterations; iteration++)
{
uint64 c = uint64_pow_mod(a%n, b, n);
if (SHOW_RESULTS)
{
printf("(%"PRIu64" ** %"PRIu64") %% %"PRIu64" = %"PRIu64"\n",
a%n, b, n, c);
}
else
{
foo ^= c;
}
n = n * 3 + 1;
a = a * 5 + 3;
b = b * 7 + 5;
}
clock_t clock_end = clock();
double elapsed = (double)(clock_end - clock_start) / CLOCKS_PER_SEC;
printf("CPU time: %.2f seconds\n", elapsed);
printf("foo = %"PRIu64"\n", foo);
return 0;
}
Second update: Intel vs. ARM performance
- Testing on 32-bit ARM targets (iPhone 3GS/4S, iPad 1/2/3/4, as compiled by Xcode 6.1 with clang) reveals that the branchless “if” here is actually about 2–3 times slower than ternary
?:
for the modular exponentiation code in those cases. So it seems that these branchless macros are not a good idea if maximum speed is needed, although they might be useful in rare cases where constant speed is needed. - On 64-bit ARM targets (iPhone 6+, iPad 5), the branchless “if” runs the same speed as ternary
?:
— again as compiled by Xcode 6.1 with clang. - For both Intel and ARM (as compiled by clang), the branchless “if/else” was about twice as slow as ternary
?:
for computing min/max.