I am restricting my comments to integer division, because to first order the modulo operation in C++ can be viewed and implemented as an integer division plus back-multiply and subtraction, although in some cases there are cheaper ways of computing the modulo directly, e.g. when computing modulo $2^n$.
Integer division is pretty slow on most platforms, based on either software emulation or an iterative hardware implementation. But it was widely reported last year that, based on microbenchmarking, Apple's M1 has a blazingly fast integer division, presumably by using dedicated circuitry.
Ever since a seminal paper by Torbjörn Granlund and Peter Montgomery almost thirty years ago it has been widely known how to replace integer divisions with constant divisors by using an integer multiply plus possibly a shift and / or other correction steps. This algorithm is often referred to as the magic-multiplier technique. It requires precomputation of some relevant parameters from the integer divisor for use in the multiply-based emulation sequence.
Torbjörn Granlund and Peter L. Montgomery, "Division by invariant integers using multiplication," ACM SIGPLAN Notices, Vol. 29, June 1994, pp. 61-72 (online).
Currently, all major toolchains incorporate variants of the Granlund-Montgomery algorithm when dealing with integer divisors that are compile-time constants. The pre-computation occurs at compilation time inside the compiler, which then emits code using the computed parameters. Some toolchains may also use this algorithm for divisions by run-time constant divisors that are used repeatedly. For run-time constant divisors in loops, this could involve emitting a pre-computation block prior to a loop to compute the necessary parameters, and then using those for the division emulation code inside the loop.
If one's toolchain does not optimize divisions with run-time constant divisors, one can use the same approach manually, as demonstrated by the code below. However, this is unlikely to achieve the same efficiency as a compiler-based solution, because not all machine operations used in the desired emulation sequence can be expressed efficiently at C++ level in a portable manner. This applies in particular to arithmetic right shifts and add-with-carry.
The code below demonstrates the principle of parameter precomputation and integer division emulation via multiplication. It is quite likely that by investing more time into the design than I was willing to expend for this answer more efficient implementations of both parameter precomputation and emulation can be identified.
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#define PORTABLE (1)
/* Returns the index of the highest set bit of i, i.e. floor(log2(i)).
   Both i == 0 and i == 1 yield 0. */
uint32_t ilog2 (uint32_t i)
{
    uint32_t highest_bit = 0;
    /* every further right shift that leaves a non-zero value is one more bit */
    for (i >>= 1; i != 0; i >>= 1) {
        ++highest_bit;
    }
    return highest_bit;
}
/* Precomputes the three constants that magic_division() consumes in order to
   divide by 'divisor' with a multiplication instead of a division.

   divisor     the divisor to prepare for; must not be zero, because the
               computation divides by its magnitude
   multiplier  receives the low 32 bits of the chosen multiplier,
               reinterpreted as a signed value
   add_mask    receives -1 if the chosen multiplier has any bit set at
               position 31 or above, otherwise 0
   sign_shift  receives the divisor's sign bit in bit 31 combined with the
               remaining shift count in the low bits

   Based on: Granlund, T.; Montgomery, P.L.: "Division by Invariant Integers
   using Multiplication". SIGPLAN Notices, Vol. 29, June 1994, pp. 61-72
*/
void prepare_magic (int32_t divisor, int32_t &multiplier, int32_t &add_mask, int32_t &sign_shift)
{
    const uint32_t bit31 = uint32_t (1) << 31;
    const uint64_t bit32 = uint64_t (1) << 32;

    /* magnitude of the divisor, formed in unsigned arithmetic */
    const uint32_t divisor_bits = uint32_t (divisor);
    uint32_t magnitude = divisor_bits;
    if (divisor < 0) {
        magnitude = 0 - divisor_bits;
    }

    uint32_t shift = ilog2 (magnitude);
    const uint32_t rem31 = bit31 % magnitude;
    const uint64_t scale = bit32 << shift;          /* 2^(32 + shift) */
    const uint64_t slack = scale / (bit31 - rem31);
    const uint64_t m_low = scale / magnitude;
    uint64_t m_high = (scale + slack) / magnitude;

    /* 'reduce' is the index of the highest bit in which the low 32 bits of
       the two estimates differ, capped at the shift count; that many low
       bits are shifted out of the upper estimate and the shift count
       shrinks by the same amount */
    uint32_t reduce = ilog2 (uint32_t (m_low ^ m_high));
    if (reduce > shift) {
        reduce = shift;
    }
    m_high >>= reduce;
    shift -= reduce;

    multiplier = int32_t (uint32_t (m_high));
    if (m_high >> 31) {
        add_mask = -1;
    } else {
        add_mask = 0;
    }
    sign_shift = int32_t ((divisor_bits & bit31) | shift);
}
/* Portable arithmetic (sign-propagating) right shift of a by s bits, built
   only from unsigned operations. s must be in the range 0..31. */
int32_t arithmetic_right_shift (int32_t a, int32_t s)
{
    const uint32_t all_ones = ~uint32_t (0);
    /* logical shift: the vacated high bits are zero */
    const uint32_t shifted = uint32_t (a) >> s;
    /* for a negative input, the s vacated high bits must be ones instead */
    uint32_t sign_fill = 0;
    if (uint32_t (a) >> 31) {
        sign_fill = ~(all_ones >> s);
    }
    return int32_t (shifted | sign_fill);
}
/* Emulates the signed division dividend / divisor with a multiplication,
   using the constants that prepare_magic() computed for that divisor.

   dividend    the value to divide
   multiplier  magic multiplier for the divisor
   add_mask    0 or -1 (all ones); selects whether the dividend is added to
               the high half of the product
   sign_shift  bit 31 set means the divisor is negative; the low bits hold
               the shift count (0..31)

   Returns the quotient. The shift count is masked before use so that the
   sign flag in bit 31 never reaches a shift operator, and the final negation
   is done in unsigned arithmetic so it cannot overflow.
*/
int32_t magic_division (int32_t dividend, int32_t multiplier, int32_t add_mask, int32_t sign_shift)
{
    /* strip the sign flag; only counts 0..31 are valid for a 32-bit shift */
    const int32_t shift_count = sign_shift & 0x1f;

    /* high 32 bits of the signed 64-bit product */
    int64_t prod = int64_t (dividend) * multiplier;
    int32_t quot = (int32_t)(uint64_t (prod) >> 32);
    /* add_mask is 0 or all ones, so this conditionally adds the dividend */
    quot = int32_t (uint32_t (quot) + (uint32_t (dividend) & uint32_t (add_mask)));
#if PORTABLE
    quot = arithmetic_right_shift (quot, shift_count);
#else // PORTABLE
    quot = quot >> shift_count; // relies on >> being an arithmetic shift for negative values
#endif // PORTABLE
    /* add the dividend's sign bit (1 for negative dividends) */
    quot = int32_t (uint32_t (quot) + (uint32_t (dividend) >> 31));
    /* negative divisor: negate, in unsigned arithmetic so that a quotient of
       INT32_MIN wraps instead of overflowing */
    if (sign_shift < 0) {
        quot = int32_t (uint32_t (0) - uint32_t (quot));
    }
    return quot;
}
/* Test driver: for every divisor in [-20, 20] except 0, computes the magic
   constants and compares magic_division() against the built-in division for
   all 2^32 dividends (skipping the single overflowing pair INT32_MIN / -1).
   Returns EXIT_FAILURE at the first mismatch, EXIT_SUCCESS otherwise. */
int main (void)
{
    int32_t multiplier;
    int32_t add_mask;
    int32_t sign_shift;
    int32_t divisor;
    for (divisor = -20; divisor <= 20; divisor++) {
        /* avoid division by zero; the loop header's increment already steps
           on to 1, so no extra increment here (it would skip divisor 1) */
        if (divisor == 0) {
            continue;
        }
        printf ("divisor=%d\n", divisor);
        prepare_magic (divisor, multiplier, add_mask, sign_shift);
        printf ("multiplier=%d add_mask=%d sign_shift=%d\n",
                multiplier, add_mask, sign_shift);
        printf ("exhaustive test of dividends ... ");
        /* unsigned counter: wraps to 0 after 0xffffffff, which ends the loop */
        uint32_t dividendu = 0;
        do {
            int32_t dividend = (int32_t)dividendu;
            /* avoid overflow in signed integer division */
            if ((divisor == (-1)) && (dividend == ((-2147483647)-1))) {
                /* a do-while 'continue' jumps to the condition, so the
                   counter has to be advanced by hand here */
                dividendu++;
                continue;
            }
            int32_t res = magic_division (dividend, multiplier, add_mask, sign_shift);
            int32_t ref = dividend / divisor;
            if (res != ref) {
                printf ("\nERR dividend=%d (%08x) divisor=%d res=%d ref=%d\n",
                        dividend, (uint32_t)dividend, divisor, res, ref);
                return EXIT_FAILURE;
            }
            dividendu++;
        } while (dividendu);
        printf ("PASSED\n");
    }
    return EXIT_SUCCESS;
}