4

Division by a constant is well optimized by gcc, as is well known :)

Now I wonder how dividing a constant by a variable (i.e. a constant dividend) can be optimized. gcc does not help me out, and neither does clang.

Maybe I am not good at searching for such information, but I cannot find any material about optimizing division with a constant dividend. (In contrast, division by a constant is well documented.)

#include <stdio.h>

/*
 * Returns the fixed dividend 33659 divided by x, using C's truncating
 * integer division.
 *
 * x must be non-zero: integer division by zero is undefined behavior.
 * The open question for this function is whether the compiler can avoid
 * emitting an idiv instruction when only the dividend is constant.
 */
int f(int x)
{
    const int dividend = 33659;

    return dividend / x;
}

/*
 * Reads one integer from stdin and prints f(x) for it.
 *
 * Returns 0 on success and 1 when the input is not an integer or is zero.
 */
int main()
{
    int x;

    // scanf leaves x untouched when the conversion fails; reading it
    // afterwards would use an indeterminate value.
    if (scanf("%d", &x) != 1)
    {
        fprintf(stderr, "expected an integer\n");
        return 1;
    }

    // f divides by x, and integer division by zero is undefined behavior.
    if (x == 0)
    {
        fprintf(stderr, "input must be non-zero\n");
        return 1;
    }

    printf("%d", f(x));
    return 0;
}

EDIT1:

#include <stdio.h>

#define DIVIDEND 33

/*
 * Replaces each of the first n elements of arr with DIVIDEND divided by
 * that element (unsigned integer division, in place).
 *
 * Does nothing when n <= 0. Every processed element must be non-zero:
 * division by zero is undefined behavior.
 */
void f ( unsigned int* arr, int n )
{
    if ( n <= 0 )
    {
        return;
    }

    unsigned int* const end = arr + n;
    for ( unsigned int* cur = arr; cur != end; ++cur )
    {
        *cur = DIVIDEND / *cur;
    }
}

/*
 * Reads 1024 unsigned integers from stdin, runs f over them in place and
 * prints the results back to back, with no separator.
 *
 * Returns 0 on success and 1 when an input value is missing, malformed
 * or zero.
 */
int main()
{
    // An enum constant is an integer constant expression, so buf is a
    // plain fixed-size array rather than a variable-length array.
    enum { N = 1024 };
    unsigned int buf[N];

    for ( int i = 0; i < N; i++ )
    {
        // A failed conversion would leave buf[i] uninitialized.
        if ( scanf ( "%u", buf + i ) != 1 )
        {
            fprintf ( stderr, "expected %d unsigned integers\n", N );
            return 1;
        }
        // f divides DIVIDEND by each element; a zero element would be a
        // division by zero.
        if ( buf[i] == 0 )
        {
            fprintf ( stderr, "input values must be non-zero\n" );
            return 1;
        }
    }
    f ( buf, N );
    for ( int i = 0; i < N; i++ )
    {
        // %u matches the unsigned int argument; %d would be a mismatch.
        printf ( "%u", buf[i] );
    }
    return 0;
}

Compiling this with clang -O3 -march=native div.c -o div only unrolls the loop, whilst:

#include <stdio.h>

#define DIVIDEND 33
#define DIVISOR DIVIDEND

/*
 * Divides each of the first n elements of arr by DIVISOR in place
 * (unsigned integer division).
 *
 * Does nothing when n <= 0. This is the "division by a constant" form:
 * the constant is the divisor rather than the dividend.
 */
void f ( unsigned int* arr, int n )
{
    int idx = 0;
    while ( idx < n )
    {
        arr[idx] /= DIVISOR;
        ++idx;
    }
}

/*
 * Reads 1024 unsigned integers from stdin, runs f over them in place and
 * prints the results back to back, with no separator.
 *
 * Returns 0 on success and 1 when an input value is missing or malformed.
 */
int main()
{
    // An enum constant is an integer constant expression, so buf is a
    // plain fixed-size array rather than a variable-length array.
    enum { N = 1024 };
    unsigned int buf[N];

    for ( int i = 0; i < N; i++ )
    {
        // A failed conversion would leave buf[i] uninitialized.
        if ( scanf ( "%u", buf + i ) != 1 )
        {
            fprintf ( stderr, "expected %d unsigned integers\n", N );
            return 1;
        }
    }
    f ( buf, N );
    for ( int i = 0; i < N; i++ )
    {
        // %u matches the unsigned int argument; %d would be a mismatch.
        printf ( "%u", buf[i] );
    }
    return 0;
}

using the same command line will yield a pile of terrifying AVX2 code. (Remember that division by constant is rewritten into shift+mul+add, which can be vectorized!)

EDIT2: Thanks, @user2722968! Applying RCPPS makes the program faster.

Here is my experimental implementation using RCPPS for fast constant-dividend division:

https://github.com/ThinerDAS/didactic-spoon/blob/master/div.c

However, I am not sure how to make it more accurate without large overhead.

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
Thiner
  • 345
  • 1
  • 9

1 Answer

1

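If you can trigger a really good optimization for "divided by", then you might benefit from rewriting the expression as a reciprocal: since 33659/x = 1/(x/33659), you can compute x/33659 (a division by a constant) and then take its approximate reciprocal using the RCPPS instruction (which does use SSE/AVX).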

user2722968
  • 13,636
  • 2
  • 46
  • 67
  • RCPPS is a good hint! This instruction is a very rough approximation, and is not reliable, however it is very fast. Hard to trigger the instruction :( I will try it out. – Thiner Jul 02 '17 at 06:31