Most insanely efficient way to find index of the minimum of four numbers

Question

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>
#include <vector>
using namespace std;

// Minimal stopwatch built on std::chrono::steady_clock.
class MyTimer
{
    using Clock = std::chrono::steady_clock;

    Clock::time_point begin_{};
    Clock::time_point end_{};

public:
    // Records the current time as the reference point for getCounter().
    void startCounter()
    {
        begin_ = Clock::now();
    }

    // Returns the whole microseconds elapsed since the last startCounter().
    // The reference point is left untouched, so this can be called repeatedly.
    long long getCounter()
    {
        end_ = Clock::now();
        const auto elapsed = end_ - begin_;
        return std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
    }
};

// Reference implementation: returns the value paired with the smallest key.
// Scans the four keys left to right; on equal keys the lowest index wins.
int findBestKey(int keys[4], int values[4])
{
    int best = 0;
    int bestKey = keys[0];
    for (int candidate = 1; candidate < 4; ++candidate) {
        if (keys[candidate] < bestKey) {
            bestKey = keys[candidate];
            best = candidate;
        }
    }
    return values[best];
}

// Unrolled variant of findBestKey: three chained comparisons, no loop.
// On equal keys the lowest index wins, as in the reference version.
int findBestKeyPro(int keys[4], int values[4])
{
    int best = (keys[1] < keys[0]) ? 1 : 0;
    if (keys[2] < keys[best]) {
        best = 2;
    }
    // The last candidate needs no index update; pick its value directly.
    return (keys[3] < keys[best]) ? values[3] : values[best];
}
 
// Returns the value paired with the smallest of the four keys.
//
// Tournament selection: the winners of the pairs (0,1) and (2,3) are found by
// two comparisons that do not depend on each other, then one final comparison
// picks between them. This is two dependent steps instead of the three in the
// sequential chain; whether it is actually faster must be measured on the
// target CPU and compiler.
//
// Every comparison is strict, so on equal keys the lowest index wins, which
// matches findBestKey for all inputs.
int findBestKeyProMax(int keys[4], int values[4])
{
    const int low = keys[1] < keys[0];         // 0 or 1
    const int high = 2 + (keys[3] < keys[2]);  // 2 or 3
    const int index = (keys[high] < keys[low]) ? high : low;
    return values[index];
}

// Times one pass of findBestKeyFunc over consecutive groups of four elements.
//
// findBestKeyFunc  implementation under test
// n                number of elements in keys and values
// keys, values     input arrays; group g covers elements [4g, 4g + 3]
// res              running sum of the returned values (added to, not reset)
// totalTime        running elapsed time in microseconds (added to, not reset)
//
// Only complete groups are processed; a trailing remainder of fewer than four
// elements is ignored.
void benchMethod(int (*findBestKeyFunc)(int keys[4], int values[4]), int n, int* keys, int* values, int& res, double& totalTime)
{
    MyTimer timer;
    timer.startCounter();
    // In my actual problems, values of arrays "keys" are completely unrelated. They are not the same continuous values in memory. The line below is just an example for benchmark purposes
    // "i <= n - 4" (not "i < n - 4") so the last full group is included when n is a multiple of 4.
    for (int i = 0; i <= n - 4; i += 4)
        res += findBestKeyFunc(&keys[i], &values[i]);
    totalTime += timer.getCounter();

    /*
    it is possible to calculate 4 arrays "keys","values", then process them all at once.
    for (int i=0; i+16<=n; i+=16)
    {
       keys[4][4] = ...; values[4][4] = ...;
       res += find4BestKeyAtOnce(&keys, &values);
    }
    */
}

// Elapsed time per implementation, in microseconds, accumulated across every
// benchMethod call and never reset.
double totalTimeNormal = 0;
double totalTimePro = 0;
double totalTimeProMax = 0;
// Runs one benchmark lap: fills two independent key/value data sets with
// random numbers, then times each implementation on both sets.
//
// res1, res2, res3 receive the summed results of findBestKey, findBestKeyPro
// and findBestKeyProMax respectively (added to, not reset), so the three can
// be compared with each other. Timings go to the matching totalTime* globals.
void benching(int& res1, int& res2, int& res3)
{
    const int n = 10000000;
    // std::vector owns the buffers, so they are released on every exit path.
    std::vector<int> keys1(n);
    std::vector<int> values1(n);
    std::vector<int> keys2(n);
    std::vector<int> values2(n);

    for (int i = 0; i < n; i++) {
        keys1[i] = rand() % 100; // need 2 arrays to prevent caching
        keys2[i] = rand() % 100;   // this should be % (256*256)
        values1[i] = rand() % 100; // and % 256
        values2[i] = rand() % 100; // but I use % 100 so that in this example it doesn't overflow int32
    }

    // the size of keys2/values2 is big enough to flush out keys1/values1 from cache completely.
    // so order of execution doesn't affect performance here
    benchMethod(&findBestKey, n, keys1.data(), values1.data(), res1, totalTimeNormal);
    benchMethod(&findBestKey, n, keys2.data(), values2.data(), res1, totalTimeNormal);

    benchMethod(&findBestKeyPro, n, keys1.data(), values1.data(), res2, totalTimePro);
    benchMethod(&findBestKeyPro, n, keys2.data(), values2.data(), res2, totalTimePro);

    // Accumulate into res3, not res2, so the Pro checksum stays clean.
    benchMethod(&findBestKeyProMax, n, keys1.data(), values1.data(), res3, totalTimeProMax);
    benchMethod(&findBestKeyProMax, n, keys2.data(), values2.data(), res3, totalTimeProMax);
}

// Runs 100 benchmark laps and prints the timings after each one.
//
// The printed times are cumulative: the totalTime* globals are never reset,
// so each lap reports the running total in milliseconds. The final line prints
// the three result checksums, one per implementation, so that a mismatch
// between implementations is visible.
void testIf()
{
    int res1 = 0, res2 = 0, res3 = 0;
    for (int t = 1; t <= 100; t++) {
        benching(res1, res2, res3);
        // Keep the running sums small between laps.
        res1 %= 100;
        res2 %= 100;
        res3 %= 100;

        std::cout << "Lap " << t << "\n";
        std::cout << "time normal = " << totalTimeNormal/1000 << " ms\n";
        std::cout << "time pro = " << totalTimePro/1000 << " ms\n";
        std::cout << "time pro max = " << totalTimeProMax/1000 << " ms\n";
        std::cout << "\n";
    }

    // res3 is printed too, so the third implementation's result is observable.
    std::cout << "**********************\n" << res1 << " " << res2 << " " << res3 << "\n";
}


// Program entry point: runs the full benchmark and exits successfully.
int main()
{
    testIf();
    return 0;
}

There are two arrays, keys and values, both completely random. This function returns the value that has the minimum key. So: index = indexOfMin(keys); return values[index]; See function findBestKey. I need to fill in findBestKeyProMax

findBestKeyPro is around 30-35% faster than findBestKey, both on my computer and on https://www.onlinegdb.com/online_c++_compiler. Compiler options are -std=c++14 -O2. Update: I get ~5-10% more performance just by changing to -O3.

Is there any way I can make this faster? Every nanosecond matters, since this function is called ~10^6-10^7 times per frame (once for each pixel); at 10^6 calls, saving 1 ns per call translates to 1 ms less per frame, which is the difference between 200 fps and 250 fps.

Edit: no multi-threading or GPU. It's already done (each thread performs findBestKey on distinct keys/values arrays), so I want to improve this function directly. Maybe something like SIMD for CPU? Or branchless function.

Also the functions findBest... are what matters, function benchMethod() is just for benchmarking.

Edit 2: the target architecture is CPUs with AVX2 (256-bit SIMD) capability, mainly Intel Skylake or AMD Zen 2.

Comments are not for extended discussion; this conversation has been [moved to chat](https://chat.stackoverflow.com/rooms/219475/discussion-on-question-by-duke-le-most-insanely-efficient-way-to-find-index-of-t). — Samuel Liew, Aug 09 '20 at 03:37
@SamuelLiew: Nuking comments was a bit premature; important extra details provided by the OP hadn't yet been incorporated into the question. It also made helpful ideas that hadn't yet been written up into full answer harder for future readers to find. IDK if moving comments to chat also killed an @ username notification I'd left for Margaret, who was also commenting, before she got the notification. I know it's better to have answers posted as answers, but comments are better than nothing.. — Peter Cordes, Aug 10 '20 at 03:51

Most insanely efficient way to find index of the minimum of four numbers

0 Answers