#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>
#include <vector>
using namespace std;
// Minimal stopwatch built on std::chrono::steady_clock.
// Call startCounter() to mark the beginning of an interval, then getCounter()
// to read how much time has passed since that mark.
class MyTimer
{
    using Clock = std::chrono::steady_clock;

    std::chrono::time_point<Clock> begin_;  // set by startCounter()
    std::chrono::time_point<Clock> end_;    // set by the latest getCounter()

public:
    // Records the current steady_clock time as the start of the interval.
    void startCounter()
    {
        begin_ = Clock::now();
    }

    // Records the current time as the end of the interval and returns the
    // time elapsed since the last startCounter() call, in whole microseconds.
    long long getCounter()
    {
        end_ = Clock::now();
        const auto elapsed = end_ - begin_;
        return std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
    }
};
// Reference implementation: returns the value paired with the smallest key.
//
// keys   - four keys to compare
// values - four values, values[i] belongs to keys[i]
//
// The keys are scanned left to right and a later key only replaces the current
// best when it is strictly smaller, so on ties the earliest position wins.
int findBestKey(int keys[4], int values[4])
{
    int best = 0;
    int candidate = 1;
    while (candidate < 4) {
        if (keys[candidate] < keys[best]) {
            best = candidate;
        }
        ++candidate;
    }
    return values[best];
}
// Hand-unrolled variant of the minimum-key lookup: returns the value paired
// with the smallest of the four keys (earliest position wins on ties).
//
// keys   - four keys to compare
// values - four values, values[i] belongs to keys[i]
int findBestKeyPro(int keys[4], int values[4])
{
    // Index of the smaller of the first two keys; stays 0 when they are equal.
    int best = (keys[0] > keys[1]) ? 1 : 0;

    if (keys[2] < keys[best]) {
        best = 2;
    }

    // The last comparison is folded into the return instead of updating `best`.
    return (keys[3] < keys[best]) ? values[3] : values[best];
}
// Tournament variant of the minimum-key lookup: returns the value paired with
// the smallest of the four keys.
//
// keys   - four keys to compare
// values - four values, values[i] belongs to keys[i]
//
// The pairs (0,1) and (2,3) are compared independently of each other and the
// two winners meet in a final comparison, so the longest chain of dependent
// comparisons is two instead of three. Every comparison is strict, so on ties
// the earliest position wins, matching findBestKey.
int findBestKeyProMax(int keys[4], int values[4])
{
    // A bool converts to 0 or 1, which selects the winner inside each pair.
    const int firstPair = keys[1] < keys[0];          // 0 or 1
    const int secondPair = 2 + (keys[3] < keys[2]);   // 2 or 3

    // The second pair only wins when its key is strictly smaller.
    const int best = (keys[secondPair] < keys[firstPair]) ? secondPair : firstPair;
    return values[best];
}
// Times one pass of `findBestKeyFunc` over consecutive groups of four elements.
//
// findBestKeyFunc - implementation under test
// n               - number of elements in `keys` and in `values`
// keys, values    - input arrays; group g covers elements [4g, 4g + 3]
// res             - accumulator; the result of every call is added to it
// totalTime       - accumulator; the elapsed time of this pass, in
//                   microseconds, is added to it
//
// Trailing elements that do not form a complete group of four are ignored.
void benchMethod(int (*findBestKeyFunc)(int keys[4], int values[4]), int n, int* keys, int* values, int& res, double& totalTime)
{
    MyTimer timer;
    timer.startCounter();
    // In the real workload the contents of "keys" are completely unrelated and
    // do not sit next to each other in memory. Walking one contiguous array is
    // only done here for benchmarking purposes.
    //
    // The bound is `i <= n - 4` (not `i < n - 4`) so that the last complete
    // group of four is processed as well.
    for (int i = 0; i <= n - 4; i += 4)
        res += findBestKeyFunc(&keys[i], &values[i]);
    totalTime += timer.getCounter();
    /*
    It is possible to calculate 4 arrays "keys", "values", then process them all at once:
    for (int i = 0; i <= n - 16; i += 16)
    {
        keys[4][4] = ...; values[4][4] = ...;
        res += find4BestKeyAtOnce(&keys, &values);
    }
    */
}
// Accumulated benchmark time of each implementation, in microseconds.
// benchMethod() adds to these on every pass, so they grow across laps.
double totalTimeNormal = 0.0;
double totalTimePro = 0.0;
double totalTimeProMax = 0.0;
// Runs one benchmark lap: fills two independent key/value data sets with
// pseudo-random numbers and times every implementation on both of them.
//
// res1 - accumulates the results of findBestKey
// res2 - accumulates the results of findBestKeyPro
// res3 - accumulates the results of findBestKeyProMax
//
// Elapsed times are added to totalTimeNormal / totalTimePro / totalTimeProMax.
void benching(int& res1, int& res2, int& res3)
{
    const int n = 10000000;

    // std::vector owns the buffers, so they are released on every exit path.
    std::vector<int> keys1(n), values1(n);
    std::vector<int> keys2(n), values2(n);

    for (int i = 0; i < n; i++) {
        keys1[i] = rand() % 100;   // two data sets are used to prevent caching
        keys2[i] = rand() % 100;   // real keys would be % (256*256)
        values1[i] = rand() % 100; // real values would be % 256
        values2[i] = rand() % 100; // % 100 keeps the int32 accumulators from overflowing in this example
    }

    // Per the original author: keys2/values2 are big enough to flush
    // keys1/values1 out of the cache completely, so the order of execution
    // does not affect performance here.
    benchMethod(&findBestKey, n, keys1.data(), values1.data(), res1, totalTimeNormal);
    benchMethod(&findBestKey, n, keys2.data(), values2.data(), res1, totalTimeNormal);
    benchMethod(&findBestKeyPro, n, keys1.data(), values1.data(), res2, totalTimePro);
    benchMethod(&findBestKeyPro, n, keys2.data(), values2.data(), res2, totalTimePro);
    // The ProMax runs accumulate into res3 (not res2) so that each
    // implementation keeps its own checksum.
    benchMethod(&findBestKeyProMax, n, keys1.data(), values1.data(), res3, totalTimeProMax);
    benchMethod(&findBestKeyProMax, n, keys2.data(), values2.data(), res3, totalTimeProMax);
}
// Runs 100 benchmark laps, printing the accumulated time of every
// implementation after each lap, and finally prints one checksum per
// implementation so that their results can be compared.
void testIf()
{
    int res1 = 0, res2 = 0, res3 = 0;
    for (int t = 1; t <= 100; t++) {
        benching(res1, res2, res3);
        // Reduce the checksums after every lap so they stay small before the
        // next lap adds to them.
        res1 %= 100;
        res2 %= 100;
        res3 %= 100;
        // The totals are stored in microseconds; divide by 1000 to print ms.
        std::cout << "Lap " << t << "\n";
        std::cout << "time normal = " << totalTimeNormal / 1000 << " ms\n";
        std::cout << "time pro = " << totalTimePro / 1000 << " ms\n";
        std::cout << "time pro max = " << totalTimeProMax / 1000 << " ms\n";
        std::cout << "\n";
    }
    // Print all three checksums, including res3, which was previously omitted.
    std::cout << "**********************\n" << res1 << " " << res2 << " " << res3 << "\n";
}
// Program entry point: runs the whole benchmark. Falling off the end of main
// is equivalent to `return 0;`.
int main()
{
    testIf();
}
There are two arrays, `keys` and `values`, both completely random. The function returns the value whose key is the minimum, i.e. `index = indexOfMin(keys); return values[index];`
See the function `findBestKey`. I need to fill in `findBestKeyProMax`. `findBestKeyPro` is around 30-35% faster than `findBestKey`, both on my computer and on https://www.onlinegdb.com/online_c++_compiler . The compiler options are `-std=c++14 -O2`.
Update: I get ~5-10% more performance just by changing to `-O3`.
Is there any way I can make this faster? Every nanosecond matters, since this function is called ~10^6-10^7 times per frame (once for each pixel); saving 1 ns per call would translate to 1 ms less per frame, which is the difference between 200 fps and 250 fps.
Edit: no multi-threading or GPU. That is already done (each thread performs `findBestKey` on distinct keys/values arrays), so I want to improve this function directly. Maybe something like SIMD on the CPU, or a branchless function?
Also, the `findBest...` functions are what matter; the function `benchMethod()` is just for benchmarking.
Edit 2: the target architecture is CPUs with AVX2 (256-bit SIMD) capability, mainly Intel Skylake or AMD Zen 2.