I am trying to add cache-line padding to avoid the false-sharing problem, but I can't see a big difference in speedup: with padding it is only 1.2x faster. For testing, I run both the version without padding and the version with padding on n = 700 million elements. Should I get more than a 1.2x speedup? Maybe I have missed something in my padding implementation? I am adding 15 ints of padding because I am assuming that counters doesn't have to be allocated at the start of a cache line. Any tips appreciated.
Here is my code:
/**
 * Parallel counting sort of `n` keys with cache-line padding between the
 * per-thread histograms.
 *
 * Every thread counts the keys of its own slice of `in` into a private row of
 * `counters`, one thread turns the counts into starting offsets, and then every
 * thread scatters its own slice of `in` into `out`.
 *
 * @tparam k   Number of distinct key values; every in[i] must lie in [0, k)
 *             (not checked here).
 * @param out  Destination buffer with room for `n` ints.
 * @param in   Source keys, `n` ints.
 * @param n    Number of keys.
 *
 * Relies on `nproc` and `cachelinesize`, which are defined outside this
 * function.
 */
template <const int k> void par_countingsort2(int *out, int const *in, const int n) {
  // Number of ints that fit into one cache line.
  const int paddingAmount = cachelinesize / sizeof(int);
  // Each row carries (paddingAmount - 1) unused trailing ints, so the last used
  // counter of one row and the first counter of the next row are a full cache
  // line apart even if `counters` does not start on a cache-line boundary.
  // Rows are contiguous, so only the lines around a row boundary could ever be
  // shared between two threads in the first place.
  const int kPadded = k + (paddingAmount - 1);
  printf("\n%d", kPadded);
  int counters[nproc][kPadded] = {}; // all zeros
  // Cap the team at nproc threads: `counters` has exactly nproc rows, so a
  // larger team would index past the end of the array.
  #pragma omp parallel num_threads(nproc)
  {
    int *thcounters = counters[omp_get_thread_num()];
    // schedule(static) is required for correctness, not just speed: the scatter
    // loop below must hand each thread exactly the iterations it counted here,
    // and OpenMP only guarantees that for matching static schedules.
    #pragma omp for schedule(static)
    for (int i = 0; i < n; ++i)
      ++thcounters[in[i]];
    // Implicit barrier above: all histograms are complete before the scan.
    #pragma omp single
    {
      // Exclusive prefix sum ordered by key first, then by thread, so that for
      // equal keys the block of thread 0 precedes the block of thread 1, etc.
      int tmp, sum = 0;
      for (int j = 0; j < k; ++j)
        for (int i = 0; i < nproc; ++i) {
          tmp = counters[i][j];
          counters[i][j] = sum;
          sum += tmp;
        }
    }
    // Implicit barrier after `single`: offsets are final before scattering.
    #pragma omp for schedule(static)
    for (int i = 0; i < n; ++i)
      out[thcounters[in[i]]++] = in[i];
  }
}
// Number of distinct key values used by the benchmark: main() generates keys
// with rand()%k and passes k as the template argument of the sort.
// This macro has to stay below the sort template, whose template parameter is
// also spelled `k`; defined earlier, it would be substituted into that
// template's parameter list.
#define k 1000
/**
 * Benchmark driver: fills an array of n random keys in [0, k), sorts it with
 * par_countingsort2 and prints the elapsed time.
 *
 * @param argc  Argument count.
 * @param argv  argv[1], if present and positive, is the number of keys n;
 *              otherwise n is 0.
 * @return EXIT_SUCCESS, or EXIT_FAILURE if the buffers cannot be allocated.
 */
int main(int argc, char *argv[]) {
  //init input
  // Parse the argument once; missing or non-positive values fall back to 0.
  const int requested = argc > 1 ? atoi(argv[1]) : 0;
  int n = requested > 0 ? requested : 0;
  int* in = static_cast<int*>(malloc(sizeof(int)*n));
  int* out = static_cast<int*>(malloc(sizeof(int)*n));
  // malloc(0) may legitimately return a null pointer, so a null result only
  // counts as a failure when memory was actually requested.
  if (n > 0 && (!in || !out)) {
    fprintf(stderr, "failed to allocate two buffers of %d ints\n", n);
    free(in);
    free(out);
    return EXIT_FAILURE;
  }
  for (int i = 0; i < n; ++i)
    in[i] = rand()%k;
  printf("n = %d\n", n);
  //print some parameters
  printf("nproc = %d\n", nproc);
  printf("cachelinesize = %d byte\n", cachelinesize);
  printf("k = %d\n", k);
  double tp2 = omp_get_wtime();
  par_countingsort2<k>(out, in, n);
  tp2 = omp_get_wtime() - tp2;
  // NOTE(review): `tp` and `checkreset` are not defined in this function;
  // presumably `tp` holds the timing of an earlier "par1" run and `checkreset`
  // validates `out` - confirm against the rest of the file.
  printf("par2, elapsed time = %.3f seconds (%.1fx speedup from par1), check passed = %c\n", tp2, tp/tp2, checkreset(out,in,n)?'y':'n');
  //free mem
  free(in);
  free(out);
  return EXIT_SUCCESS;
}