#include <cuda_runtime.h>
#include <iostream>
#include <string>
using namespace std;
// Block-wide sum of two doubles: each warp reduces with shuffles, lane 0 of
// each warp publishes its partial to shared memory, and warp 0 combines the
// per-warp partials. Block totals end up in shmem1[0] and shmem2[0].
// Assumes blockDim.x is a multiple of 32.
__device__
void BlockReduce(double val1, double val2, double* shmem1, double* shmem2)
{
    int num_warp = blockDim.x / 32;
    int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;

    // Reduce within each warp.
    for (int offset = 32 / 2; offset > 0; offset /= 2) {
        val1 += __shfl_down_sync(0xffffffff, val1, offset);
        val2 += __shfl_down_sync(0xffffffff, val2, offset);
    }

    __syncthreads();
    // Lane 0 of each warp publishes its partial sums.
    if (lane_id == 0) {
        shmem1[warp_id] = val1;
        shmem2[warp_id] = val2;
    }
    __syncthreads();

    // Warp 0 reduces the per-warp partials.
    if (warp_id == 0) {
        if (threadIdx.x < num_warp) {
            val1 = shmem1[threadIdx.x];
            val2 = shmem2[threadIdx.x];
        } else {
            val1 = 0;
            val2 = 0;
        }
        for (int offset = 32 / 2; offset > 0; offset /= 2) {
            val1 += __shfl_down_sync(0xffffffff, val1, offset);
            val2 += __shfl_down_sync(0xffffffff, val2, offset);
        }
        if (lane_id == 0) {
            shmem1[0] = val1;
            shmem2[0] = val2;
        }
    }
}
__device__
void BlockReduce(double val1, double val2, int cnt, double* shmem1, double* shmem2, int* shmem3)
{
    int num_warp = blockDim.x / 32;
    int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
    for (int offset = 32 / 2; offset > 0; offset /= 2) {
        val1 += __shfl_down_sync(0xffffffff, val1, offset);
        val2 += __shfl_down_sync(0xffffffff, val2, offset);
        cnt += __shfl_down_sync(0xffffffff, cnt, offset);
    }
    __syncthreads();
    if (lane_id == 0) {
        shmem1[warp_id] = val1;
        shmem2[warp_id] = val2;
        shmem3[warp_id] = cnt;
    }
    __syncthreads();
    if (warp_id == 0) {
        if (threadIdx.x < num_warp) {
            val1 = shmem1[threadIdx.x];
            val2 = shmem2[threadIdx.x];
            cnt = shmem3[threadIdx.x];
        } else {
            val1 = 0;
            val2 = 0;
            cnt = 0;
        }
        for (int offset = 32 / 2; offset > 0; offset /= 2) {
            val1 += __shfl_down_sync(0xffffffff, val1, offset);
            val2 += __shfl_down_sync(0xffffffff, val2, offset);
            cnt += __shfl_down_sync(0xffffffff, cnt, offset);
        }
        if (lane_id == 0) {
            shmem1[0] = val1;
            shmem2[0] = val2;
            shmem3[0] = cnt;
        }
    }
}
// Same idea for Nval values of a single type kept in an array; shmem must
// point to Nval per-warp staging buffers. Block totals end up in shmem[t][0].
template <int Nval>
__device__ void BlockReduceN(double* __restrict__ values, double** shmem)
{
    int num_warp = blockDim.x / 32;
    int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;
    for (int offset = 32 / 2; offset > 0; offset /= 2) {
        for (int t = 0; t < Nval; t++) values[t] += __shfl_down_sync(0xffffffff, values[t], offset);
    }
    __syncthreads();
    if (lane_id == 0) {
        for (int t = 0; t < Nval; t++) shmem[t][warp_id] = values[t];
    }
    __syncthreads();
    if (warp_id == 0) {
        if (threadIdx.x < num_warp) {
            for (int t = 0; t < Nval; t++) values[t] = shmem[t][threadIdx.x];
        } else {
            for (int t = 0; t < Nval; t++) values[t] = 0;
        }
        for (int offset = 32 / 2; offset > 0; offset /= 2) {
            for (int t = 0; t < Nval; t++) values[t] += __shfl_down_sync(0xffffffff, values[t], offset);
        }
        if (lane_id == 0) {
            for (int t = 0; t < Nval; t++) shmem[t][0] = values[t];
        }
    }
}
int main()
{
}
I need to use reductions in many math operations, and the number of input variables differs between them, for example:
- Find mean/stddev of an array -> need to calculate sum(X), sum(X^2)
- Same as above, but ignoring NaN values -> need to calculate sum(X), sum(X^2), sum(!isnan(X))
Let's assume the number of input variables is in the range 1 to 4. (Turning the reduced sums into the final statistics is the easy part; a throwaway sketch of that follow-up step is below, just for context.)
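A minimal sketch of that follow-up arithmetic, assuming the reduction has already produced sum_x, sum_x2 and the valid count (the helper name is made up):

#include <cmath>

// Illustrative only: convert the reduced sums into mean and population
// stddev via var = E[X^2] - E[X]^2.
inline void FinishMeanStddev(double sum_x, double sum_x2, long long n,
                             double& mean, double& stddev)
{
    mean = sum_x / n;
    stddev = std::sqrt(sum_x2 / n - mean * mean);
}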
An obvious solution is to write one function for each number of inputs, so four in total, but that results in a lot of duplicated code.
Is there any way to use a C macro or a C++ template to solve this more elegantly? Note that the input parameters might not all have the same type (in the second BlockReduce, the inputs represent sumx and sumx_sqr of type double, and valid_count of type int).
Edit: in case all variables have the same type, it's possible to put them into one array and use a template parameter for the array length (that's what BlockReduceN above does). I'm still looking for a solution that can handle different value types (if possible).
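For reference, this is the kind of variadic-template direction I have in mind: a per-type warp reduce plus a per-type shared-memory exchange, glued together with C++17 fold expressions. It is an untested sketch (needs nvcc with -std=c++17 or later), and WarpSum, ExchangePartial and BlockSum are placeholder names, not library functions:

#include <cuda_runtime.h>

// Sum one value across the 32 lanes of a full warp.
template <typename T>
__device__ T WarpSum(T v)
{
    for (int offset = 16; offset > 0; offset /= 2)
        v += __shfl_down_sync(0xffffffff, v, offset);
    return v;
}

// Move the per-warp partials of one value into warp 0 through a statically
// sized staging buffer; each instantiation gets its own __shared__ array.
template <typename T>
__device__ void ExchangePartial(T& v, int warp_id, int lane_id, int num_warp)
{
    __shared__ T staging[32];                        // max 32 warps per block
    if (lane_id == 0) staging[warp_id] = v;
    __syncthreads();
    T out = (warp_id == 0 && lane_id < num_warp) ? staging[lane_id] : T(0);
    __syncthreads();                                 // allow safe reuse of staging
    v = out;
}

// Block-wide sum of an arbitrary mix of arithmetic types, e.g.
// BlockSum(sum_x, sum_x2, valid_count) with two doubles and an int.
// On return, thread 0 of the block holds the totals in vals...; all other
// threads hold unspecified partial values.
// Assumes blockDim.x is a multiple of 32 (and at most 1024 threads).
template <typename... Ts>
__device__ void BlockSum(Ts&... vals)
{
    int num_warp = blockDim.x / 32;
    int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;

    ((vals = WarpSum(vals)), ...);                            // reduce inside each warp
    (ExchangePartial(vals, warp_id, lane_id, num_warp), ...); // partials -> warp 0
    if (warp_id == 0) {
        ((vals = WarpSum(vals)), ...);                        // final reduce in warp 0
    }
}

// Usage inside a kernel would look roughly like:
//   double sum_x = ..., sum_x2 = ...; int valid = ...;
//   BlockSum(sum_x, sum_x2, valid);
//   if (threadIdx.x == 0) { /* sum_x, sum_x2, valid now hold the block totals */ }

Unlike the versions above, the results stay in thread 0's registers rather than in shmem[0]; if every thread needs them they would have to be broadcast, e.g. through one extra shared slot per value.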