1
#include <cuda_runtime.h>
#include <iostream>
#include <string>
using namespace std;

// Block-wide sum reduction of two doubles.
// On return, shmem1[0] and shmem2[0] hold the block totals and are visible
// to every thread of the block (a final __syncthreads() publishes them).
//
// Preconditions:
//   - blockDim.x is a multiple of 32 (num_warp = blockDim.x / 32 floors,
//     so a partial trailing warp would be silently dropped) — TODO confirm
//     all call sites honor this.
//   - blockDim.x <= 1024, so the per-warp partials fit in one warp (<= 32).
//   - shmem1 / shmem2 each hold at least blockDim.x / 32 doubles.
__device__
void BlockReduce(double val1, double val2, double* shmem1, double* shmem2)
{
  int num_warp = blockDim.x / 32;
  int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;

  // Intra-warp tree reduction: lane 0 ends up with the warp's partial sums.
  for (int offset = 32 / 2; offset > 0; offset /= 2) {
    val1 += __shfl_down_sync(0xffffffff, val1, offset);
    val2 += __shfl_down_sync(0xffffffff, val2, offset);
  }

  // Barrier before the store: protects shmem against a caller that is still
  // reading the result of a previous reduction from the same buffers.
  __syncthreads();
  if (lane_id == 0) {
    shmem1[warp_id] = val1;
    shmem2[warp_id] = val2;
  }
  __syncthreads();

  // Warp 0 reduces the per-warp partials down to the block total.
  if (warp_id == 0) {
    if (threadIdx.x < num_warp) {
      val1 = shmem1[threadIdx.x];
      val2 = shmem2[threadIdx.x];
    } else {
      val1 = 0;
      val2 = 0;
    }

    for (int offset = 32 / 2; offset > 0; offset /= 2) {
      val1 += __shfl_down_sync(0xffffffff, val1, offset);
      val2 += __shfl_down_sync(0xffffffff, val2, offset);
    }

    if (lane_id == 0) {
      shmem1[0] = val1;
      shmem2[0] = val2;
    }
  }

  // Publish the result: without this barrier only warp 0 could safely read
  // shmem1[0]/shmem2[0] after the call (the original relied on every caller
  // remembering to add its own __syncthreads()).
  __syncthreads();
}

// Block-wide sum reduction of two doubles plus an int counter (e.g. sum(X),
// sum(X^2) and a valid-element count for NaN-aware mean/stddev).
// On return, shmem1[0], shmem2[0] and shmem3[0] hold the block totals and
// are visible to every thread (a final __syncthreads() publishes them).
//
// Preconditions:
//   - blockDim.x is a multiple of 32 (num_warp = blockDim.x / 32 floors,
//     so a partial trailing warp would be silently dropped) — TODO confirm
//     all call sites honor this.
//   - blockDim.x <= 1024, so the per-warp partials fit in one warp (<= 32).
//   - shmem1 / shmem2 hold >= blockDim.x / 32 doubles; shmem3 as many ints.
__device__
void BlockReduce(double val1, double val2, int cnt, double* shmem1, double* shmem2, int* shmem3)
{
  int num_warp = blockDim.x / 32;
  int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;

  // Intra-warp tree reduction: lane 0 ends up with the warp's partial sums.
  for (int offset = 32 / 2; offset > 0; offset /= 2) {
    val1 += __shfl_down_sync(0xffffffff, val1, offset);
    val2 += __shfl_down_sync(0xffffffff, val2, offset);
    cnt += __shfl_down_sync(0xffffffff, cnt, offset);
  }

  // Barrier before the store: protects shmem against a caller that is still
  // reading the result of a previous reduction from the same buffers.
  __syncthreads();
  if (lane_id == 0) {
    shmem1[warp_id] = val1;
    shmem2[warp_id] = val2;
    shmem3[warp_id] = cnt;
  }
  __syncthreads();

  // Warp 0 reduces the per-warp partials down to the block totals.
  if (warp_id == 0) {
    if (threadIdx.x < num_warp) {
      val1 = shmem1[threadIdx.x];
      val2 = shmem2[threadIdx.x];
      cnt = shmem3[threadIdx.x];
    } else {
      val1 = 0;
      val2 = 0;
      cnt = 0;
    }

    for (int offset = 32 / 2; offset > 0; offset /= 2) {
      val1 += __shfl_down_sync(0xffffffff, val1, offset);
      val2 += __shfl_down_sync(0xffffffff, val2, offset);
      cnt += __shfl_down_sync(0xffffffff, cnt, offset);
    }

    if (lane_id == 0) {
      shmem1[0] = val1;
      shmem2[0] = val2;
      shmem3[0] = cnt;
    }
  }

  // Publish the result: without this barrier only warp 0 could safely read
  // the shmem*[0] slots after the call (the original relied on every caller
  // remembering to add its own __syncthreads()).
  __syncthreads();
}

// Block-wide sum reduction of Nval doubles at once (generalization of the
// fixed-arity BlockReduce overloads above). On return, shmem[t][0] holds
// the block total of values[t] for each t in [0, Nval); a final
// __syncthreads() makes the results visible to every thread.
//
// Preconditions (same as the scalar overloads):
//   - blockDim.x is a multiple of 32 and <= 1024 — TODO confirm at call sites.
//   - shmem[t] holds at least blockDim.x / 32 doubles for each t.
//
// NOTE: the original did not compile — it was missing the `void` return type
// and never declared num_warp / warp_id / lane_id. Both fixed here.
template <int Nval>
__device__ void BlockReduceN(double* __restrict__ values, double** shmem)
{
  int num_warp = blockDim.x / 32;
  int warp_id = threadIdx.x / 32, lane_id = threadIdx.x % 32;

  // Intra-warp tree reduction over all Nval lanes of data at once.
  for (int offset = 32 / 2; offset > 0; offset /= 2) {
#pragma unroll
    for (int t = 0; t < Nval; t++) values[t] += __shfl_down_sync(0xffffffff, values[t], offset);
  }

  // Barrier before the store: protects shmem against a caller that is still
  // reading the result of a previous reduction from the same buffers.
  __syncthreads();
  if (lane_id == 0) {
#pragma unroll
    for (int t = 0; t < Nval; t++) shmem[t][warp_id] = values[t];
  }
  __syncthreads();

  // Warp 0 reduces the per-warp partials down to the block totals.
  if (warp_id == 0) {
    if (threadIdx.x < num_warp) {
#pragma unroll
      for (int t = 0; t < Nval; t++) values[t] = shmem[t][threadIdx.x];
    } else {
#pragma unroll
      for (int t = 0; t < Nval; t++) values[t] = 0;
    }

    for (int offset = 32 / 2; offset > 0; offset /= 2) {
#pragma unroll
      for (int t = 0; t < Nval; t++) values[t] += __shfl_down_sync(0xffffffff, values[t], offset);
    }

    if (lane_id == 0) {
#pragma unroll
      for (int t = 0; t < Nval; t++) shmem[t][0] = values[t];
    }
  }

  // Publish the results so every warp can read shmem[t][0] after the call.
  __syncthreads();
}

// Host entry point. Intentionally a no-op: the device reduction helpers
// above are meant to be called from kernels that are not part of this file.
int main()
{
  return 0;
}

I need to use reduction in many math operations. The number of input variables is different, for example:

  1. Find mean/stddev of array -> need to calculate sum(X), sum(X^2)
  2. Same as 1 but ignore NAN values -> need to calculate sum(X), sum(X^2), sum(!isnan(X))

Let's assume the number of input variables is in range 1->4

An obvious solution is to write 1 function for each number of input, so 4 in total. But there's a lot of duplicated code.

Is there any way to use C macro or C++ template to solve this more elegantly ? Note that the input parameters might not have the same type (in the second BlockReduce, the inputs represent sumx and sumx_sqr of type double, and valid_count of type int)

Edit: in case all variables have the same type, it's possible to put them into 1 array, and use template for array length. I'm still looking for a solution that can handle different value types (if possible)

Huy Le
  • 1,439
  • 4
  • 19
  • 1
    [`cub::BlockReduce`](https://nvlabs.github.io/cub/classcub_1_1_block_reduce.html) – paleonix May 09 '23 at 13:23
  • 1
    To do multiple reductions at once as in your code with CUB you could e.g. use CUDA's vector types like `double2` and define your won `double2 operator+(const double2 &left, const double2 &right)`. – paleonix May 09 '23 at 13:36
  • 1
    Combining different types using a custom class or `tuple` should also work. – paleonix May 09 '23 at 14:08

0 Answers