I am trying to implement sum reduction using CUDA, however I want the reduction to be to the right not to the left.. I wrote the below code, but I am not sure why it is not working
__global__ void reduce_kernel(
float *input,
float *partialSums,
unsigned int N)
{
unsigned int segment = blockIdx.x * blockDim.x * 2;
unsigned int i = segment + threadIdx.x;
__shared__ float input_s[BLOCK_DIM];
input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
int count = 2;
__syncthreads();
for (unsigned int stride = BLOCK_DIM / 2;
stride < BLOCK_DIM;
stride = stride + (BLOCK_DIM / count))
{
if (threadIdx.x >= stride) {
count = count * 2;
input_s[threadIdx.x] += input_s[threadIdx.x - stride];
printf("%d ", stride);
__syncthreads();
if (stride == BLOCK_DIM - 1) {
break;
}
}
__syncthreads();
}
if (threadIdx.x == BLOCK_DIM - 1) {
partialSums[blockIdx.x] = input_s[threadIdx.x];
}
}
Any ideas what I am doing wrong?