I am trying to implement my own reduce-sum for big 1D arrays. I came up with a reduce kernel and a way of calling it several times to reduce step by step until a single value is reached. I know this is not the optimal way of computing this (as you can see in my code, it may end up launching a kernel just to add 3 values), but let's set that aside for a moment and try to work with this approach.
Basically, in short: each call to the reduce kernel shrinks the array by a factor of MAXTHREADS, in this case 1024 (for example, 5000 elements become 5 partial sums, and a final call reduces those 5 values to one). When the size is smaller than 1024 it seems to work properly, but for bigger sizes it miserably fails to compute the right sum. This happens with all of the larger array sizes I have tried. What am I missing?
I will also gladly accept comments about the quality of the code, but I am mainly interested in fixing it.
Below I have posted the whole kernel and a snippet of the kernel call. If I have missed some relevant part of the kernel call, please comment and I will fix it.
The original code has error checks; all kernels always run and never return a CUDA error.
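(In case it matters, the error checks are the usual wrapper pattern; below is only a minimal sketch of what I mean, with a hypothetical cudaCheck macro rather than my exact code.)

// Sketch of the error checking used (hypothetical cudaCheck macro).
// Needs <stdio.h> and <stdlib.h> in addition to cuda_runtime.h.
#define cudaCheck(call)                                              \
    do {                                                             \
        cudaError_t e = (call);                                      \
        if (e != cudaSuccess) {                                      \
            printf("CUDA error %s at %s:%d\n",                       \
                   cudaGetErrorString(e), __FILE__, __LINE__);       \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

// After each kernel launch:
// cudaCheck(cudaGetLastError());
// cudaCheck(cudaDeviceSynchronize());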
Reduce kernel
__global__ void reduce6(float *g_idata, float *g_odata, unsigned int n){
    extern __shared__ float sdata[];

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(MAXTREADS*2) + tid;
    unsigned int gridSize = MAXTREADS*2*gridDim.x;
    //blockSize==MAXTHREADS

    sdata[tid] = 0;
    float mySum = 0;

    if (tid>=n) return;

    if ( (gridDim.x>0) & ((i+MAXTREADS)<n)){
        while (i < n) {
            sdata[tid] += g_idata[i] + g_idata[i+MAXTREADS];
            i += gridSize;
        }
    }else{
        sdata[tid] += g_idata[i];
    }
    __syncthreads();

    // do reduction in shared mem
    if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();
    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
    __syncthreads();
    if (tid < 64)
        sdata[tid] += sdata[tid + 64];
    __syncthreads();

#if (__CUDA_ARCH__ >= 300 )
    if ( tid < 32 )
    {
        // Fetch final intermediate sum from 2nd warp
        mySum = sdata[tid] + sdata[tid + 32];
        // Reduce final warp using shuffle
        for (int offset = warpSize/2; offset > 0; offset /= 2)
            mySum += __shfl_down(mySum, offset);
    }
    sdata[0] = mySum;
#else
    // fully unroll reduction within a single warp
    if (tid < 32) {
        warpReduce(sdata, tid);
    }
#endif

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

__device__ void warpReduce(volatile float *sdata, unsigned int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}
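(Side note: I am using the legacy __shfl_down intrinsic here. If you are on CUDA 9 or later, that intrinsic is deprecated and the loop would use the _sync variant instead; a minimal sketch of the equivalent, assuming all 32 lanes of the warp are active:)

// CUDA 9+ equivalent of the shuffle loop above (sketch; full-warp mask assumed)
for (int offset = warpSize/2; offset > 0; offset /= 2)
    mySum += __shfl_down_sync(0xffffffff, mySum, offset);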
Call the kernel for an arbitrary size of total_pixels:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#defineMAXTREADS 1024
__global__ void initKernel(float * devPtr, const float val, const size_t nwords)
{
//https://stackoverflow.com/questions/10589925/initialize-device-array-in-cuda
int tidx = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
for (; tidx < nwords; tidx += stride)
devPtr[tidx] = val;
}
int main()
{
size_t total_pixels = 5000;
unsigned long int n = (unsigned long int)total_pixels;
float* d_image, *d_aux;
float sum;
cudaMalloc(&d_image, total_pixels*sizeof(float));
cudaMalloc(&d_aux, sizeof(float)*(n + MAXTREADS - 1) / MAXTREADS);
for (int i = 0; i < 10; i++){
sum = 0;
cudaMemset(&d_image, 1, total_pixels*sizeof(float));
int dimblockRed = MAXTREADS;
int dimgridRed = (total_pixels + MAXTREADS - 1) / MAXTREADS;
int reduceCont = 0;
initKernel << < dimgridRed, dimblockRed >> >(d_image, 1.0, total_pixels);
while (dimgridRed > 1) {
if (reduceCont % 2 == 0){
reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
}
else{
reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
}
n = dimgridRed;
dimgridRed = (n + MAXTREADS - 1) / MAXTREADS;
reduceCont++;
}
if (reduceCont % 2 == 0){
reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
cudaMemcpy(&sum, d_aux, sizeof(float), cudaMemcpyDeviceToHost);
}
else{
reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
cudaMemcpy(&sum, d_image, sizeof(float), cudaMemcpyDeviceToHost);
}
printf("%f ", sum);
}
cudaDeviceReset();
return 0;
}
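(For reference, this is roughly how I check the result; just a sketch, relying on the fact that initKernel fills the array with 1.0f, so the expected sum is simply total_pixels.)

// Host-side sanity check (sketch): the array is all ones, so the sum should be total_pixels.
// Needs <math.h> for fabsf.
float expected = (float)total_pixels;
if (fabsf(sum - expected) > 1e-3f * expected)
    printf("wrong sum: got %f, expected %f\n", sum, expected);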