This is a late answer provided to remove this question from the unanswered list.
You have recognized that counting how many particles fall within a certain cell is equivalent to constructing a histogram. Histogram construction is a well studied problem. The book by Shane Cook (CUDA Programming) contains a good discussion of this topic. Furthermore, the CUDA samples contain a histogram example, a histogram can also be constructed with CUDA Thrust, and the CUDA Programming Blog contains some more insight.
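To make the connection explicit, binning particles into cells is just accumulating a histogram whose bins are the cells: each thread maps the coordinate of its particle to a cell index and atomically increments the corresponding counter. Below is a minimal 1D sketch of that idea; the names (particle_binning_kernel, cellSize, numCells, cellCounts) are illustrative and not taken from your code.
// --- Minimal 1D particle-binning sketch: each particle position x[i] is mapped
//     to a cell index and the per-cell counter is incremented atomically.
//     cellCounts must be zero-initialized and have numCells entries.
__global__ void particle_binning_kernel(const float *x, int numParticles,
                                        float cellSize, int numCells,
                                        unsigned int *cellCounts) {
    int i      = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (i < numParticles) {
        int cell = (int)(x[i] / cellSize);        // --- cell containing particle i
        if (cell >= 0 && cell < numCells)         // --- discard out-of-range particles
            atomicAdd(&cellCounts[cell], 1);      // --- histogram accumulation
        i += stride;
    }
}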
Below I'm providing code to compare four different versions of histogram calculation:
- CPU;
- GPU with atomics (basically your approach);
- GPU with atomics in shared memory with final summation of partial histograms (basically, the approach proposed by Paul R);
- GPU using CUDA Thrust.
If you run the code for the typical case of 100MB of data on a Kepler K20c, you get the following timings:
- CPU = 83ms;
- GPU with atomics = 15.8ms;
- GPU with atomics in shared memory = 17.7ms;
- GPU with CUDA Thrust = 40ms.
As you can see, and perhaps surprisingly, your "brute-force" solution is the fastest. This can be justified by the fact that, on the latest architectures (your post is dated Aug. 2012, when Kepler had not yet been released, at least in Europe), atomic operations are very fast.
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/adjacent_difference.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#define SIZE (100*1024*1024) // 100 MB
/**********************************************/
/* FUNCTION TO GENERATE RANDOM UNSIGNED CHARS */
/**********************************************/
unsigned char* big_random_block(int size) {
    unsigned char *data = (unsigned char*)malloc(size);
    for (int i = 0; i < size; i++)
        data[i] = rand();
    return data;
}
/***************************************/
/* GPU HISTOGRAM CALCULATION VERSION 1 */
/***************************************/
__global__ void histo_kernel1(unsigned char *buffer, long size, unsigned int *histo) {

    // --- The number of threads does not cover all the data size
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd(&histo[buffer[i]], 1);
        i += stride;
    }
}
/***************************************/
/* GPU HISTOGRAM CALCULATION VERSION 2 */
/***************************************/
__global__ void histo_kernel2(unsigned char *buffer, long size, unsigned int *histo) {

    // --- Allocating and initializing shared memory to store the partial histograms
    __shared__ unsigned int temp[256];
    temp[threadIdx.x] = 0;
    __syncthreads();

    // --- The number of threads does not cover all the data size
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size)
    {
        atomicAdd(&temp[buffer[i]], 1);
        i += offset;
    }
    __syncthreads();

    // --- Summing the partial histograms (this assumes blockDim.x == 256)
    atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/********/
/* MAIN */
/********/
int main() {

    // --- Generating an array of SIZE unsigned chars
    unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

    /********************/
    /* CPU COMPUTATIONS */
    /********************/

    // --- Allocating host memory space and initializing the host-side histogram
    unsigned int histo[256];
    for (int i = 0; i < 256; i++) histo[i] = 0;

    clock_t start_CPU, stop_CPU;

    // --- Histogram calculation on the host
    start_CPU = clock();
    for (int i = 0; i < SIZE; i++) histo[buffer[i]]++;
    stop_CPU = clock();
    float elapsedTime = (float)(stop_CPU - start_CPU) / (float)CLOCKS_PER_SEC * 1000.0f;
    printf("Time to generate (CPU): %3.1f ms\n", elapsedTime);

    // --- Indirect check of the result
    long histoCount = 0;
    for (int i = 0; i < 256; i++) { histoCount += histo[i]; }
    printf("Histogram Sum: %ld\n", histoCount);
    /********************/
    /* GPU COMPUTATIONS */
    /********************/

    // --- Initializing the device-side data
    unsigned char *dev_buffer;
    gpuErrchk(cudaMalloc((void**)&dev_buffer, SIZE));
    gpuErrchk(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));

    // --- Allocating device memory space for the device-side histogram
    unsigned int *dev_histo;
    gpuErrchk(cudaMalloc((void**)&dev_histo, 256 * sizeof(unsigned int)));

    // --- GPU timing
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    // --- ATOMICS
    // --- Histogram calculation on the device - 2x the number of multiprocessors gives best timing
    gpuErrchk(cudaEventRecord(start, 0));
    gpuErrchk(cudaMemset(dev_histo, 0, 256 * sizeof(unsigned int)));
    cudaDeviceProp prop;
    gpuErrchk(cudaGetDeviceProperties(&prop, 0));
    int blocks = prop.multiProcessorCount;
    histo_kernel1<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
    gpuErrchk(cudaMemcpy(histo, dev_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate (GPU): %3.1f ms\n", elapsedTime);

    histoCount = 0;
    for (int i = 0; i < 256; i++) {
        histoCount += histo[i];
    }
    printf("Histogram Sum: %ld\n", histoCount);

    // --- Check the correctness of the results via the host
    for (int i = 0; i < SIZE; i++) histo[buffer[i]]--;
    for (int i = 0; i < 256; i++) {
        if (histo[i] != 0) printf("Failure at %d! Off by %d\n", i, histo[i]);
    }
    // --- ATOMICS IN SHARED MEMORY
    // --- Histogram calculation on the device - 2x the number of multiprocessors gives best timing
    gpuErrchk(cudaEventRecord(start, 0));
    gpuErrchk(cudaMemset(dev_histo, 0, 256 * sizeof(unsigned int)));
    gpuErrchk(cudaGetDeviceProperties(&prop, 0));
    blocks = prop.multiProcessorCount;
    histo_kernel2<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
    gpuErrchk(cudaMemcpy(histo, dev_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate (GPU): %3.1f ms\n", elapsedTime);

    histoCount = 0;
    for (int i = 0; i < 256; i++) {
        histoCount += histo[i];
    }
    printf("Histogram Sum: %ld\n", histoCount);

    // --- Check the correctness of the results via the host
    for (int i = 0; i < SIZE; i++) histo[buffer[i]]--;
    for (int i = 0; i < 256; i++) {
        if (histo[i] != 0) printf("Failure at %d! Off by %d\n", i, histo[i]);
    }
    // --- CUDA THRUST
    gpuErrchk(cudaEventRecord(start, 0));

    // --- Wrapping the dev_buffer raw pointer with a device_ptr and initializing a device_vector with it
    thrust::device_ptr<unsigned char> dev_ptr(dev_buffer);
    thrust::device_vector<unsigned char> dev_buffer_thrust(dev_ptr, dev_ptr + SIZE);

    // --- Sorting data to bring equal elements together
    thrust::sort(dev_buffer_thrust.begin(), dev_buffer_thrust.end());

    // --- The number of histogram bins is equal to the maximum value plus one
    int num_bins = dev_buffer_thrust.back() + 1;

    // --- Resize histogram storage
    thrust::device_vector<int> d_histogram;
    d_histogram.resize(num_bins);

    // --- Find the end of each bin of values
    thrust::counting_iterator<int> search_begin(0);
    thrust::upper_bound(dev_buffer_thrust.begin(), dev_buffer_thrust.end(),
                        search_begin, search_begin + num_bins,
                        d_histogram.begin());

    // --- Compute the histogram by taking differences of the cumulative histogram
    thrust::adjacent_difference(d_histogram.begin(), d_histogram.end(),
                                d_histogram.begin());

    thrust::host_vector<int> h_histogram(d_histogram);
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate (GPU): %3.1f ms\n", elapsedTime);

    histoCount = 0;
    for (int i = 0; i < 256; i++) {
        histoCount += h_histogram[i];
    }
    printf("Histogram Sum: %ld\n", histoCount);

    // --- Check the correctness of the results via the host
    for (int i = 0; i < SIZE; i++) h_histogram[buffer[i]]--;
    for (int i = 0; i < 256; i++) {
        if (h_histogram[i] != 0) printf("Failure at %d! Off by %d\n", i, h_histogram[i]);
    }
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(dev_histo));
    gpuErrchk(cudaFree(dev_buffer));
    free(buffer);

    getchar();

    return 0;
}