
This `__global__` function increments a per-cell counter to count how many particles fall within each cell.

__global__ void Set_Nc_GPU_0831(int *nc, int *index, SP DSMC)
{
    // --- One thread per particle; index[tidx] is the cell containing particle tidx
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;

    atomicAdd(&nc[index[tidx]], 1);
}

I'm using an atomic operation, which is slow, so I want to replace it with some other function or algorithm.

Is there an alternative way to write this simple `__global__` function?

    The canonical solution is to use partial arrays in shared memory so that you can do fast accumulation without atomic operations and then at the end you can just sum all these partial arrays. Of course this assumes that `nc` will fit in shared memory, but you haven't told us much about the sizes of your data structures. – Paul R Aug 31 '12 at 10:05
  • I think reviewing parallel reduction would be helpful for you (the linked lecture notes contain code snippets): http://people.maths.ox.ac.uk/gilesm/cuda/lecs/lec4.pdf – Sayan Sep 04 '12 at 19:05
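
A minimal sketch of the shared-memory scheme Paul R describes above, applied to the question's kernel, could look as follows. The num_particles and num_cells parameters are hypothetical additions (the original kernel does not carry any sizes), and the sketch assumes the per-block partial histogram of num_cells ints fits in shared memory:

__global__ void Set_Nc_GPU_shared(int *nc, int *index, int num_particles, int num_cells)
{
    // --- Per-block partial histogram (num_cells ints, sized at launch)
    extern __shared__ int nc_partial[];

    // --- Zero the partial histogram
    for (int c = threadIdx.x; c < num_cells; c += blockDim.x)
        nc_partial[c] = 0;
    __syncthreads();

    // --- Accumulate block-locally; these atomics never leave shared memory
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx < num_particles)
        atomicAdd(&nc_partial[index[tidx]], 1);
    __syncthreads();

    // --- One global atomic per (block, cell) to merge the partial results
    for (int c = threadIdx.x; c < num_cells; c += blockDim.x)
        atomicAdd(&nc[c], nc_partial[c]);
}

A launch would then pass the shared-memory size explicitly, e.g. Set_Nc_GPU_shared<<<blocks, threads, num_cells * sizeof(int)>>>(nc, index, num_particles, num_cells);, with blocks and threads chosen to cover all particles.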

2 Answers


This is a late answer provided to remove this question from the unanswered list.

You have recognized that counting how many particles fall within each cell is equivalent to constructing a histogram. Histogram construction is a well-studied problem. The book by Shane Cook (CUDA Programming) contains a good discussion of this topic, and the CUDA samples include a histogram example. Moreover, histogram construction with CUDA Thrust is also possible. Finally, the CUDA Programming Blog offers some more insight.

Below I'm providing code to compare four different versions of histogram calculation:

  1. CPU;
  2. GPU with atomics (basically your approach);
  3. GPU with atomics in shared memory with final summation of partial histograms (basically, the approach proposed by Paul R);
  4. GPU by using CUDA Thrust.

If you run the code for the typical case of 100MB of data on a Kepler K20c, you get the following timings:

  1. CPU = 83ms;
  2. GPU with atomics = 15.8ms;
  3. GPU with atomics in shared memory = 17.7ms;
  4. GPU by CUDA Thrust = 40ms.

As you can see, and perhaps surprisingly, your "brute-force" solution is the fastest. This can be justified by noting that atomic operations are very fast on the most recent architectures (your post is dated Aug. 2012, when Kepler had not yet been released, at least in Europe).

Here is the code:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/generate.h>
#include <thrust/adjacent_difference.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>

#define SIZE (100*1024*1024) // 100 MB

/**********************************************/
/* FUNCTION TO GENERATE RANDOM UNSIGNED CHARS */
/**********************************************/
unsigned char* big_random_block(int size) {
    unsigned char *data = (unsigned char*)malloc(size);
    for (int i=0; i<size; i++)
        data[i] = rand();
    return data;
}

/***************************************/
/* GPU HISTOGRAM CALCULATION VERSION 1 */
/***************************************/
__global__ void histo_kernel1(unsigned char *buffer, long size, unsigned int *histo ) {

    // --- The number of threads does not cover all the data size
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd(&histo[buffer[i]], 1);
        i += stride;
    }
}

/***************************************/
/* GPU HISTOGRAM CALCULATION VERSION 2 */
/***************************************/
__global__ void histo_kernel2(unsigned char *buffer, long size, unsigned int *histo ) {

    // --- Allocating and initializing shared memory to store partial histograms
    __shared__ unsigned int temp[256];
    temp[threadIdx.x] = 0;
    __syncthreads();

    // --- The number of threads does not cover all the data size
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size)
    {
        atomicAdd(&temp[buffer[i]], 1);
        i += offset;
    }
    __syncthreads();

    // --- Summing histograms
    atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/********/
/* MAIN */
/********/
int main(){

    // --- Generating an array of SIZE unsigned chars
    unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

    /********************/
    /* CPU COMPUTATIONS */
    /********************/

    // --- Allocating host memory space and initializing the host-side histogram
    unsigned int histo[256];
    for (int i=0; i<256; i++) histo [i] = 0;

    clock_t start_CPU, stop_CPU;

    // --- Histogram calculation on the host
    start_CPU = clock();
    for (int i=0; i<SIZE; i++) histo [buffer[i]]++;
    stop_CPU = clock();
    float elapsedTime = (float)(stop_CPU - start_CPU) / (float)CLOCKS_PER_SEC * 1000.0f;
    printf("Time to generate (CPU): %3.1f ms\n", elapsedTime);

    // --- Indirect check of the result
    long histoCount = 0;
    for (int i=0; i<256; i++) { histoCount += histo[i]; }
    printf("Histogram Sum: %ld\n", histoCount);

    /********************/
    /* GPU COMPUTATIONS */
    /********************/

    // --- Initializing the device-side data
    unsigned char *dev_buffer;
    gpuErrchk(cudaMalloc((void**)&dev_buffer,SIZE));
    gpuErrchk(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));

    // --- Allocating device memory space for the device-side histogram
    unsigned int *dev_histo;
    gpuErrchk(cudaMalloc((void**)&dev_histo,256*sizeof(unsigned int)));

    // --- GPU timing
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // --- ATOMICS
    // --- Histogram calculation on the device - 2x the number of multiprocessors gives best timing
    gpuErrchk(cudaEventRecord(start,0));
    gpuErrchk(cudaMemset(dev_histo,0,256*sizeof(int)));
    cudaDeviceProp prop;
    gpuErrchk(cudaGetDeviceProperties(&prop,0));
    int blocks = prop.multiProcessorCount;
    histo_kernel1<<<blocks*2,256>>>(dev_buffer, SIZE, dev_histo);

    gpuErrchk(cudaMemcpy(histo,dev_histo,256*sizeof(int),cudaMemcpyDeviceToHost));
    gpuErrchk(cudaEventRecord(stop,0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop));
    printf("Time to generate (GPU):  %3.1f ms\n", elapsedTime);

    histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // --- Check the correctness of the results via the host
    for (int i=0; i<SIZE; i++) histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0) printf( "Failure at %d!  Off by %d\n", i, histo[i] );
    }

    // --- ATOMICS IN SHARED MEMORY
    // --- Histogram calculation on the device - 2x the number of multiprocessors gives best timing
    gpuErrchk(cudaEventRecord(start,0));
    gpuErrchk(cudaMemset(dev_histo,0,256*sizeof(int)));
    gpuErrchk(cudaGetDeviceProperties(&prop,0));
    blocks = prop.multiProcessorCount;
    histo_kernel2<<<blocks*2,256>>>(dev_buffer, SIZE, dev_histo);

    gpuErrchk(cudaMemcpy(histo,dev_histo,256*sizeof(int),cudaMemcpyDeviceToHost));
    gpuErrchk(cudaEventRecord(stop,0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop));
    printf("Time to generate (GPU):  %3.1f ms\n", elapsedTime);

    histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // --- Check the correctness of the results via the host
    for (int i=0; i<SIZE; i++) histo[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (histo[i] != 0) printf( "Failure at %d!  Off by %d\n", i, histo[i] );
    }

    // --- CUDA THRUST

    gpuErrchk(cudaEventRecord(start,0));

    // --- Wrapping dev_buffer raw pointer with a device_ptr and initializing a device_vector with it
    thrust::device_ptr<unsigned char> dev_ptr(dev_buffer);
    thrust::device_vector<unsigned char> dev_buffer_thrust(dev_ptr, dev_ptr + SIZE);

    // --- Sorting data to bring equal elements together
    thrust::sort(dev_buffer_thrust.begin(), dev_buffer_thrust.end());

    // - The number of histogram bins is equal to the maximum value plus one
    int num_bins = dev_buffer_thrust.back() + 1;

    // --- Resize histogram storage
    thrust::device_vector<int> d_histogram;
    d_histogram.resize(num_bins);

    // --- Find the end of each bin of values
    thrust::counting_iterator<int> search_begin(0);
    thrust::upper_bound(dev_buffer_thrust.begin(), dev_buffer_thrust.end(),
                    search_begin, search_begin + num_bins,
                    d_histogram.begin());

    // --- Compute the histogram by taking differences of the cumulative histogram
    thrust::adjacent_difference(d_histogram.begin(), d_histogram.end(),
                            d_histogram.begin());

    thrust::host_vector<int> h_histogram(d_histogram);
    gpuErrchk(cudaEventRecord(stop,0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop));
    printf("Time to generate (GPU):  %3.1f ms\n", elapsedTime);

    histoCount = 0;
    for (int i=0; i<256; i++) {
        histoCount += h_histogram[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // --- Check the correctness of the results via the host
    for (int i=0; i<SIZE; i++) h_histogram[buffer[i]]--;
    for (int i=0; i<256; i++) {
        if (h_histogram[i] != 0) printf( "Failure at %d!  Off by %d\n", i, h_histogram[i] );
    }

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(dev_histo));
    gpuErrchk(cudaFree(dev_buffer));

    free(buffer);

    getchar();

    return 0;
}
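
For reference, the code above should compile with a plain nvcc invocation along the lines of nvcc -arch=sm_35 histogram.cu -o histogram; the sm_35 target is just an assumption matching the K20c used for the timings.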
  • Hello and thanks for the code. I was wondering if you could provide the case (histo_kernel2) where we use two dimensions for threads and blocks: int y = threadIdx.y + blockIdx.y * blockDim.y; int x = threadIdx.x + blockIdx.x * blockDim.x;. Then, what will the offset be? Also, are we going to use temp[globalID] instead of temp[threadIdx.x]? And in the while statement, while (globalID < N) instead of while (i < N)? If you can answer, I'll appreciate it, thanks! If you can't, it's OK; again, thanks! – George Nov 27 '14 at 15:25
  • @George I'll try to provide the 2D case for this weekend. Sorry, but it's somewhat a hard time right now :-) – Vitality Nov 27 '14 at 22:45
  • No problem of course! Thank you very much for your help and for your time, really! – George Nov 28 '14 at 07:53
  • @George As promised, I have added a new answer, coping with the 2D case. – Vitality Nov 30 '14 at 22:22

Following George's comment, with this answer I'm covering the 2D case, in which the particles do not lie on a line but on a portion of a plane.

Actually, the 2D case requires only minor modifications of the code presented above. The only thing to do is define x and y coordinates for the particles and a 2D, 256 x 256 histogram.

The code below provides a full worked example.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/generate.h>
#include <thrust/adjacent_difference.h>
#include <thrust/binary_search.h>

#define SIZE (100*1024*1024) // 100 MB

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********************************************/
/* FUNCTION TO GENERATE RANDOM UNSIGNED CHARS */
/**********************************************/
unsigned char* big_random_block(int size) {
    unsigned char *data = (unsigned char*)malloc(size);
    for (int i=0; i<size; i++)
        data[i] = rand();
    return data;
}

/********************************/
/* GPU HISTOGRAM CALCULATION 2D */
/********************************/
__global__ void histo_kernel2(unsigned char *dev_x_coord, unsigned char *dev_y_coord, unsigned int *histo, unsigned int size) {

    // --- The number of threads does not cover all the data size
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size)
    {
        atomicAdd(&histo[dev_y_coord[i] * 256 + dev_x_coord[i]], 1);
        i += offset;
    }
}
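
// --- A sketch of the 2D thread/block indexing George asked about in the
// --- comments; histo_kernel2_2Dgrid is a hypothetical variant, not used for
// --- any timing above. Threads are indexed in two dimensions and linearized
// --- before the grid-stride loop; the bin update itself is unchanged.
__global__ void histo_kernel2_2Dgrid(unsigned char *dev_x_coord, unsigned char *dev_y_coord, unsigned int *histo, unsigned int size) {

    // --- Linearize the 2D thread coordinates into a single global index
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int i = y * (gridDim.x * blockDim.x) + x;

    // --- The stride is the total number of launched threads
    int offset = (gridDim.x * blockDim.x) * (gridDim.y * blockDim.y);
    while (i < size)
    {
        atomicAdd(&histo[dev_y_coord[i] * 256 + dev_x_coord[i]], 1);
        i += offset;
    }
}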

/********/
/* MAIN */
/********/
int main(){

    // --- Generating x- and y- coordinates of the particles
    unsigned char *x_coord = (unsigned char*)big_random_block(SIZE);
    unsigned char *y_coord = (unsigned char*)big_random_block(SIZE);

    /********************/
    /* CPU COMPUTATIONS */
    /********************/

    // --- Allocating host memory space and initializing the host-side histogram
    unsigned int *histo = (unsigned int*)malloc(256 * 256 * sizeof(unsigned int));
    for (int i=0; i < 256 * 256; i++) histo [i] = 0;

    clock_t start_CPU, stop_CPU;

    // --- Histogram calculation on the host
    start_CPU = clock();
    for (int i=0; i < SIZE; i++) histo[y_coord[i] * 256 + x_coord[i]]++;
    stop_CPU = clock();
    float elapsedTime = (float)(stop_CPU - start_CPU) / (float)CLOCKS_PER_SEC * 1000.0f;
    printf("Time to generate (CPU): %3.1f ms\n", elapsedTime);

    // --- Indirect check of the result
    long histoCount = 0;
    for (int i=0; i < 256 * 256; i++) { histoCount += histo[i]; }
    printf("Histogram Sum: %ld\n", histoCount);

    /********************/
    /* GPU COMPUTATIONS */
    /********************/

    // --- Initializing the device-side data
    unsigned char *dev_x_coord, *dev_y_coord;
    gpuErrchk(cudaMalloc((void**)&dev_x_coord,SIZE));
    gpuErrchk(cudaMalloc((void**)&dev_y_coord,SIZE));
    gpuErrchk(cudaMemcpy(dev_x_coord, x_coord, SIZE, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_y_coord, y_coord, SIZE, cudaMemcpyHostToDevice));

    // --- Allocating device memory space for the device-side histogram
    unsigned int *dev_histo;
    gpuErrchk(cudaMalloc((void**)&dev_histo,256*256*sizeof(unsigned int)));

    // --- GPU timing
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // --- ATOMICS
    gpuErrchk(cudaEventRecord(start,0));
    gpuErrchk(cudaMemset(dev_histo,0,256*256*sizeof(unsigned int)));
    cudaDeviceProp prop;
    gpuErrchk(cudaGetDeviceProperties(&prop,0));

    int blocks = prop.multiProcessorCount;
    histo_kernel2<<<blocks*2,256>>>(dev_x_coord, dev_y_coord, dev_histo, SIZE);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(histo, dev_histo, 256 * 256 * sizeof(unsigned int),cudaMemcpyDeviceToHost));
    gpuErrchk(cudaEventRecord(stop,0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop));
    printf("Time to generate (GPU):  %3.1f ms\n", elapsedTime);

    histoCount = 0;
    for (int i=0; i<256 * 256; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    // --- Check the correctness of the results via the host
    for (int i=0; i<SIZE; i++) histo[y_coord[i] * 256 + x_coord[i]]--;
    for (int i=0; i<256*256; i++) {
        if (histo[i] != 0) printf( "Failure at %d!  Off by %d\n", i, histo[i] );
    }

    gpuErrchk(cudaFree(dev_histo));
    gpuErrchk(cudaFree(dev_x_coord));
    gpuErrchk(cudaFree(dev_y_coord));

    free(histo);
    free(x_coord);
    free(y_coord);

    return 0;
}
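
As a usage note for the 2D-grid sketch above, a launch along the following lines would work; the block and grid shapes are illustrative (16 x 16 = 256 threads per block, as in the 1D version):

dim3 threads2D(16, 16);
dim3 blocks2D(2 * prop.multiProcessorCount, 2);
histo_kernel2_2Dgrid<<<blocks2D, threads2D>>>(dev_x_coord, dev_y_coord, dev_histo, SIZE);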
  • Thank you very much! Nice thought using dev_x_coord and dev_y_coord. Regarding my previous comment, is it possible? Is my comment right? Thanks again! – George Dec 01 '14 at 08:17