I have an issue with a simple piece of CUDA code that produces a histogram:

#include <math.h>
#include <numeric>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

__global__ void kernel_histogram(int* dev_histogram, int* dev_values_arr, unsigned int size) {

    __shared__ int temp[BLOCK_SIZE + 1];
    int thread_id, thread_value;

    thread_id = threadIdx.x + blockIdx.x * blockDim.x;

    if (thread_id >= size) {
        return;
    }

    temp[threadIdx.x + 1] = 0;
    __syncthreads();

    thread_value = dev_values_arr[thread_id];
    atomicAdd(&temp[thread_value], 1);
    __syncthreads();

    atomicAdd(&(dev_histogram[threadIdx.x + 1]), temp[threadIdx.x + 1]);
}

int* histogram_cuda(int* values_arr, int size) {

    int num_blocks = size / BLOCK_SIZE;
    int* dev_histogram = 0;
    int* dev_values_arr = 0;
    int* histogram = (int*)malloc((BLOCK_SIZE + 1) * sizeof(int));

    cudaError_t cudaStatus;

    if (size % BLOCK_SIZE != 0) {
        num_blocks = num_blocks + 1;
    }

    // allocate histogram and values_arr device memories
    cudaStatus = cudaMalloc((void**)&dev_histogram,
        (BLOCK_SIZE + 1) * sizeof(int));

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    cudaStatus = cudaMemset(dev_histogram, 0, (BLOCK_SIZE + 1) * sizeof(int));

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMemset() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    cudaStatus = cudaMalloc((void**)&dev_values_arr, size * sizeof(int));

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    // copy values_arr memory in host to device
    cudaStatus = cudaMemcpy(dev_values_arr, values_arr, size * sizeof(int),
        cudaMemcpyHostToDevice);

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }


    printf("the number of blocks is %d\n\n", num_blocks);

    // calculate histogram on the gpu
    kernel_histogram<<<num_blocks, BLOCK_SIZE>>>(dev_histogram, dev_values_arr,
        size);

    // copy histogram memory in device to host
    cudaStatus = cudaMemcpy(histogram, dev_histogram,
        (BLOCK_SIZE + 1) * sizeof(int), cudaMemcpyDeviceToHost);

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    // free device memory
    cudaFree(dev_histogram);
    cudaFree(dev_values_arr);

    return histogram;
}

int main(int argc, char* argv[]) {

    unsigned int size = 21;
    int* histogram;
    int values_arr[] = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 };

    histogram = histogram_cuda(values_arr, size);

    for (int i = 1; i < BLOCK_SIZE + 1; i++) {
        if (histogram[i] > 0) {
            printf("%d : %d\n", i, histogram[i]);
        }
    }
}

The histogram is meant to record how many times each value occurs in the input, with the allowed values being 1 to 256. Each block is to have a maximum of 256 threads. I am trying to limit the overall number of threads across the blocks so that each thread records the occurrence of one input value in the histogram.

If I use "values_arr = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 }", which means the size is 21, I get:

2 : 7
4 : 1
5 : 4
7 : 4
19 : 1
20 : 1
21 : 1

I am trying to make it so that each value is recorded by exactly one thread and all surplus threads are disposed of. Any other problems you spot, and any suggestions for doing this in the best possible way, would also be appreciated. Thanks!

  • I would also like to add that the values are between 1 and 256. There can be only 256 threads per block and if the list of values is bigger than 256 we simply add blocks as needed. – RoyAbr121 Oct 25 '20 at 12:58
  • You have illegal use of `__syncthreads()` in your kernel code. The reason you are not getting the full output you expect is that you have allowed threads whose index is greater than `size` to `return` at the top of your kernel, without doing anything. Now consider whether this makes sense for the last line of your kernel, where you need to update values in bins 100 and 256, both of which are larger than `size`. – Robert Crovella Oct 25 '20 at 20:31
  • The kernel in your question is now different from what you posted originally, and broken. Why? – talonmies Oct 25 '20 at 23:14

1 Answer

In the new version of the code in your question, you have two conditionally executed __syncthreads() calls, which is illegal in CUDA and prone to either deadlocking or producing undefined behaviour, depending on the hardware you have and the use case.

If I modify the kernel like this:

__global__ void kernel_histogram(int* dev_histogram, int* dev_values_arr, unsigned int size) {

    __shared__ int temp[BLOCK_SIZE + 1];
    int thread_id, thread_value;

    thread_id = threadIdx.x + blockIdx.x * blockDim.x;

    temp[threadIdx.x + 1] = 0;
    // Synchronization is unconditional
    __syncthreads();

    // Load is performed conditionally
    if (thread_id < size) {
        thread_value = dev_values_arr[thread_id];
        atomicAdd(&temp[thread_value], 1);
    }

    // Synchronization is unconditional
    __syncthreads();

    atomicAdd(&(dev_histogram[threadIdx.x + 1]), temp[threadIdx.x + 1]);
}

I get this output:

the number of blocks is 1

2 : 7
4 : 1
5 : 4
7 : 4
19 : 1
20 : 1
21 : 1
100 : 1
256 : 1

To my eyes, this looks much more like what is expected.
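As a side note: the host code checks every allocation and copy, but never the kernel launch itself, and launches are asynchronous, so errors surface later. A minimal sketch of what could be added directly after the launch in histogram_cuda, reusing the existing cudaStatus variable (the cudaDeviceSynchronize() call is there only to surface asynchronous execution errors explicitly, rather than relying on the following cudaMemcpy):

    // catch configuration errors from the <<<...>>> launch itself
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA kernel launch failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    // block until the kernel finishes and report any execution errors
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA kernel execution failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }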

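One further observation: the final atomicAdd flush only covers every bin because the number of bins (256) happens to equal BLOCK_SIZE. A sketch of a more general variant, with the bin count decoupled from the block size (kernel_histogram_strided and NUM_BINS are illustrative names, not from the original code), using a grid-stride loop so that num_blocks no longer has to provide one thread per input element:

#define NUM_BINS 256

__global__ void kernel_histogram_strided(int* dev_histogram, int* dev_values_arr, unsigned int size) {

    __shared__ int temp[NUM_BINS + 1];

    // zero the shared bins, looping in case blockDim.x != NUM_BINS + 1
    for (int i = threadIdx.x; i < NUM_BINS + 1; i += blockDim.x) {
        temp[i] = 0;
    }
    __syncthreads();

    // grid-stride loop: handles any input size with any launch configuration
    for (unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; i < size;
        i += blockDim.x * gridDim.x) {
        atomicAdd(&temp[dev_values_arr[i]], 1);
    }
    __syncthreads();

    // flush this block's partial histogram to global memory
    for (int i = threadIdx.x; i < NUM_BINS + 1; i += blockDim.x) {
        atomicAdd(&dev_histogram[i], temp[i]);
    }
}

Both __syncthreads() calls are executed unconditionally by every thread in the block, so the synchronization stays legal for any size.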