I have an issue with a simple CUDA code to produce a histogram:
__#include <math.h>
#include <numeric>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 256
__global__ void kernel_histogram(int* dev_histogram, int* dev_values_arr, unsigned int size) {
__shared__ int temp[BLOCK_SIZE + 1];
int thread_id, thread_value;
thread_id = threadIdx.x + blockIdx.x * blockDim.x;
if (thread_id >= size) {
return;
}
temp[threadIdx.x + 1] = 0;
__syncthreads();
thread_value = dev_values_arr[thread_id];
atomicAdd(&temp[thread_value], 1);
__syncthreads();
atomicAdd(&(dev_histogram[threadIdx.x + 1]), temp[threadIdx.x + 1]);
}
int* histogram_cuda(int* values_arr, int size) {
int num_blocks = size / BLOCK_SIZE;
int* dev_histogram = 0;
int* dev_values_arr = 0;
int* histogram = (int*)malloc((BLOCK_SIZE + 1) * sizeof(int));
cudaError_t cudaStatus;
if (size % BLOCK_SIZE != 0) {
num_blocks = num_blocks + 1;
}
// allocate histogram and values_arr device memories
cudaStatus = cudaMalloc((void**)&dev_histogram,
(BLOCK_SIZE + 1) * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
cudaStatus = cudaMemset(dev_histogram, 0, (BLOCK_SIZE + 1) * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMemset() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
cudaStatus = cudaMalloc((void**)&dev_values_arr, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
// copy values_arr memory in host to device
cudaStatus = cudaMemcpy(dev_values_arr, values_arr, size * sizeof(int),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
printf("the number of blocks is %d\n\n", num_blocks);
// calculate histogram on the gpu
kernel_histogram << <num_blocks, BLOCK_SIZE >> > (dev_histogram, dev_values_arr,
size);
// copy histogram memory in device to host
cudaStatus = cudaMemcpy(histogram, dev_histogram,
(BLOCK_SIZE + 1) * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
cudaGetErrorString(cudaStatus));
exit(-1);
}
// free device memory
cudaFree(dev_histogram);
cudaFree(dev_values_arr);
return histogram;
}
int main(int argc, char* argv[]) {
unsigned int size = 21;
int* histogram;
int values_arr[] = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 };
histogram = histogram_cuda(values_arr, size);
for (int i = 1; i < BLOCK_SIZE + 1; i++) {
if (histogram[i] > 0) {
printf("%d : %d\n", i, histogram[i]);
}
}
}
The histogram is meant to record the number of values present in the input, with the allowed values being 1 to 256. Each block is to have a maximum of 256 threads. I am trying to limit the number of overall threads across the blocks to so that each threads records the occurrence of one value in the histogram.
if I use "values_arr = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 }" which means the size is 21, I get:
2 : 7 4 : 1 5 : 4 7 : 4 19 : 1 20 : 1 21 : 1
I am trying to make it so that each value is recorded by one thread and all useless threads are disposed of. Also, any other problems you spot and any suggestions to make this in the best possible way would be appreciated. Thanks!