I am implementing a circular global memory to enable all threads read/write data to the same buffer simultaneously. It is a very simple producer/consumer algorithm in cpu. But i found something wrong in my cuda code. The circular buffer was defined as follows:
#define BLOCK_NUM 1024
#define THREAD_NUM 64
#define BUFFER_SIZE BLOCK_NUM*THREAD_NUM*10
struct Stack {
bool bDirty[BUFFER_SIZE];
unsigned int index;
unsigned int iStackSize;
}
The read device is implemented as
__device__ void read(Stack *pStack) {
unsigned int index = atomicDec(&pStack->index, BUFFER_SIZE-1);
if(- -index >= BUFFER_SIZE)
index = BUFFER_SIZE - 1;
// check
if(pStack->bDirty[index] == false) {
printf(“no data\n”);
return;
}
//set read flag
pStack->bDirty[index] = false;
atomicSub(&pStack->iStackSize, 1);
}
The write device function is:
__device__ void write(Stack *pStack) {
unsigned int index = atomicInc(&pStack->index, BUFFER_SIZE - 1);
//check
if(pStack->bDirty[index] == true) {
printf(“why dirty\n”);
return;
}
pStack->bDirty[index] = true;
atomicAdd(&pStack->iStackSize, 1);
}
In order to test the read/write function in a more robust way, I write the following kernels:
__global__ void kernelWrite(Stack *pStack) {
if(threadIdx.x != 0) //make write less than thread number for testing purpose
write(pStack);
}
__global__ void kernelRead(Stack *pStack) {
read(pStack);
__syncthreads();
if(threadIdx.x % 3 != 0) // make write less than read
write(pStack);
__syncthreads();
}
In the main function, I used a dead loop to test if the read/write is atomic.
int main() {
Stack *pHostStack = (Stack*)malloc(sizeof(Stack));
Stack *pStack;
cudaMalloc(&pStack, sizeof(Stack));
cudaMemset(pStack, 0, sizeof(Stack));
while(true) { //dead loop
kernelWrite<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
cudaDeviceSynchonize();
cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
while(pHost->iStackSize >= BLOCK_NUM*THREAD_NUM) {
kernelRead<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
cudaDeviceSynchonize();
cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
}
return 0;
}
When I execute the above code, I got error msg “why dirty” and “no data”. What is wrong to the read/write logic?
By the way, I do not map the thread ID to the linear buffer address because in my application maybe only 10% threads write to the buffer, it is unpredictable/random.