-2

I am implementing a circular global memory to enable all threads read/write data to the same buffer simultaneously. It is a very simple producer/consumer algorithm in cpu. But i found something wrong in my cuda code. The circular buffer was defined as follows:

#define BLOCK_NUM 1024
#define THREAD_NUM 64
#define BUFFER_SIZE BLOCK_NUM*THREAD_NUM*10
struct Stack {
    bool bDirty[BUFFER_SIZE];
    unsigned int index;
    unsigned int iStackSize;
}

The read device is implemented as

__device__ void read(Stack *pStack) {
    unsigned int index = atomicDec(&pStack->index, BUFFER_SIZE-1);
    if(- -index >= BUFFER_SIZE)
        index = BUFFER_SIZE - 1;
    // check
    if(pStack->bDirty[index] == false) {
        printf(“no data\n”);
        return;
    }
    //set read flag
    pStack->bDirty[index] = false;
    atomicSub(&pStack->iStackSize, 1);
}

The write device function is:

__device__ void write(Stack *pStack) {
    unsigned int index = atomicInc(&pStack->index, BUFFER_SIZE - 1);
    //check
    if(pStack->bDirty[index] == true) {
        printf(“why dirty\n”);
        return;
    }
    pStack->bDirty[index] = true;
    atomicAdd(&pStack->iStackSize, 1);
}

In order to test the read/write function in a more robust way, I write the following kernels:

__global__ void kernelWrite(Stack *pStack) {
    if(threadIdx.x != 0) //make write less than thread number for testing purpose
        write(pStack);
}

__global__ void kernelRead(Stack *pStack) {
    read(pStack);
    __syncthreads();
    if(threadIdx.x % 3 != 0) // make write less than read
        write(pStack);
    __syncthreads();
}

In the main function, I used a dead loop to test if the read/write is atomic.

int main() {
    Stack *pHostStack = (Stack*)malloc(sizeof(Stack));
    Stack *pStack;
    cudaMalloc(&pStack, sizeof(Stack));
    cudaMemset(pStack, 0, sizeof(Stack));

    while(true) { //dead loop
        kernelWrite<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
        cudaDeviceSynchonize();
        cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
        while(pHost->iStackSize >= BLOCK_NUM*THREAD_NUM) {
             kernelRead<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
                   cudaDeviceSynchonize();
                   cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
         }
    return 0;
}

When I execute the above code, I got error msg “why dirty” and “no data”. What is wrong to the read/write logic?

By the way, I do not map the thread ID to the linear buffer address because in my application maybe only 10% threads write to the buffer, it is unpredictable/random.

Jannus YU
  • 89
  • 6
  • 1
    I don't see any code for initializing the contents of `pStack` before you use it. Does it exist? It would be much simpler if you posted a proper [MCVE] – talonmies Nov 13 '17 at 18:08
  • Sorry, i forgot to memset it because the code was uploaded by my phone but not from the pc. – Jannus YU Nov 13 '17 at 23:13
  • @talonmies Yes the memset exsits in my runable programe. This demo is complete except some header includes and – Jannus YU Nov 14 '17 at 01:24

1 Answers1

-2

The key problem is that the atomic operation is not real atomic because of reading and writing to the same buffer. The weird thing is that when the total thread number is less then 4096, no error message will be shown.

Jannus YU
  • 89
  • 6