The following host code test.c and device code test0.cu are intended to give the same result.
test.c
$ cat test.c
#include <stdio.h>
#include <string.h>

int main()
{
    int data[32];
    int dummy[32];
    for (int i = 0; i < 32; i++)
        data[i] = i;
    memcpy(dummy, data, sizeof(data));
    for (int i = 1; i < 32; i++)
        data[i] += dummy[i - 1];
    memcpy(dummy, data, sizeof(data));
    for (int i = 2; i < 32; i++)
        data[i] += dummy[i - 2];
    memcpy(dummy, data, sizeof(data));
    for (int i = 4; i < 32; i++)
        data[i] += dummy[i - 4];
    memcpy(dummy, data, sizeof(data));
    for (int i = 8; i < 32; i++)
        data[i] += dummy[i - 8];
    memcpy(dummy, data, sizeof(data));
    for (int i = 16; i < 32; i++)
        data[i] += dummy[i - 16];
    printf("kernel : ");
    for (int i = 0; i < 32; i++)
        printf("%4i ", data[i]);
    printf("\n");
}
$
test0.cu
$ cat test0.cu
#include <stdio.h>

__global__ void kernel0(int *data)
{
    size_t t_id = threadIdx.x;
    if (1 <= t_id)
        data[t_id] += data[t_id - 1];
    if (2 <= t_id)
        data[t_id] += data[t_id - 2];
    if (4 <= t_id)
        data[t_id] += data[t_id - 4];
    if (8 <= t_id)
        data[t_id] += data[t_id - 8];
    if (16 <= t_id)
        data[t_id] += data[t_id - 16];
}

int main()
{
    int data[32];
    int result[32];
    int *data_d;
    cudaMalloc(&data_d, sizeof(data));
    for (int i = 0; i < 32; i++)
        data[i] = i;
    dim3 gridDim(1);
    dim3 blockDim(32);
    cudaMemcpy(data_d, data, sizeof(data), cudaMemcpyHostToDevice);
    kernel0<<<gridDim, blockDim>>>(data_d);
    cudaMemcpy(result, data_d, sizeof(data), cudaMemcpyDeviceToHost);
    printf("kernel0 : ");
    for (int i = 0; i < 32; i++)
        printf("%4i ", result[i]);
    printf("\n");
}
$
If I compile and run them, they do give the same result, as expected.
$ gcc -o test test.c
$ ./test
kernel : 0 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153 171 190 210 231 253 276 300 325 351 378 406 435 465 496
$ nvcc -o test_dev0 test0.cu
$ ./test_dev0
kernel0 : 0 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153 171 190 210 231 253 276 300 325 351 378 406 435 465 496
$
However, if I use shared memory instead of global memory in the device code, as in test1.cu, it gives a different result.
test1.cu
$ cat test1.cu
#include <stdio.h>

__global__ void kernel1(int *data)
{
    __shared__ int data_s[32];
    size_t t_id = threadIdx.x;
    data_s[t_id] = data[t_id];
    if (1 <= t_id)
        data_s[t_id] += data_s[t_id - 1];
    if (2 <= t_id)
        data_s[t_id] += data_s[t_id - 2];
    if (4 <= t_id)
        data_s[t_id] += data_s[t_id - 4];
    if (8 <= t_id)
        data_s[t_id] += data_s[t_id - 8];
    if (16 <= t_id)
        data_s[t_id] += data_s[t_id - 16];
    data[t_id] = data_s[t_id];
}

int main()
{
    int data[32];
    int result[32];
    int *data_d;
    cudaMalloc(&data_d, sizeof(data));
    for (int i = 0; i < 32; i++)
        data[i] = i;
    dim3 gridDim(1);
    dim3 blockDim(32);
    cudaMemcpy(data_d, data, sizeof(data), cudaMemcpyHostToDevice);
    kernel1<<<gridDim, blockDim>>>(data_d);
    cudaMemcpy(result, data_d, sizeof(data), cudaMemcpyDeviceToHost);
    printf("kernel1 : ");
    for (int i = 0; i < 32; i++)
        printf("%4i ", result[i]);
    printf("\n");
}
$
If I compile test1.cu and run it, it gives a different result from test0.cu or test.c.
$ nvcc -o test_dev1 test1.cu
$ ./test_dev1
kernel1 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
$
Is warp synchronization not supposed to work with shared memory?
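To make the question concrete: the sketch below is my understanding of what explicit warp synchronization would look like for kernel1, using __syncwarp() (available since CUDA 9). The kernel name kernel1_sync and the exact placement of the barriers are my own guesses, not something I have verified to be the intended replacement for the implicit warp-synchronous behavior.

__global__ void kernel1_sync(int *data)
{
    __shared__ int data_s[32];
    size_t t_id = threadIdx.x;
    data_s[t_id] = data[t_id];
    __syncwarp();
    for (int offset = 1; offset < 32; offset *= 2)
    {
        // Read the neighbor value into a register first, then
        // synchronize the warp before writing, so the read and
        // the write of the same step cannot overlap between lanes.
        int v = 0;
        if (offset <= t_id)
            v = data_s[t_id - offset];
        __syncwarp();
        if (offset <= t_id)
            data_s[t_id] += v;
        __syncwarp();
    }
    data[t_id] = data_s[t_id];
}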
Some investigation into this issue:
When using CUDA 8.0, if I compile test1.cu with the -arch=sm_61 option (I'm testing with a GTX 1080), it gives the same result as test0.cu and test.c.
$ nvcc -o test_dev1_arch -arch=sm_61 test1.cu
$ ./test_dev1_arch
kernel1 : 0 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153 171 190 210 231 253 276 300 325 351 378 406 435 465 496
$
But this does not apply to newer versions of CUDA. With any version newer than 8.0, the result is different even if I pass the -arch=sm_61 option.