I am trying to copy from a source float array (containing 1.0f) to a destination float array (containing 2.0f) inside a CUDA kernel. I tried three different approaches, using:
- cudaMemcpyAsync
- memcpy
- direct copy (dst[i] = src[i])
When I read the results after the kernel has executed, I found that both cudaMemcpyAsync and memcpy failed to copy, while the direct copy method worked.
Why did the cudaMemcpyAsync and memcpy methods fail?
I am using GTX TitanX(SM_52).
compiled using: nvcc -arch=compute_52 main.cu
main.cu:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
/**
 * Copies `size` floats from `src` to `dst`, one element per thread.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread handles the element at
 * its flat global index; threads whose index is >= `size` do nothing, so the
 * launch must supply at least `size` threads for a complete copy.
 * No shared memory is used.
 *
 * @param src   device pointer to the source array (at least `size` floats)
 * @param dst   device pointer to the destination array (at least `size` floats)
 * @param size  number of elements to copy
 */
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        // Pointer arithmetic on a float* is already scaled by sizeof(float),
        // so the element address is `dst + idx`, NOT `dst + idx*sizeof(float)`
        // (that form lands 4x too far and runs out of bounds for idx >= size/4).
        // The third argument of memcpy/cudaMemcpyAsync is a BYTE count, so a
        // single float needs sizeof(float), not 1.
        //
        // Corrected equivalents of the direct assignment below:
        //   memcpy(dst + idx, src + idx, sizeof(float));
        //   cudaMemcpyAsync(dst + idx, src + idx, sizeof(float),
        //                   cudaMemcpyDeviceToDevice);
        // NOTE: calling cudaMemcpyAsync from device code uses the CUDA device
        // runtime (dynamic parallelism) and additionally requires building
        // with `-rdc=true -lcudadevrt`.
        dst[idx] = src[idx];
    }
}
// Last value handed out by UniqueNumber(); starts at 0 so the first call yields 1.
int current = 0;

// Advances the global counter by one and returns the new value.
int UniqueNumber()
{
    current += 1;
    return current;
}
// Aborts with file/line and the CUDA error string when a runtime call fails.
// Guarded so it does not clash with a definition elsewhere in the file.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

/**
 * Host driver: fills a source array with 1.0f and a destination array with
 * 2.0f, uploads both, runs cudamemcpy_inside_kernel to copy source into
 * destination on the device, downloads the destination and prints every
 * element (one per line).
 *
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main(void)
{
    int N = 1000;
    const size_t bytes = N * sizeof(float);

    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(x);
        free(y);
        return EXIT_FAILURE;
    }

    float *d_x = NULL;
    float *d_y = NULL;
    CUDA_CHECK(cudaMalloc(&d_x, bytes));
    CUDA_CHECK(cudaMalloc(&d_y, bytes));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    CUDA_CHECK(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid always covers N, whatever N is; the kernel's
    // bounds check discards the surplus threads in the last block.
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;
    cudamemcpy_inside_kernel<<<blocks, threads>>>(d_x, d_y, N);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Blocking copy: y is valid on return, so no further synchronisation is needed.
    CUDA_CHECK(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++)
        printf(" %f\n", y[i]); // y[i] should have all 1.0f

    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    free(x);
    free(y);
    return 0;
}