cudaMemcpyAsync and memcpy fail to copy inside a kernel while direct copying works

Question

I am trying to copy from a source float array (containing 1.0f) to a destination float array (containing 2.0f) inside a CUDA kernel. I tried three different approaches:

cudaMemcpyAsync
memcpy
direct copy (dst[i] = src[i])

When I read the results after the kernel has executed, I find that both cudaMemcpyAsync and memcpy have failed to copy, while the direct copy method has worked.

Why did the cudaMemcpyAsync and memcpy methods fail?

I am using a GTX Titan X (SM_52).

compiled using: nvcc -arch=compute_52 main.cu

main.cu:

#include <stdio.h>
#include <iostream>


// Kernel from the question: every thread whose global index is below `size`
// is meant to copy one float from `src` to `dst`. As posted, all three
// candidate copy statements are commented out, so the kernel copies nothing
// until one of them is enabled.
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  // Flat global thread index computed from the block and thread x-coordinates.
  int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard: threads past the end of the arrays must not touch memory.
    if(idx < size){
        // Why the first two variants fail: `dst` and `src` are float*, so adding
        // idx*sizeof(float) advances by idx*sizeof(float) ELEMENTS rather than
        // bytes, and the size argument of 1 asks for a single byte instead of
        // sizeof(float) bytes.
//        memcpy(dst +idx*sizeof(float), src + idx*sizeof(float), 1); // FAILS TO COPY
//        cudaMemcpyAsync(dst +idx*sizeof(float), src + idx*sizeof(float), 1, cudaMemcpyDeviceToDevice); // FAILS TO COPY
//          dst[idx] = src[idx]; // COPIES SUCCESSFULLY
    }

}

// Counter state behind UniqueNumber(); starts at zero.
int current = 0;

// Advances the global counter by one and returns the new value, so the first
// call yields 1, the second 2, and so on.
int UniqueNumber()
{
  current += 1;
  return current;
}

// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Host driver: fills a source buffer with 1.0f and a destination buffer with
// 2.0f, uploads both, runs cudamemcpy_inside_kernel once, then prints the
// destination buffer as read back from the device.
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(void)
{
  const int N = 1000;
  const size_t bytes = N*sizeof(float);

  // Same launch shape as the original <<<2, 512>>>, derived from N by
  // ceiling division so the grid always covers every element.
  const int threads = 512;
  const int blocks = (N + threads - 1)/threads;

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  float *d_x = NULL, *d_y = NULL;

  bool ok = (x != NULL && y != NULL);
  if (!ok)
    fprintf(stderr, "host allocation failed\n");

  ok = ok && cudaCallOk(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  ok = ok && cudaCallOk(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  if (ok) {
    // Source holds 1.0f, destination holds the sentinel 2.0f.
    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }
  }

  ok = ok && cudaCallOk(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy(x -> d_x)");
  ok = ok && cudaCallOk(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy(y -> d_y)");

  if (ok) {
    cudamemcpy_inside_kernel<<<blocks, threads>>>(d_x, d_y, N);
    // A launch reports configuration errors via cudaGetLastError(); faults
    // raised while the kernel runs surface at the following synchronization.
    ok = cudaCallOk(cudaGetLastError(), "kernel launch") &&
         cudaCallOk(cudaDeviceSynchronize(), "kernel execution");
  }

  // cudaMemcpy is blocking, so no extra synchronization is needed before
  // reading y on the host.
  ok = ok && cudaCallOk(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost),
                        "cudaMemcpy(d_y -> y)");

  if (ok) {
    for (int i = 0; i < N; i++)
      printf(" %f\n", y[i]); // y[i] should have all 1.0f
  }

  // Release everything on every path; cudaFree and free both accept NULL.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);

  return ok ? 0 : 1;
}

Both the memcpy and cudaMemcpyAsync have incorrect arguments, that is why they don't work correctly — talonmies, Sep 10 '19 at 07:58

score 4 · Accepted Answer · answered Sep 10 '19 at 12:02

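You have the source, destination, and size arguments wrong in both memcpy calls. Pointer arithmetic on a float* is already scaled by sizeof(float), so the offset must be idx rather than idx*sizeof(float), and the size must be sizeof(float) bytes rather than 1. Something like this: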

#include <stdio.h>
#include <iostream>

// Copies one float per thread from `src` to `dst`. The copy mechanism is
// chosen at compile time by `action`:
//   1         -> memcpy called from device code
//   2         -> cudaMemcpyAsync called from device code
//   otherwise -> plain element assignment
// Threads whose global index is not below `size` do nothing.
template<int action>
__global__
void cudamemcpy_inside_kernel(float *src, float *dst, int size)
{
  const int gid = threadIdx.x + blockDim.x*blockIdx.x;
  if (gid >= size)
    return;

  // Element addresses: float* arithmetic is already scaled by sizeof(float).
  float *from = src + gid;
  float *to = dst + gid;

  if (action == 1) {
    memcpy(to, from, sizeof(float));
  } else if (action == 2) {
    cudaMemcpyAsync(to, from, sizeof(float), cudaMemcpyDeviceToDevice);
  } else {
    *to = *from;
  }
}

// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char *what)
{
  if (err == cudaSuccess)
    return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Runs one copy variant of cudamemcpy_inside_kernel<action> and prints the
// resulting destination values.
//   label  - heading printed before the values (printed verbatim)
//   d_src  - device source buffer of n floats
//   d_dst  - device destination buffer of n floats
//   h_dst  - host scratch buffer of n floats; overwritten with the result
//   n      - element count
// Returns true when every CUDA call succeeded.
template<int action>
static bool runCopyCase(const char *label, float *d_src, float *d_dst,
                        float *h_dst, int n)
{
  const size_t bytes = (size_t)n*sizeof(float);

  // Re-arm the destination with the 2.0f sentinel before EVERY case. Without
  // this, h_dst still holds the 1.0f values read back by the previous case, so
  // a variant that copied nothing would still appear to succeed.
  for (int i = 0; i < n; i++)
    h_dst[i] = 2.0f;

  printf("%s", label);
  if (!cudaSucceeded(cudaMemcpy(d_dst, h_dst, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(reset d_dst)"))
    return false;

  cudamemcpy_inside_kernel<action><<<(n+255)/256, 256>>>(d_src, d_dst, n);
  // Launch-configuration errors show up in cudaGetLastError(); faults raised
  // while the kernel runs surface at the following synchronization.
  if (!cudaSucceeded(cudaGetLastError(), "kernel launch"))
    return false;
  if (!cudaSucceeded(cudaDeviceSynchronize(), "kernel execution"))
    return false;

  if (!cudaSucceeded(cudaMemcpy(h_dst, d_dst, bytes, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(d_dst -> host)"))
    return false;

  for (int i = 0; i < n; i++)
    printf(" %f\n", h_dst[i]);
  return true;
}

// Host driver: uploads a source buffer of 1.0f values, then exercises the
// three copy variants (assignment, memcpy, cudaMemcpyAsync) in turn, printing
// the destination buffer after each one.
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(void)
{
  const int N = 10;
  const size_t bytes = N*sizeof(float);

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  float *d_x = NULL, *d_y = NULL;

  bool ok = (x != NULL && y != NULL);
  if (!ok)
    fprintf(stderr, "host allocation failed\n");

  ok = ok && cudaSucceeded(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  ok = ok && cudaSucceeded(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  if (ok) {
    for (int i = 0; i < N; i++)
      x[i] = 1.0f;
  }
  ok = ok && cudaSucceeded(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(x -> d_x)");

  ok = ok && runCopyCase<0>("Assignment \n", d_x, d_y, y, N);
  ok = ok && runCopyCase<1>("\n Memcpy \n", d_x, d_y, y, N);
  ok = ok && runCopyCase<2>("\n cudaMemcpyAsync \n", d_x, d_y, y, N);

  // Release everything on every path; cudaFree and free both accept NULL.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);

  return ok ? 0 : 1;
}

will work as you expect:

$ nvcc -arch=sm_52 -dc -o memcpy_kernel.o memcpy_kernel.cu
$ nvcc -arch=sm_52 -o memcpy_kernel memcpy_kernel.o
$ ./memcpy_kernel 
Assignment 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 Memcpy 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

 cudaMemcpyAsync 
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000
 1.000000

cudamemcpyasync, memcpy fails to copy inside kernel while direct copying works

1 Answer