0

I have a class such as this:

// Small wrapper around a heap-allocated array of 5 doubles.
// The constructor allocates the storage with new[]; the class declares no
// destructor, so the buffer is never released by the object itself.
class CudaArray
{
public:
   // Allocates room for 5 doubles; the elements are left uninitialized.
   CudaArray() : Ptr(new double[5]) {}
   double* Ptr;     // address of the 5-element buffer allocated in the constructor
   int Dimension;   // not assigned by the constructor
};

and then another class such as this:

// Aggregate that refers to a CudaArray through a raw pointer, alongside
// three scalar fields. Members are public so that code outside the class
// (e.g. &(Cont->ArrayPtr)) can address them.
class Container
{
public:
   short a;
   CudaArray* ArrayPtr;   // non-owning pointer to a CudaArray object
   int b;
   int c;
};

Right now I'm creating the Array on the device in this way:

// Host object; its constructor allocates the 5-double buffer behind Ptr.
CudaArray H_Array;
// Host variable that will receive the address of the device-side object.
CudaArray* D_Array;

// Shallow copy of the object into device memory. After this step the
// device object's Ptr field still holds the value copied from H_Array.Ptr.
Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));

// Separate device buffer for the 5 doubles, filled from the host buffer.
double* Tmp;
Check(cudaMalloc(&Tmp, sizeof(double) * 5));
Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
// Overwrite the Ptr field inside the device object with the device address
// stored in Tmp. The source (&Tmp) is a host variable, hence HostToDevice.
Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));

I want to be able to use an object of type Container in device code, and I'm having trouble initializing the CudaArray member from an existing array. So far I have tried:

// Host variable that will receive the address of the device-side Container.
Container* Cont = nullptr;

// Allocate the device-side Container; none of its fields are written here.
Check(cudaMalloc(&Cont , sizeof(Container)));
// Attempt to store the device address held in D_Array into Cont->ArrayPtr.
// NOTE(review): the source operand &D_Array is the address of a host
// variable, while the copy kind passed here is cudaMemcpyDeviceToDevice.
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));

But I get GPUassert: invalid argument on the last cudaMemcpy.

How can I initialize a device class that contains a pointer to an existing object(class) in device memory?

And also, is there a simpler or more elegant way to copy complex objects between host and device?

AathakA
  • 117
  • 7
  • 1
    This is a frequently asked question. There are many questions already on the SO `cuda` tag that explain how to copy classes with embedded pointers between host and device. – Robert Crovella Dec 03 '21 at 14:55
  • I read several of your answers, but sadly they didn't quite cover what I'm looking for here... – AathakA Dec 03 '21 at 15:15
  • Put simply, you can’t use a class like your CudaArray with memory allocation in the constructor via new – talonmies Dec 03 '21 at 17:29

1 Answer

1
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));

But I get GPUassert: invalid argument on the last cudaMemcpy.

The error is that you are specifying cudaMemcpyDeviceToDevice but &D_Array is a location in host memory:

// The pointer variable itself lives in host memory; only the value it holds refers to device memory.
CudaArray* D_Array;

You should be using cudaMemcpyHostToDevice.

How can I initialize a device class that contains a pointer to an existing object(class) in device memory?

That change seems to fix it for me:

$ cat t174.cu
#include <cstdio>
// Wrapper around a raw pointer to 5 doubles plus an integer field.
// Construction allocates the doubles with new[]; nothing in this type
// releases them, and Dimension is left unset.
struct CudaArray
{
   double* Ptr;      // set by the constructor to a fresh double[5]
   int Dimension;    // never written by the constructor

   // Allocate the 5-element buffer (elements are not initialized).
   CudaArray() : Ptr(new double[5]) {}
};

// Plain aggregate: three scalar fields and a raw pointer to a CudaArray.
// All members are publicly accessible; member order is unchanged.
struct Container
{
   short a;
   CudaArray* ArrayPtr;   // raw, non-owning pointer to a CudaArray
   int b;
   int c;
};
// Reports a failed CUDA runtime call on stderr and returns the status
// unchanged, so Check(...) can still be used as an expression.
// Reporting only: execution continues after a failure.
inline cudaError_t CheckReport(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess)
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  return code;
}
// Wraps a CUDA runtime call; evaluates to the call's cudaError_t.
#define Check(x) CheckReport((x), __FILE__, __LINE__)
// Kernel: prints element 0 of the double array reached through
// c->ArrayPtr->Ptr. Every thread that runs performs the same print, and
// all three pointers in that chain are dereferenced in device code.
__global__ void k(Container *c){
  const CudaArray *array = c->ArrayPtr;
  const double first = array->Ptr[0];
  printf("%f\n", first);
}
// Builds a Container -> CudaArray -> double[5] chain in device memory with
// explicit cudaMalloc/cudaMemcpy calls, launches k on it with one thread,
// then releases everything. Returns 0 on success and 1 if the launch or
// the synchronize reports an error.
int main(){

  // Host object; only element 0 of its buffer is given a value.
  CudaArray H_Array;
  H_Array.Ptr[0] = 1234.0;
  CudaArray* D_Array = nullptr;

  // Shallow copy of the object; its Ptr field is patched further down.
  Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
  Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));

  // Device buffer for the 5 doubles, then store its address into the
  // device object's Ptr field. &Tmp is a host address, hence HostToDevice.
  double* Tmp = nullptr;
  Check(cudaMalloc(&Tmp, sizeof(double) * 5));
  Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
  Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));

  Container* Cont = nullptr;

  // &D_Array is likewise a host address that holds a device pointer value.
  Check(cudaMalloc(&Cont , sizeof(Container)));
  Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyHostToDevice));

  k<<<1,1>>>(Cont);
  // A launch does not return a status: query it, then wait for the kernel
  // so that faults raised during execution are observed as well.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) status = cudaDeviceSynchronize();
  if (status != cudaSuccess)
    fprintf(stderr, "kernel k failed: %s\n", cudaGetErrorString(status));

  // Release the device allocations and the host buffer made by new[] in
  // the CudaArray constructor (the class has no destructor).
  Check(cudaFree(Cont));
  Check(cudaFree(Tmp));
  Check(cudaFree(D_Array));
  delete[] H_Array.Ptr;

  return status == cudaSuccess ? 0 : 1;
}
$ nvcc -o t174 t174.cu
$ cuda-memcheck ./t174
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$

And also, is there a simpler or more elegant way to copy complex objects between host and device?

There might be performance impacts, but from a code complexity standpoint, if you make all allocations via a managed allocator, things may be simpler (simplicity is in the eye of the beholder):

$ cat t175.cu
#include <cstdio>
#include <new>
// Wrapper around a managed (cudaMallocManaged) buffer of 5 doubles.
// On successful construction every element is 0.0. If the allocation
// fails, the error is printed to stderr and Ptr is left as nullptr, so
// callers can test Ptr before using it. Dimension is not assigned here.
class CudaArray
{
  public:
   CudaArray() : Ptr(nullptr)
   {
      const cudaError_t status = cudaMallocManaged(&Ptr, 5*sizeof(double));
      if (status != cudaSuccess) {
         fprintf(stderr, "CudaArray: cudaMallocManaged failed: %s\n", cudaGetErrorString(status));
         Ptr = nullptr;   // do not rely on what the failed call left behind
         return;
      }
      for (int i = 0; i < 5; i ++) Ptr[i] = 0.0;
   }
   double* Ptr;     // managed buffer of 5 doubles, or nullptr on failure
   int Dimension;   // never written by the constructor
};

// Plain aggregate: three scalar fields and a raw pointer to a CudaArray.
// All members are publicly accessible; member order is unchanged.
struct Container
{
   short a;
   CudaArray* ArrayPtr;   // raw, non-owning pointer to a CudaArray
   int b;
   int c;
};
// Reports a failed CUDA runtime call on stderr and returns the status
// unchanged, so Check(...) can still be used as an expression.
// Reporting only: execution continues after a failure.
inline cudaError_t CheckReport(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess)
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  return code;
}
// Wraps a CUDA runtime call; evaluates to the call's cudaError_t.
#define Check(x) CheckReport((x), __FILE__, __LINE__)
// Kernel: prints element 0 of the double array reached through
// c->ArrayPtr->Ptr. Every thread that runs performs the same print, and
// all three pointers in that chain are dereferenced in device code.
__global__ void k(Container *c){
  const CudaArray *array = c->ArrayPtr;
  const double first = array->Ptr[0];
  printf("%f\n", first);
}
// Builds the Container -> CudaArray -> double[5] chain entirely in managed
// memory, launches k on it with one thread, then frees the allocations.
// Returns 0 on success and 1 if any checked step reports an error.
int main(){

  // Raw managed storage for one CudaArray. It must exist before the
  // object is constructed in place, so bail out if the allocation fails.
  CudaArray *my_Array = nullptr;
  cudaError_t status = Check(cudaMallocManaged(&my_Array, sizeof(CudaArray)));
  if (status != cudaSuccess) {
    fprintf(stderr, "allocating CudaArray failed: %s\n", cudaGetErrorString(status));
    return 1;
  }
  // Placement-new runs the constructor inside the managed storage.
  new(my_Array) CudaArray();
  my_Array[0].Ptr[0] = 1234.0;

  Container* Cont = nullptr;

  status = Check(cudaMallocManaged(&Cont , sizeof(Container)));
  if (status == cudaSuccess) {
    // Managed memory is host-accessible, so the field is assigned directly.
    Cont[0].ArrayPtr = my_Array;
    k<<<1,1>>>(Cont);
    // A launch does not return a status: query it, then wait for the
    // kernel so that faults raised during execution are observed as well.
    status = cudaGetLastError();
    if (status == cudaSuccess) status = cudaDeviceSynchronize();
  }
  if (status != cudaSuccess)
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));

  // Free the Container, the buffer allocated by the CudaArray constructor,
  // and finally the storage that held the CudaArray object itself.
  Check(cudaFree(Cont));
  Check(cudaFree(my_Array->Ptr));
  Check(cudaFree(my_Array));

  return status == cudaSuccess ? 0 : 1;
}
$ nvcc -o t175 t175.cu
$ cuda-memcheck ./t175
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257