I'm currently refactoring my CUDA code and stumbled across a problem: Passing multiple device pointers to the kernel can become quite ugly and, therefore, I thought of grouping them together inside a struct of arrays like so:
// Structure-of-arrays parameter bundle for the kernel.
// NOTE(review): the members are raw pointers; whether they point to host or
// device memory depends entirely on who filled the struct in — the same type
// is used for both roles in the host code below.
struct Data {
float* a; // input array (device pointer when passed to the kernel)
float* b; // input array (device pointer when passed to the kernel)
float* c; // output array: kernel writes c[i] = a[i] + b[i]
// ...
};
// Element-wise addition: c[i] = a[i] + b[i] for every i in [0, n).
// Expects a 1-D launch with at least n total threads; surplus threads are
// masked out by the bounds check. `data` must point to DEVICE memory whose
// a/b/c members are themselves device pointers — the struct is dereferenced
// on the GPU.
__global__ void foo(Data* data, size_t n) {
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise
    // evaluated in unsigned int and silently overflows for grids with
    // more than ~2^32 total threads.
    size_t const id = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (id < n) {
        data->c[id] = data->a[id] + data->b[id];
    }
}
Now consider this host code:
struct Data host_data;
struct Data device_data;

// Allocate host memory.
size_t const kSize = 10;
host_data.a = (float*)malloc(sizeof(float) * kSize);
host_data.b = (float*)malloc(sizeof(float) * kSize);
host_data.c = (float*)malloc(sizeof(float) * kSize);

// Allocate device memory for the three arrays. device_data lives on the
// host, but its members receive device addresses.
cudaMalloc((void**)&device_data.a, sizeof(float) * kSize);
cudaMalloc((void**)&device_data.b, sizeof(float) * kSize);
cudaMalloc((void**)&device_data.c, sizeof(float) * kSize);

// Initialize the host inputs.
for (size_t i = 0; i < kSize; ++i) {
    host_data.a[i] = 10.f;
    host_data.b[i] = 10.f;
}

// Copy only the inputs to the device. `c` is pure output — the original
// code also uploaded host_data.c, which reads uninitialized host memory
// and wastes a transfer.
cudaMemcpy(device_data.a, host_data.a, sizeof(float) * kSize, cudaMemcpyHostToDevice);
cudaMemcpy(device_data.b, host_data.b, sizeof(float) * kSize, cudaMemcpyHostToDevice);

// The kernel dereferences data->a on the GPU, so the struct itself must
// live in device memory as well.
struct Data* device_ptr;
cudaMalloc((void**)&device_ptr, sizeof(*device_ptr));

// One struct-sized copy replaces the three per-member copies: device_data
// already contains exactly the device pointers the GPU-side struct needs,
// and Data is trivially copyable, so its bytes can be blitted as a unit.
cudaMemcpy(device_ptr, &device_data, sizeof(device_data), cudaMemcpyHostToDevice);

// Launch configuration: integer ceil-division avoids the float round-trip
// of ceil((float)kSize/block_size) and is exact for all sizes.
size_t const block_size = 512;
size_t const grid_size = (kSize + block_size - 1) / block_size;

// The kernel is named `foo`; the original snippet launched a nonexistent
// `simple`, which would not compile.
foo<<<grid_size, block_size>>>(device_ptr, kSize);

// Surface launch-configuration errors, then wait for completion (kernel
// launches are asynchronous and report execution faults at the next sync).
cudaGetLastError();
cudaDeviceSynchronize();

// Only the output array changed on the device; copying a and b back (as
// the original did) is redundant because the kernel never writes them.
cudaMemcpy(host_data.c, device_data.c, sizeof(float) * kSize, cudaMemcpyDeviceToHost);
This of course works, but it seems counterintuitive: sharing data structures between the host and the device becomes extremely tedious, with one allocation and one copy per member. Is there a way to avoid, or at least reduce, the number of `cudaMemcpy` and `cudaMalloc` calls?
FYI: I'm aware that passing a copy of the `Data` struct to the kernel by value is possible. However, this is not very efficient for large structs.