I would like to use two GPUs to execute a kernel then execute a single FFT using cufftXt. The data could be several GBs in size. My understanding of allocating memory for kernels on 2 GPUs is that you should split the host array in half and send the first half to GPU0 and the other half to GPU1. The following example shows how this could be done.
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<real_size){
float x = (i % (N+2));
if(x < N){
data3[i] = pow(data[i], 3.0f);
}
else{
data3[i] = 0.0f;
}
}
__syncthreads();
}
int main (int argc, char **argv) {
int x;
int N = 8;
int cplx_size = N * (N/2 + 1);
int real_size = 2 * cplx_size;
int mem_size = sizeof(cufftReal)*real_size;
int half_real_size = real_size/2;
int half_mem_size = mem_size/2;
cufftReal *h_data = (cufftReal*)malloc(mem_size);
cufftReal *h_data3 = (cufftReal*)malloc(mem_size);
cufftReal *h0_data = (cufftReal*)malloc(half_mem_size);
cufftReal *h0_data3 = (cufftReal*)malloc(half_mem_size);
cufftReal *h1_data = (cufftReal*)malloc(half_mem_size);
cufftReal *h1_data3 = (cufftReal*)malloc(half_mem_size);
for(int i=0; i<real_size; i++){
x = (i % (N+2));
if(x < N){h_data[i] = 2;}
else{h_data[i] = 0;}
}
for(int i=0; i<half_real_size; i++){
h0_data[i] = h_data[i];
h1_data[i] = h_data[i+half_real_size];
}
cufftReal *d0_data;
cufftReal *d0_data3;
cufftReal *d1_data;
cufftReal *d1_data3;
cudaSetDevice(0);
gpuErrchk(cudaMalloc((void**)&d0_data, half_mem_size));
gpuErrchk(cudaMalloc((void**)&d0_data3, half_mem_size));
cudaSetDevice(1);
gpuErrchk(cudaMalloc((void**)&d1_data, half_mem_size));
gpuErrchk(cudaMalloc((void**)&d1_data3, half_mem_size));
cout <<"device memory allocated" <<endl;
int maxThreads=(N>1024)?1024:N;
int threadsPerBlock = maxThreads;
int numBlocks = (half_real_size)/threadsPerBlock;
cudaSetDevice(0);
gpuErrchk(cudaMemcpy(d0_data, h0_data, half_mem_size, cudaMemcpyHostToDevice));
cudaSetDevice(1);
gpuErrchk(cudaMemcpy(d1_data, h1_data, half_mem_size, cudaMemcpyHostToDevice));
cout <<"mem copied to devices" <<endl;
cudaSetDevice(0);
Cube <<<numBlocks, threadsPerBlock>>> (d0_data, d0_data3, N, half_real_size);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaSetDevice(1);
Cube <<<numBlocks, threadsPerBlock>>> (d1_data, d1_data3, N, half_real_size);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaSetDevice(0);
gpuErrchk(cudaMemcpy(h0_data3, d0_data3, half_mem_size, cudaMemcpyDeviceToHost));
cudaSetDevice(1);
gpuErrchk(cudaMemcpy(h1_data3, d1_data3, half_mem_size, cudaMemcpyDeviceToHost));
cout <<endl;
for(int i = 0; i<half_real_size; i++){
cout <<h0_data3[i] <<" ";
}
cout <<endl;
for(int i = 0; i<half_real_size; i++){
cout <<h1_data3[i] <<" ";
}
//clean up
cudaFree(d0_data);
cudaFree(d0_data3);
cudaFree(d1_data);
cudaFree(d1_data3);
return 0;
}
However, I do not see how this approach is compatible with cufftXt. It appears that I should use the helper function cufftXtMemcpy to automatically split up the data onto the devices. But if I do that, then the multi-gpu kernel method shown above is not useable unless I allocate separate device memory for cufftXt and kernels. Is there any way to run both cufftXt and kernels without doubly allocating device memory?