I want to copy the results of a fft operation from device to host.
This is what happens. The input is a pointer to a pointer to an float. the values get allocated during runtime. then it is transferred to the gpu and fft is calculated. Then the results are transferred to to float2 2D array. But the result i get is wrong. It contains all zero. So how can I overcome this issue ?
#define NRANK 2
#define BATCH 10
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iostream>
#include <vector>
using namespace std;
float func(int,int){
return 2.0f; // some value get return. I have put a dummy value here
}
int main()
{
const size_t NX = 4;
const size_t NY = 5;
// Input array - host side
float **a = new float*[NX];
for (int r = 0; r < NX; ++r) // this can be also done on GPU
{
a[r] = new float[NY];
for (int c = 0; c < NY; ++c)
{
a[r][c] = func(r,c);
}
}
// Output array - host side
float2 c[NX][NY] = { 0 };
cufftHandle plan;
cufftComplex *data; // Input and output arrays - device side
int n[NRANK] = {NX, NY};
// Transfer the data from host to device - have to do it like this becase
// the array is a dynamic array
cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*(NY/2+1));
for(int i=0; i<NX; ++i){
cudaMemcpy(reinterpret_cast<float*>(data) + i*NY, a[i], sizeof(float)*NY,
cudaMemcpyHostToDevice);
}
/* Create a 2D FFT plan. */
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_C2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecC2C(plan, data, data, CUFFT_FORWARD);
cudaThreadSynchronize();
cudaMemcpy(c, data, sizeof(float2)*NX*NY, cudaMemcpyDeviceToHost);
// Print the values of c ---- ALL ARE 0
for (int i = 0; i < NX; i++)
{
for (int j =0 ; j< NY; j++)
{
printf(" %f + %fi ",c[i][j].x,c[i][j].y);
b
}
printf("\n");
}
cufftDestroy(plan);
cudaFree(data);
return 0;
}
How could I solve this problem ?
EDIT
After considering Robert Crovella's suggestion I modified the code as
// Output array - host side
float2 c[NX][NY + 2] ;
// New device side variable that will hold the result from the FFT size - twice as input {2 x NX*(NY/2 + 1)}
cufftComplex *data_out;
cudaMalloc((void**)&data_out, sizeof(cufftComplex)*NX*(NY+2));
/* Create a 2D FFT plan. */
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_C2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecC2C(plan, data, data_out, CUFFT_FORWARD);
cudaThreadSynchronize();
cudaError cudaStat2 = cudaMemcpy(c, data_out, sizeof(cufftComplex)*NX*(NY+2) , cudaMemcpyDeviceToHost);
cout << cudaGetErrorString(cudaStat2) << " ,\n";
for (int i = 0; i < NX; i++)
{
for (int j =0 ; j< NY; j++)
{
printf(" %f ,",c[i][j].x);
}
printf("\n");
}
Now the output device matrix is 2 x sizeof(cufftComplex)NX(NY/2+1) and I have declared it as data_out. Then the host side matrix was also adjusted to hold NX*(NY+2) elements of float2. Now I don't get any errors from cudaMemcpy. But still I don't get the answer. What i get is an array of 1.#QNAN0 values.
So how can I solve this ?