I'm writing a code that integrates a PDE in time in Fourier space, and I'm doing so in CUDA/C++.
There is one real valued array I need to evolve in time.
I've written the code in two different ways, but following the exact same logic.
In one version I simply define all all arrays as complex valued, and do complex to complex transforms. I make sure that I keep the one that is meant to be real valued being so by setting it's imaginary part to 0 every time step (to account for small errors accumulating in its imaginary part).
float *real;
cufftComplex *fourier;
...
for (int t = 0; t < totalTime; t++)
{
cufftEcexC2R(plan1, fourier, real);
// do some calculations
cufftExecR2C(plan2, real, fourier);
integrate<<<blocks, threads>>>(fourier, other parameters);
}
In the other version I have an array of floats for the real valued array, and a complex one for its fourier transform, and do complex-to-real and real-to-complex transforms.
cufftComplex *real;
cufftComplex *fourier;
...
for (int t = 0; t < totalTime; t++)
{
cufftEcexC2C(plan, fourier, real, CUFFT_INVERSE);
// do some calculations
cufftExecC2C(plan, real, fourier, CUFFT_FORWARD); // only need one plan in this case
integrate<<<blocks, threads>>>(fourier, other parameters);
}
I was under the impression that this second choice would be faster and more memory efficient. I understand it's more memory efficient because both arrays are half the size, one is real instead of complex, and the complex one only stores N/2+1 values. I have observed however that the first option is consistently, in my case and with my hardware, about 20% faster. (See minimal reproducible example below)
Is this expected to be so? Are complex-to-complex transforms faster computationally than real-to-complex or complex-to-real?
Minimal reproducible example. The following two codes are fully compilable. The one involving the real to complex and viceversa, in my hardware, takes about 1.1 to 1.2 second in average. The one involving complex to complex takes about 0.9 in average. The actual program runs for very long so this small difference is very noticeable in the larger program.
The helper_cuda.h
and helper_functions.h
are not necessary to compile them, although they can be found in the Common
directory of the samples that come with the cuda toolkit.
This one uses only complex-to-complex
#include <iostream>
#include <fstream>
#include <math.h>
#include <random>
#include <fftw3.h>
#include <complex.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_cuda.h>
#include <helper_functions.h>
using namespace std;
void printProgressbar(int t, int Nsteps, int length)
{
int progress = t*100/Nsteps;
int sprogress = t*length/Nsteps;
int left = length - sprogress - 1;
cout << "Progress [";
for (int i = 0; i < sprogress; i++)
cout << "=";
cout << ">";
for (int i = 0; i < left; i++)
cout << " ";
cout << "] " << progress << "% \r";
cout.flush();
}
int main()
{
int N = 128;
int Nsteps = 50000;
int numberofsnaps = 5;
int plot_steps = Nsteps/numberofsnaps;
cufftComplex *phi_host = new cufftComplex[N*N];
random_device rd;
mt19937 gen(rd());
uniform_real_distribution<> dis(-0.01,0.01);
for (int i = 0; i < N*N; i++)
{
phi_host[i].x = (float)dis(gen);
phi_host[i].y = 0.0f;
}
// Device arrays
cufftComplex *phi_real;
cufftComplex *phi_comp;
cudaMalloc(reinterpret_cast<void **>(&phi_real), N * N * sizeof(cufftComplex));
cudaMalloc(reinterpret_cast<void **>(&phi_comp), N * N * sizeof(cufftComplex));
cufftHandle plan_c2c;
cufftPlan2d(&plan_c2c, N, N, CUFFT_C2C);
cudaMemcpy(phi_real, phi_host, N * N * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cufftExecC2C(plan_c2c, phi_real, phi_comp, CUFFT_FORWARD);
for (int t = 0; t < Nsteps; t++)
{
// Output data
if (t % plot_steps == 0)
{
printProgressbar(t,Nsteps,25);
cudaMemcpy(phi_host, phi_real, N * N * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
}
cufftExecC2C(plan_c2c, phi_comp, phi_real, CUFFT_INVERSE);
cufftExecC2C(plan_c2c, phi_real, phi_comp, CUFFT_FORWARD);
}
cout << "Progress [========================>] 100%" << endl;
return 0;
}
This one uses real-to-complex and viceversa
#include <iostream>
#include <fstream>
#include <math.h>
#include <random>
#include <fftw3.h>
#include <complex.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <helper_cuda.h>
#include <helper_functions.h>
using namespace std;
void printProgressbar(int t, int Nsteps, int length)
{
int progress = t*100/Nsteps;
int sprogress = t*length/Nsteps;
int left = length - sprogress - 1;
cout << "Progress [";
for (int i = 0; i < sprogress; i++)
cout << "=";
cout << ">";
for (int i = 0; i < left; i++)
cout << " ";
cout << "] " << progress << "% \r";
cout.flush();
}
int main()
{
int N = 128;
int Np = N/2 + 1;
int Nsteps = 50000;
int numberofsnaps = 5;
int plot_steps = Nsteps/numberofsnaps;
float *phi_host = new float[N*N];
random_device rd;
mt19937 gen(rd());
uniform_real_distribution<> dis(-0.01,0.01);
for (int i = 0; i < N*N; i++)
{
phi_host[i] = (float)dis(gen);
}
// Device arrays
float *phi_real;
cufftComplex *phi_comp;
cudaMalloc(reinterpret_cast<void **>(&phi_real), N * N * sizeof(float));
cudaMalloc(reinterpret_cast<void **>(&phi_comp), N * Np * sizeof(cufftComplex));
cufftHandle plan_r2c, plan_c2r;
cufftPlan2d(&plan_r2c, N, N, CUFFT_R2C);
cufftPlan2d(&plan_c2r, N, N, CUFFT_C2R);
cudaMemcpy(phi_real, phi_host, N * N * sizeof(float), cudaMemcpyHostToDevice);
cufftExecR2C(plan_r2c, phi_real, phi_comp);
for (int t = 0; t < Nsteps; t++)
{
// Output data
if (t % plot_steps == 0)
{
printProgressbar(t,Nsteps,25);
cudaMemcpy(phi_host, phi_real, N * N * sizeof(float), cudaMemcpyDeviceToHost);
}
cufftExecC2R(plan_c2r, phi_comp, phi_real);
cufftExecR2C(plan_r2c, phi_real, phi_comp);
}
cout << "Progress [========================>] 100%" << endl;
return 0;
}