I'm trying to produce some FFT math, in particular it's do two 2D forward transforms, multiply them, and then make inverse transform. Before inverse transform everything goes great. I've already did it by fftw3, but in CuFFT something goes wrong. Most of the values are similar, but some are wrong, which is significant for the future math. What's the problem with this piece of code?
std::vector<complex> conv2dCUDA(complex *ui_anomaly, double *ds2,
complex *u0, int anx, int any, double factor) {
cufftComplex *b1, *b2;
int size = 2 * anx * 2 * any;
int memsize = size * sizeof(cufftComplex);
b1 = (cufftComplex *)calloc(size, sizeof(cufftComplex));
b2 = (cufftComplex *)calloc(size, sizeof(cufftComplex));
// filling the matrixes
cufftHandle plan;
cufftComplex *ui, *g;
checkCudaErrors(cudaMalloc((void**)&ui, memsize));
checkCudaErrors(cudaMalloc((void**)&g, memsize));
checkCudaErrors(cufftPlan2d(&plan, 2 * anx, 2 * any, CUFFT_C2C));
checkCudaErrors(cudaMemcpy(ui, (cufftComplex *)&b1[0], memsize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(g, (cufftComplex *)&b2[0], memsize, cudaMemcpyHostToDevice));
checkCudaErrors(cufftExecC2C(plan, ui, ui, CUFFT_FORWARD));
checkCudaErrors(cufftExecC2C(plan, g, g, CUFFT_FORWARD));
int blockSize = 16;
dim3 dimGrid(int(2 * any / blockSize) + 1, int(2 * anx / blockSize) + 1);
dim3 dimBlock(blockSize, blockSize);
ComplexMulAndScale<<<dimGrid, dimBlock>>>(ui, g, size, 1.0f);
getLastCudaError("Kernel execution went wrong");
checkCudaErrors(cudaMemcpy(b1, ui, memsize, cudaMemcpyDeviceToHost));
std::cout << "After mult Cuda" << std::endl;
for (auto i = 0; i < 2 * any; i++) {
for (auto j = 0; j < 2 * anx; j++) {
std::cout << b1[i * 2 * anx + j].x << " ";
}
std::cout << std::endl;
}
checkCudaErrors(cufftExecC2C(plan, ui, ui, CUFFT_INVERSE));
cuComplex *inversed;
inversed = (cuComplex*)malloc(memsize);
checkCudaErrors(cudaMemcpy(inversed, ui, memsize, cudaMemcpyDeviceToHost));
std::vector<complex> res(anx * any);
for (auto i = 0; i < any; i++) {
for (auto j = 0; j < anx; j++) {
res[i * anx + j] = complex(inversed[i * anx * 2 + j].x * factor, inversed[i * anx * 2 + j].y * factor);
}
}
std::cout << "CUDA" << std::endl;
for (auto i = 0; i < 2 * any; i++) {
for (auto j = 0; j < 2 * anx; j++) {
std::cout << inversed[i * 2 * anx + j].x << " ";
}
std::cout << std::endl;
}
checkCudaErrors(cudaFree(ui));
checkCudaErrors(cudaFree(g));
checkCudaErrors(cufftDestroy(plan));
free(b1);
free(b2);
free(inversed);
return res;
}
std::vector<complex> conv2d(complex *ui_anomaly, double *ds2, complex *u0, int anx, int any, double factor) {
std::vector<complex> b1(anx * 2 * 2 * any, complex(0., 0.)), b2(anx * 2 * 2 * any, complex(0., 0.));
// filling matrixes
// forward fft 1 in-place
fftw_plan p;
p = fftw_plan_dft_2d(2 * any, 2 * anx, (fftw_complex *) (&b1[0]), (fftw_complex *) (&b1[0]),
FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
fftw_destroy_plan(p);
// forward fft 2 in-place
p = fftw_plan_dft_2d(2 * any, 2 * anx, (fftw_complex *) (&b2[0]), (fftw_complex *) (&b2[0]),
FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
fftw_destroy_plan(p);
std::vector<complex> out(2 * anx * 2 * any, complex(0., 0.));
for (auto i = 0; i < 2 * any * 2 * anx; i++) {
out[i] = b1[i] * b2[i];
}
std::cout << "After mult fftw" << std::endl;
for (auto i = 0; i < 2 * any; i++) {
for (auto j = 0; j < 2 * anx; j++) {
std::cout << out[i * 2 * anx + j].real() << " ";
}
std::cout << std::endl;
}
// inverse fft in-place
p = fftw_plan_dft_2d(2 * (int) any, 2 * (int) anx, (fftw_complex *) (&out[0]), (fftw_complex *) (&out[0]),FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(p);
fftw_destroy_plan(p);
std::vector<complex> res(anx * any);
for (auto i = 0; i < any; i++) {
for (auto j = 0; j < anx; j++) {
res[i * anx + j] = out[i * anx * 2 + j] * factor;
}
}
std::cout << "FFTW" << std::endl;
for (auto i = 0; i < 2 * any; i++) {
for (auto j = 0; j < 2 * anx; j++) {
std::cout << out[i * 2 * anx + j].real() << " ";
}
std::cout << std::endl;
}
return res;
}
So, this is my code. Output should be in both functions
After mult fftw
8.34304e-08 -5.99259e-07 -4.7876e-07 5.30254e-07 9.55877e-07 4.28985e-07
-1.56375e-07 1.19699e-07 2.39276e-07 -1.68662e-08 -7.56988e-08 -3.69897e-07
-2.66505e-07 -2.33361e-07 -5.21763e-07 -5.29126e-07 1.8915e-07 1.68158e-07
-9.01859e-07 -2.37453e-07 -3.50661e-08 -4.11154e-07 4.14802e-07 -7.9879e-07
2.09404e-07 6.52034e-08 1.8915e-07 4.97805e-07 3.32612e-07 -2.33361e-07
-1.95738e-07 -3.69897e-07 -1.63577e-07 1.07737e-07 2.39276e-07 2.50198e-07
FFTW
-1.57349e-06 -7.5964e-06 -1.57349e-06 1.68876e-06 5.82335e-22 1.68876e-06
2.37158e-06 6.35275e-22 2.37158e-06 -1.18579e-06 1.05879e-22 -1.18579e-06
-1.57349e-06 -7.5964e-06 -1.57349e-06 1.68876e-06 1.97573e-22 1.68876e-06
3.14928e-06 2.37158e-06 3.14928e-06 -4.94164e-07 5.82335e-22 -4.94164e-07
2.11758e-22 -8.47033e-22 -1.05879e-22 5.29396e-22 1.41851e-23 1.05879e-22
3.14928e-06 2.37158e-06 3.14928e-06 -4.94164e-07 1.05879e-22 -4.94164e-07
After mult Cuda
8.34303e-08 -5.99259e-07 -4.78761e-07 5.30254e-07 9.55877e-07 4.28985e-07
-1.56375e-07 1.19699e-07 2.39276e-07 -1.68662e-08 -7.56988e-08 -3.69897e-07
-2.66505e-07 -2.33361e-07 -5.21763e-07 -5.29126e-07 1.8915e-07 1.68158e-07
-9.01859e-07 -2.37453e-07 -3.50661e-08 -4.11154e-07 4.14802e-07 -7.9879e-07
2.09404e-07 6.52034e-08 1.8915e-07 4.97805e-07 3.32612e-07 -2.33361e-07
-1.95738e-07 -3.69897e-07 -1.63577e-07 1.07737e-07 2.39276e-07 2.50198e-07
CUDA
-1.57349e-06 -7.5964e-06 -1.57349e-06 1.68876e-06 3.33981e-13 1.68876e-06
2.37158e-06 2.84217e-13 2.37158e-06 -1.18579e-06 1.10294e-13 -1.18579e-06
-1.57349e-06 -7.5964e-06 -1.57349e-06 1.68876e-06 -9.03043e-14 1.68876e-06
3.14928e-06 2.37158e-06 3.14928e-06 -4.94164e-07 4.62975e-13 -4.94164e-07
-3.2685e-13 -1.03562e-13 -3.59548e-13 -2.13163e-13 4.27658e-15 -2.43358e-14
3.14928e-06 2.37158e-06 3.14928e-06 -4.94164e-07 3.49288e-13 -4.94164e-07
As it can be seen, both forward fft and multiplication goes right, but in case of inverse fft by cuda smth went wrong.
P.S. Sorry for poor code style