CUFFT wrong result for batch 1D ifft

Question

I am new to CUDA and CUFFT, when I was trying to recover the fft result of cufftExecC2R(...) by applying the corresponding cufftExecC2R(...), it went wrong, the recovered data and the original data is not identical.

Here is the code, the cuda library I used was cuda-9.0.

#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "cuda.h"
#include "cufft.h"

#include <iostream>
#include <sys/time.h>
#include <cstdio>
#include <cmath>

using namespace std;


// cuda error check
#define gpuErrchk(ans) {gpuAssrt((ans), __FILE__, __LINE__);}
inline void gpuAssrt(cudaError_t code, const char* file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) {
            exit(code);
        }
    }
}

// ifft scale for cufft
__global__ void IFFTScale(int scale_, cufftReal* real) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    real[idx] *= 1.0 / scale_;
}


void batch_1d_irfft2_test() {
    const int BATCH = 3;
    const int DATASIZE = 4;

    /// RFFT
    // --- Host side input data allocation and initialization
    cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
    for (int i = 0; i < BATCH; ++ i) {
        for (int j = 0; j < DATASIZE; ++ j) {
            hostInputData[i * DATASIZE + j] = (cufftReal)(i * DATASIZE  + j + 1);
        }
    }

    // DEBUG:print host input data
    cout << "print host input data" << endl;
    for (int i = 0; i < BATCH; ++ i) {
        for (int j = 0; j < DATASIZE; ++ j) {
            cout << hostInputData[i * DATASIZE + j] << ", ";
        }
        cout << endl;
    }
    cout << "=====================================================" << endl;

    // --- Device side input data allocation and initialization
    cufftReal *deviceInputData; 
    gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));

    // --- Device side output data allocation
    cufftComplex *deviceOutputData; 
    gpuErrchk(cudaMalloc(
                (void**)&deviceOutputData, 
                (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));

    // Host sice input data copied to Device side 
    cudaMemcpy(deviceInputData, 
            hostInputData, 
            DATASIZE * BATCH * sizeof(cufftReal), 
            cudaMemcpyHostToDevice);

    // --- Batched 1D FFTs
    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = {DATASIZE};                 // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = DATASIZE, odist = DATASIZE / 2 + 1; // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = BATCH;                      // --- Number of batched executions
    cufftPlanMany(
            &handle, 
            rank, 
            n, 
            inembed, istride, idist, 
            onembed, ostride, odist, 
            CUFFT_R2C, 
            batch);
    cufftExecR2C(handle, deviceInputData, deviceOutputData);

    // **************************************************************************
    /// IRFFT
    cufftReal *deviceOutputDataIFFT; 
    gpuErrchk(cudaMalloc((void**)&deviceOutputDataIFFT, DATASIZE * BATCH * sizeof(cufftReal)));

    // --- Batched 1D IFFTs
    cufftHandle handleIFFT;
    int n_ifft[] = {DATASIZE / 2 + 1};                 // --- Size of the Fourier transform
    idist = DATASIZE / 2 + 1; odist = DATASIZE; // --- Distance between batches
    cufftPlanMany(
            &handleIFFT, 
            rank, 
            n_ifft, 
            inembed, istride, idist, 
            onembed, ostride, odist, 
            CUFFT_C2R, 
            batch);
    cufftExecC2R(handleIFFT, deviceOutputData, deviceOutputDataIFFT);

    /* scale
    // dim3 dimGrid(512);
    // dim3 dimBlock(max((BATCH * DATASIZE + 512  - 1) / 512, 1));
    // IFFTScale<<<dimGrid, dimBlock>>>((DATASIZE - 1) * 2, deviceOutputData);
    */

    // host output data for ifft
    cufftReal *hostOutputDataIFFT = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
    cudaMemcpy(hostOutputDataIFFT, 
            deviceOutputDataIFFT, 
            DATASIZE * BATCH * sizeof(cufftReal), 
            cudaMemcpyDeviceToHost);

    // print IFFT recovered host output data
    cout << "print host output IFFT data" << endl;
    for (int i=0; i<BATCH; i++) {
        for (int j=0; j<DATASIZE; j++) {
            cout << hostOutputDataIFFT[i * DATASIZE + j] << ", ";
        }
        printf("\n");
    }

    cufftDestroy(handle);
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));
    gpuErrchk(cudaFree(deviceOutputDataIFFT));
    free(hostOutputDataIFFT);
    free(hostInputData);
}

int main() {
    batch_1d_irfft2_test();

    return 0;
}

I compile the 'rfft_test.cu' file by nvcc -o rfft_test rfft_test.cu -lcufft. the result is as below:

print host input data
1, 2, 3, 4, 
5, 6, 7, 8, 
9, 10, 11, 12, 
=====================================================
print IFFT recovered host output data
6, 8.5359, 15.4641, 0, 
22, 24.5359, 31.4641, 0, 
38, 40.5359, 47.4641, 0,

Specifically, I check the scale issue for the cufftExecC2R(...), and I comment out the IFFTScale() kernel function. Thus I assume that the recovered output data was like DATASIZE*input_batched_1d_data, but even so, the result is not as expected.

I have checked the cufft manual and my code several times, I also search for some Nvidia forums and StackOverflow answers, but I didn't find any solution. Anyone's help is greatly appreciated. Thanks in advance.

score 0 · Answer 1 · answered May 12 '19 at 05:30

Size of your inverse transform is incorrect and should be DATASIZE not DATASIZE/2+1.

Following sections of cuFFT docs should help:

"In C2R mode an input array ( x 1 , x 2 , … , x ⌊ N 2 ⌋ + 1 ) of only non-redundant complex elements is required." - N is transform size you pass to plan function

CUFFT wrong result for batch 1D ifft

1 Answers1