
I have a float array and want to compute FFTs over it, taking a certain amount of data per transform and zero-padding it to a length of 2^N. I also want to overlap the data windows by a selectable factor. [figure: simplified example of the overlapping and zero-padding scheme]

So far I have a CUDA kernel with which I create another array in which I store the overlapped and zero-padded data; afterwards a plan created with cufftPlanMany is executed. Because of the two factors the amount of data becomes very large, and since it is in principle only copies of the original data and zeros, I waste my entire memory bandwidth on it. I could not find anything on whether cuFFT supports zero padding, or whether there is a way to plug in custom routines.
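
A simplified sketch of what such a preparation kernel might look like (not my exact code; the kernel name is a placeholder and the window advance of rgDataLen/overlapFactor samples is an assumption based on the parameters used further below):

// Illustrative sketch of a preparation kernel: each thread writes one element
// of the padded/overlapped buffer. The first rgDataLen samples of every FFT
// come from the input (successive windows advance by rgDataLen/overlapFactor
// samples), the remaining samples are set to zero.
__global__ void prepareOverlapPad(const float *in, float *out,
                                  int rgDataLen, int rgLen,
                                  int overlapFactor, int rgCount)
{
    const int hop = rgDataLen / overlapFactor;
    const size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (idx >= (size_t)rgCount * rgLen) return;

    const size_t fft = idx / rgLen;               // which overlapped FFT
    const int    pos = (int)(idx - fft * rgLen);  // position inside that FFT
    out[idx] = (pos < rgDataLen) ? in[fft * hop + pos] : 0.0f;
}

The buffer produced this way is roughly overlapFactor * rgLen / rgDataLen times larger than the original data, which is where the bandwidth goes.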

(Nvidia Quadro P5000, C++14, Kubuntu)

Update

I have written a callback function that is called when the data are loaded into the FFT. Unfortunately this is still a little slower than my previous solution with a kernel that prepares the data in another array and then calls the FFT.

The example with the given values takes 2.4 ms on average. My hope was that by processing the data on the fly my memory bandwidth would no longer limit me, but at the moment it does not look like that.

Does anyone have an idea how I can speed this up even more?

// Build note: link against cufft_static (not cufft) and culibos, and compile with relocatable device code (-dc), since cuFFT callbacks require the statically linked cuFFT library
#include <stdio.h>
#include <cstdint>
#include <unistd.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <math.h>

struct fft_CB_LD_callerInfo{
    uint16_t rgLen;          // zero-padded FFT length
    uint16_t rgDataLen;      // number of valid data samples per range gate
    uint16_t overlapFactor;  // number of overlapped FFTs per range gate
};

static __device__  cufftReal myOwnCallback(void *dataIn,
                                     size_t offset,
                                     void *callerInfo,
                                     void *sharedPtr) {

    const fft_CB_LD_callerInfo *fftInfo = (fft_CB_LD_callerInfo*)callerInfo;
    int idx_rg = offset/fftInfo->rgLen;                       // index of the overlapped FFT this element belongs to
    int idx_realRg = idx_rg/fftInfo->overlapFactor;           // index of the underlying (non-overlapped) range gate
    int idx_posInRg = offset-(size_t)idx_rg*fftInfo->rgLen;   // position within the zero-padded transform

    if(idx_posInRg < fftInfo->rgDataLen){
        // Map back into the compact input array: start of the range gate plus the
        // hop offset of this overlapped FFT (rgDataLen/overlapFactor samples per step).
        const size_t idx_data = idx_posInRg
                + idx_realRg*fftInfo->rgDataLen
                + (idx_rg - idx_realRg*fftInfo->overlapFactor)*fftInfo->rgDataLen/fftInfo->overlapFactor;

        return ((cufftReal*)dataIn)[idx_data];
    }
    else{
        return 0.0f;
    }
 }

__device__ cufftCallbackLoadR myOwnCallbackPtr = myOwnCallback;

int main(){

    // Data
    float *dataHost;
    float *data;
    cufftComplex *spectrum;
    cufftComplex *spectrumHost;
    unsigned int rgDataLen = 400;
    unsigned int rgLen = 2048;
    unsigned int overlap = 8;
    int peakPosHost[] = {0};
    int *peakPos;
    unsigned int rgCountClean = 52*16*4;
    unsigned int rgCount = rgCountClean*overlap-(overlap-1);
    int peakCountHost = 1;
    int *peakCount;

    // for FFT
    cudaStream_t stream;
    cufftHandle plan;
    cufftResult result;
    int fftRank = 1;        // --- 1D FFTs
    int fftIRide = 1, fftORide = 1;           // --- Distance between two successive input/output elements
    int fftInembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int fftOnembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int fftEachLen[] = { (int)rgLen };                 // --- Size of the Fourier transform
    int fftIDist = rgLen;
    int fftODist = rgLen/2+1; // --- Distance between batches

    // for Custom callback
    cufftCallbackLoadR hostCopyOfCallbackPtr;
    size_t worksize;
    fft_CB_LD_callerInfo *fftInfo;
    fft_CB_LD_callerInfo *fftInfoHost;

    // Allocate host memory
    dataHost = new float[rgDataLen*rgCountClean*peakCountHost];
    spectrumHost = new cufftComplex[fftODist*rgCount];
    fftInfoHost = new fft_CB_LD_callerInfo;

    // create array with example data
    for(int k=0; k<rgDataLen;k++){
        for(int i=0; i<rgCountClean; i++){
            dataHost[i*rgDataLen + k] = sin((2+i*4)*M_PI*k/rgDataLen);
        }
    }

    fftInfoHost->overlapFactor = overlap;
    fftInfoHost->rgDataLen = rgDataLen;
    fftInfoHost->rgLen = rgLen;

    // allocate device memory
    cudaMalloc((void **)&data, sizeof(float) * rgDataLen*rgCountClean*peakCountHost);
    cudaMalloc((void **)&peakPos, sizeof(int) * peakCountHost);
    cudaMalloc((void **)&peakCount, sizeof(int));
    cudaMalloc((void **)&spectrum, sizeof(cufftComplex)*fftODist*rgCount);
    cudaMalloc((void **)&fftInfo, sizeof(fft_CB_LD_callerInfo));

    // copy data from host to device
    cudaMemcpy(data, dataHost, sizeof(float)*rgDataLen*rgCountClean*peakCountHost, cudaMemcpyHostToDevice);
    cudaMemcpy(peakPos, peakPosHost, sizeof(int)*peakCountHost, cudaMemcpyHostToDevice);
    cudaMemcpy(peakCount, &peakCountHost, sizeof(peakCountHost), cudaMemcpyHostToDevice);
    cudaMemcpy(fftInfo, fftInfoHost, sizeof(fft_CB_LD_callerInfo), cudaMemcpyHostToDevice);

    // get device pointer to custom callback function
    cudaError_t error = cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr, myOwnCallbackPtr, sizeof(hostCopyOfCallbackPtr));
    if(error != cudaSuccess) printf("cudaMemcpyFromSymbol failed with %d!\n", (int)error);

    // Create a plan for the batched FFTs so they can be executed quickly later
    cufftCreate(&plan);
    result = cufftMakePlanMany(plan, fftRank, fftEachLen, fftInembed, fftIRide, fftIDist, fftOnembed, fftORide, fftODist, CUFFT_R2C, rgCount, &worksize);
    if(result != CUFFT_SUCCESS) printf("cufftMakePlanMany failed with %d!\n", (int)result);

    result = cufftXtSetCallback(plan, (void**)&hostCopyOfCallbackPtr, CUFFT_CB_LD_REAL, (void**)&fftInfo);
    if(result != CUFFT_SUCCESS) printf("cufftXtSetCallback failed with %d!\n", (int)result);


    // ----- Begin test area ---------------------------------------------------

    if(cufftExecR2C(plan, data, spectrum) != CUFFT_SUCCESS)
        printf("cufftExecR2C is failed!\n");

    // ----- End test area ---------------------------------------------------


    return 0;
}
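
For completeness, the example can be built roughly like this (the file name and architecture flag are assumptions; the static cuFFT library and relocatable device code are required for callbacks):

nvcc -rdc=true -arch=sm_61 -o fft_overlap fft_overlap.cu -lcufft_static -lculibos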
fex95
  • I don't believe cufft has support for "automatic" zero padding, however [cufft callbacks](https://docs.nvidia.com/cuda/cufft/index.html#callback-routines) (specifically a load callback) may be useful. I'm not able to give you any further advice because I don't really understand your overlap description. If it's anything like what I am imagining, it seems like it would be very simple to include an example data set in your question. – Robert Crovella Sep 30 '21 at 01:03
  • Thanks for your quick reply. I have added an image with a very simplified example and I hope you can follow me. Unfortunately I don't have permission to include the image directly. The overlapping allows a virtual refinement of the resolution and is easy to implement via the cuFFT plan by reducing the step size of the source, but unfortunately I can't find a way to do zero padding, and in combination with that the overlapping would be a challenge too. – fex95 Sep 30 '21 at 01:28
  • I'm afraid I missed your first part. The cuFFT callbacks sound really interesting; I don't know how I could have missed that in the description. From the API description it should be possible to implement zero padding. I'll have a look at it tomorrow and see if I can implement it. – fex95 Sep 30 '21 at 01:41
  • @RobertCrovella I have updated the question and hope that you maybe can help me with the new problem. I have read [here](https://meta.stackexchange.com/questions/103400/if-i-edit-my-answer-do-commenters-get-notified) that you get no notification about such an update, so I mention you here directly. – fex95 Oct 04 '21 at 16:23
  • Your callback methodology is to precompute the necessary indexing/zero padding, then load that array in the callback, and use that information to either load the data or use zero. My suggestion would be to create a callback routine that accepts the necessary parameters for index computation, and compute the index on-the-fly, in the callback, rather than loading the extra precomputed-index array. A CUFFT operation is often memory bound, and so requiring the load callback to load an extra element per thread will double the FFT cost. Calculation on GPUs is cheaper than memory ops. – Robert Crovella Oct 04 '21 at 20:29
  • @RobertCrovella Thank you for the response. I thought it would be stupid to do a calculation on the fly that always gives the same results, and wanted to reduce the load on the GPU. But since I am obviously limited by memory and not by computing power, I will implement the approach and test whether I get better results. Thanks for the new idea. – fex95 Oct 04 '21 at 21:19
  • @RobertCrovella I have done the implementation and the duration is now at 2.4 ms. I have analyzed the kernel with Nsight Compute and now have a significantly lower memory bandwidth usage. However, I understand the hints in Nsight Compute to mean that I do too little work per thread in relation to the overhead. I have had such problems often before, and normally I would simply process more elements/tasks per thread, but I think that will not be possible here. Do you have any other ideas? – fex95 Oct 05 '21 at 16:45

0 Answers