I have a float array and want to FFT from this with an amount of data and padding by zero padding to 2^N. I also want to overlap the data by a selectable factor.
So far I have a cuda kernel with which I create another array in which I store the overlapped and padded data. Afterwards a cufftPlanMany is executed. By the two factors, the amount of data becomes very large and it is in principle only copies of the original data and zeros with which I waste my entire memory bandwidth. I could not find anything if cuFFT supports zero padding or if I have a possibility to create custom scripts.
(Nvidia Quadro P5000, C++14, Kubuntu)
Update
I have written a callback function which is called when loading the data into the FFT. Unfortunately this is still a little bit slower than my previous solution with a kernel which prepares the data in another array and then calls the FFT.
I need an average of 2.4ms for the example with the given values. My hope was that if I process the data on the fly, my memory bandwidth will not limit me anymore. Unfortunately this does not look like that at the moment.
Does anyone have an idea how I can speed this up even more?
// Don't forget to include cufft_static(not cufft), culibos and set flag -dc
#include <stdio.h>
#include <cstdint>
#include <unistd.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <math.h>
typedef struct fft_CB_LD_callerInfo{
uint16_t rgLen;
uint16_t rgDataLen;
uint16_t overlapFactor;
};
static __device__ cufftReal myOwnCallback(void *dataIn,
size_t offset,
void *callerInfo,
void *sharedPtr) {
const fft_CB_LD_callerInfo *fftInfo = (fft_CB_LD_callerInfo*)callerInfo;
int idx_rg = offset/fftInfo->rgLen;
int idx_realRg = idx_rg/fftInfo->overlapFactor;
int idx_posInRg = offset-(size_t)idx_rg*fftInfo->rgLen;
if(idx_posInRg < fftInfo->rgDataLen){
const size_t idx_data = idx_posInRg
+ idx_realRg*fftInfo->rgDataLen
+ idx_rg - (idx_realRg*fftInfo->overlapFactor)*fftInfo->rgDataLen/fftInfo->overlapFactor;
return ((cufftReal*)dataIn)[idx_data];
}
else{
return 0.0f;
}
}
__device__ cufftCallbackLoadR myOwnCallbackPtr = myOwnCallback;
int main(){
// Data
float *dataHost;
float *data;
cufftComplex *spectrum;
cufftComplex *spectrumHost;
unsigned int rgDataLen = 400;
unsigned int rgLen = 2048;
unsigned int overlap = 8;
int peakPosHost[] = {0};
int *peakPos;
unsigned int rgCountClean = 52*16*4;
unsigned int rgCount = rgCountClean*overlap-(overlap-1);
int peakCountHost = 1;
int *peakCount;
// for FFT
cudaStream_t stream;
cufftHandle plan;
cufftResult result;
int fftRank = 1; // --- 1D FFTs
int fftIRide = 1, fftORide = 1; // --- Distance between two successive input/output elements
int fftInembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int fftOnembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int fftEachLen[] = { (int)rgLen }; // --- Size of the Fourier transform
int fftIDist = rgLen;
int fftODist = rgLen/2+1; // --- Distance between batches
// for Custom callback
cufftCallbackLoadR hostCopyOfCallbackPtr;
size_t worksize;
fft_CB_LD_callerInfo *fftInfo;
fft_CB_LD_callerInfo *fftInfoHost;
// Allocate host memory
dataHost = new float[rgDataLen*rgCountClean*peakCountHost];
spectrumHost = new cufftComplex[fftODist*rgCount];
fftInfoHost = new fft_CB_LD_callerInfo;
// create array with example data
for(int k=0; k<rgDataLen;k++){
for(int i=0; i<rgCountClean; i++){
dataHost[i*rgDataLen + k] = sin((2+i*4)*M_PI*k/rgDataLen);
}
}
fftInfoHost->overlapFactor = overlap;
fftInfoHost->rgDataLen = rgDataLen;
fftInfoHost->rgLen = rgLen;
// allocate device memory
cudaMalloc((void **)&data, sizeof(float) * rgDataLen*rgCountClean*peakCountHost);
cudaMalloc((void **)&peakPos, sizeof(int) * peakCountHost);
cudaMalloc((void **)&peakCount, sizeof(int));
cudaMalloc((void **)&spectrum, sizeof(cufftComplex)*fftODist*rgCount);
cudaMalloc((void **)&fftInfo, sizeof(fft_CB_LD_callerInfo));
// copy date from host to device
cudaMemcpy(data, dataHost, sizeof(float)*rgDataLen*rgCountClean*peakCountHost, cudaMemcpyHostToDevice);
cudaMemcpy(peakPos, peakPosHost, sizeof(int)*peakCountHost, cudaMemcpyHostToDevice);
cudaMemcpy(peakCount, &peakCountHost, sizeof(peakCountHost), cudaMemcpyHostToDevice);
cudaMemcpy(fftInfo, fftInfoHost, sizeof(fft_CB_LD_callerInfo), cudaMemcpyHostToDevice);
// get device pointer to custom callback function
cudaError_t error = cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr, myOwnCallbackPtr, sizeof(hostCopyOfCallbackPtr));
if(error != 0) printf("cudaMemcpyFromSymbol faild with %d!\n", (int)error);
// Create a plan of FFTs to fast execute there later
cufftCreate(&plan);
result = cufftMakePlanMany(plan, fftRank, fftEachLen, fftInembed, fftIRide, fftIDist, fftOnembed, fftORide, fftODist, CUFFT_R2C, rgCount, &worksize);
if(result != CUFFT_SUCCESS) printf("cufftMakePlanMany failed with %d!\n", (int)result);
result = cufftXtSetCallback(plan, (void**)&hostCopyOfCallbackPtr, CUFFT_CB_LD_REAL, (void**)&fftInfo);
if(result != CUFFT_SUCCESS) printf("cufftXtSetCallback failed with %d!\n", (int)result);
// ----- Begin test area ---------------------------------------------------
if(cufftExecR2C(plan, data, spectrum) != CUFFT_SUCCESS)
printf("cufftExecR2C is failed!\n");
// ----- End test area ---------------------------------------------------
return 0;
}