I am trying to make FFT plus kernel calculation. FFT : managedCUDA library kernel calc : own kernel
C# code
public void cuFFTreconstruct() {
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion");
float[] fData = new float[Resolution * Resolution * 2];
float[] result = new float[Resolution * Resolution * 2];
CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j) * 2] = i + j * 2;
fData[(i * Resolution + j) * 2 + 1] = 0.0f;
}
}
devData.CopyToDevice(fData);
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
ResultData[i, j, 0] = result[(i * Resolution + j) * 2];
ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1];
}
}
ctx.FreeMemory(devData.DevicePointer);
ctx.FreeMemory(copy_devData.DevicePointer);
}
kernel code
//Includes for IntelliSense
#define _SIZE_T_DEFINED
#ifndef __CUDACC__
#define __CUDACC__
#endif
#ifndef __cplusplus
#define __cplusplus
#endif
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
// Texture reference
texture<float2, 2> texref;
extern "C"
{
__global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2];
data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1];
}
}
However this program does not work well. Following error was occurred:
ErrorLaunchFailed: An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA.