1D FFT plus kernel calculation with managedCUDA

Question

I am trying to make FFT plus kernel calculation. FFT : managedCUDA library kernel calc : own kernel

C# code

public void cuFFTreconstruct() {
                CudaContext ctx = new CudaContext(0);
                CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion");

                float[] fData = new float[Resolution * Resolution * 2];
                float[] result = new float[Resolution * Resolution * 2];
                CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
                CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);

                int i, j;
                Random rnd = new Random();
                double avrg = 0.0;

                for (i = 0; i < Resolution; i++)
                {
                    for (j = 0; j < Resolution; j++)
                    {
                        fData[(i * Resolution + j) * 2] = i + j * 2;
                        fData[(i * Resolution + j) * 2 + 1] = 0.0f;
                    }
                }

                devData.CopyToDevice(fData);

                CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2);
                plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

                cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
                cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);

                cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

                devData.CopyToHost(result);

                for (i = 0; i < Resolution; i++)
                {
                    for (j = 0; j < Resolution; j++)
                    {
                        ResultData[i, j, 0] = result[(i * Resolution + j) * 2];
                        ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1];
                    }
                }   
                ctx.FreeMemory(devData.DevicePointer);
                ctx.FreeMemory(copy_devData.DevicePointer);
            }

kernel code

    //Includes for IntelliSense 
    #define _SIZE_T_DEFINED
    #ifndef __CUDACC__
    #define __CUDACC__
    #endif
    #ifndef __cplusplus
    #define __cplusplus
    #endif


    #include <cuda.h>
    #include <device_launch_parameters.h>
    #include <texture_fetch_functions.h>
    #include "float.h"
    #include <builtin_types.h>
    #include <vector_functions.h>

    // Texture reference
    texture<float2, 2> texref;

    extern "C"
    {
        __global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution)
        {
            int image_x = blockIdx.x * blockDim.x + threadIdx.x;
            int image_y = blockIdx.y;

            data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2];
            data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1];
        }
    }

However this program does not work well. Following error was occurred:

ErrorLaunchFailed: An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA.

score 2 · Answer 1 · answered Nov 13 '15 at 23:06

The FFT-plan takes the number of elements, i.e. number of complex numbers, as argument. So remove the * 2 in the first argument of the plan's constructor. And the times two for the number of batches also doesn't make sense...

Further I'd use the float2 or cuFloatComplex type (in ManagedCuda.VectorTypes) to represent the complex numbers and not two raw floats. And to free memory, use the Dispose methods of CudaDeviceVariable. Otherwise it will be called internally by the GC somewhat later.

The host code would then look something like this:

int Resolution = 512;
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel.ptx", "cu_ArrayInversion");

//float2 or cuFloatComplex
float2[] fData = new float2[Resolution * Resolution];
float2[] result = new float2[Resolution * Resolution];
CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);

int i, j;
Random rnd = new Random();
double avrg = 0.0;

for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
    fData[(i * Resolution + j)].x = i + j * 2;
    fData[(i * Resolution + j)].y = 0.0f;
}
}

devData.CopyToDevice(fData);

//Only Resolution times in X and Resolution batches
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);

cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

devData.CopyToHost(result);

for (i = 0; i < Resolution; i++)
{
    for (j = 0; j < Resolution; j++)
    {
        //ResultData[i, j, 0] = result[(i * Resolution + j)].x;
        //ResultData[i, j, 1] = result[(i * Resolution + j)].y;
    }
}

//And better free memory using Dispose()
//ctx.FreeMemory is only meant for raw device pointers obtained from somewhere else...
devData.Dispose();
copy_devData.Dispose();
plan1D.Dispose();
//For Cuda Memory checker and profiler:
CudaContext.ProfilerStop();
ctx.Dispose();

Please post also your updated host code or double check that it fits to the code above. If I let both of your kernels run with the host code I posted here, everything runs fine. Cuda Memory checker doesn't find anything and no error message. — kunzmi, Nov 17 '15 at 18:16
Thank you for this comments. I checked my code. However I could not find mistake. My program was made by reference to the sites : (setup : https://managedcuda.codeplex.com/documentatio, dll : https://github.com/kunzmi/managedCuda, sample code : https://github.com/kunzmi/managedCuda) . When I make 2D cuda and 1D cuda with managedCUDA, the programs was work well and I could get the good FFT result. — test, Nov 18 '15 at 13:34
Please post your host code exactly as you are using it so that one can reproduce your problem. Again: The host code I posted above runs without problems together with your kernels, so there must be a difference. — kunzmi, Nov 18 '15 at 21:47
I managed now to reproduce your problem: If I compile the host code to 64 bit and the cuda device code with 32 bit address space, I get the same error. Can you try to check this and compile both codes to the same hardware architecture? Actually an error should already be thrown when loading the kernel from PTX, but Nvidia seems to have changed that... — kunzmi, Nov 19 '15 at 19:56
When I changed the architecture from 32 bit to 64 bit (kernel and host), my program work well. Thank you for your support!! — test, Nov 21 '15 at 10:23

test · Answer 2 · 2015-11-18T13:55:09.270

Thank you for this suggestion.

I tried suggested code. However, the error was remain. (error : ErrorLaunchFailed: An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA.)

To use the float2, I changed the cu code as follows

 extern "C"
{

__global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
    {
    int image_x = blockIdx.x * blockDim.x + threadIdx.x;
    int image_y = blockIdx.y;

    data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].x;
    data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].y;
}

When program executes the "cuKernel.Run", the process stoped.

ptx file

.version 4.3
.target sm_20
.address_size 32

    // .globl   cu_ArrayInversion
.global .texref texref;

.visible .entry cu_ArrayInversion(
    .param .u32 cu_ArrayInversion_param_0,
    .param .u32 cu_ArrayInversion_param_1,
    .param .u32 cu_ArrayInversion_param_2
)
{
    .reg .f32   %f<5>;
    .reg .b32   %r<17>;


    ld.param.u32    %r1, [cu_ArrayInversion_param_0];
    ld.param.u32    %r2, [cu_ArrayInversion_param_1];
    ld.param.u32    %r3, [cu_ArrayInversion_param_2];
    cvta.to.global.u32  %r4, %r2;
    cvta.to.global.u32  %r5, %r1;
    mov.u32     %r6, %ctaid.x;
    mov.u32     %r7, %ntid.x;
    mov.u32     %r8, %tid.x;
    mad.lo.s32  %r9, %r7, %r6, %r8;
    mov.u32     %r10, %ctaid.y;
    mad.lo.s32  %r11, %r10, %r3, %r9;
    shl.b32     %r12, %r11, 3;
    add.s32     %r13, %r5, %r12;
    mad.lo.s32  %r14, %r9, %r3, %r10;
    shl.b32     %r15, %r14, 3;
    add.s32     %r16, %r4, %r15;
    ld.global.v2.f32    {%f1, %f2}, [%r13];
    st.global.v2.f32    [%r16], {%f1, %f2};
    ret;
}

score 0 · Accepted Answer · answered Nov 19 '15 at 14:30

Thank you for the message.

host code

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Drawing.Imaging;
using ManagedCuda;
using ManagedCuda.CudaFFT;
using ManagedCuda.VectorTypes;


namespace WFA_CUDA_FFT
{
    public partial class CuFFTMain : Form
    {
        float[, ,] FFTData2D;
        int Resolution;

        const int cuda_blockNum = 256;

        public CuFFTMain()
        {
            InitializeComponent();
            Resolution = 1024;
        }

        private void button1_Click(object sender, EventArgs e)
        {
            cuFFTreconstruct();
        }
        public void cuFFTreconstruct()
        {
            CudaContext ctx = new CudaContext(0);
            ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
            CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);
            float2[] fData = new float2[Resolution * Resolution];
            float2[] result = new float2[Resolution * Resolution];
            FFTData2D = new float[Resolution, Resolution, 2];
            CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
            CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);

            int i, j;
            Random rnd = new Random();
            double avrg = 0.0;

            for (i = 0; i < Resolution; i++)
            {
                for (j = 0; j < Resolution; j++)
                {
                    fData[i * Resolution + j].x = i + j * 2;
                    avrg += fData[i * Resolution + j].x;
                    fData[i * Resolution + j].y = 0.0f;
                }
            }

            avrg = avrg / (double)(Resolution * Resolution);

            for (i = 0; i < Resolution; i++)
            {
                for (j = 0; j < Resolution; j++)
                {
                    fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
                }
            }

            devData.CopyToDevice(fData);

            CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
            plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

            cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
            cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);

            cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

            copy_devData.CopyToHost(result);

            for (i = 0; i < Resolution; i++)
            {
                for (j = 0; j < Resolution; j++)
                {
                    FFTData2D[i, j, 0] = result[i * Resolution + j].x;
                    FFTData2D[i, j, 1] = result[i * Resolution + j].y;
                }
            }

            //Clean up
            devData.Dispose();
            copy_devData.Dispose();
            plan1D.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();
        }
    }
}

kernel code

//Includes for IntelliSense 
#define _SIZE_T_DEFINED
#ifndef __CUDACC__
#define __CUDACC__
#endif
#ifndef __cplusplus
#define __cplusplus
#endif


#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
#include <vector>

// Texture reference
texture<float2, 2> texref;

extern "C"
{
    // Device code

    __global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
    {
        int image_x = blockIdx.x * blockDim.x + threadIdx.x;
        int image_y = blockIdx.y;

        data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].x;
        data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].y;
    }
}

First I compiled with .Net4.5. This program did not work, and error (System.BadImageFormatException) was showed. However when the FFT function is comment out, the kernel program run.

Second I chaneged from .Net 4.5 to .Net 4.0. The FFT function works, but kernel does not run and shows errors.

My PC is windows 8.1 pro and I use visual studio 2013.

1D FFT plus kernel calculation with managedCUDA

3 Answers3