I'm writing a library that does fast matrix math on the gpu. It uses function pointers for doing operations on data in the matrixes.
I'm using Cuda 11.2 with Visual Studio 2019.
Now, my problem is that the kernel I use to do operations on the matrix elements crashes.
It seems to do so consistently when calling the function pointer that was supplied to do the operation that's requested. It does not crash when I aquire the function pointer in the function that wraps the kernel. However, if I aquire the function pointer anywhere outside of the .cu file the kernel is located in, it crashes. It also only crashes in release mode, not in debug mode.
I have absolutely no idea whats going on here, is there something i overlooked? To me this seems rather like a compiler bug as it only happens in release mode and only when I aquire the function pointer in a different .cu file that where it is being used.
Is there any workaround for my problem?
This crash is also reproducible in a different project, so it's not some old compile artifacts or something.
I also made a minimal reproducible example:
kernel.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <iostream>
#include "crash.cuh"
template<class fnType, fnType fn>
static __global__ void __cudaFnPointerKernel(volatile void** fnPointer)
{
auto fnCopy = fn;
*fnPointer = *((void**)&fnCopy);
}
template<class fnType, fnType fn>
static inline decltype(fn) cudaFnPointer()
{
volatile void** ptr = 0;
cudaMallocManaged(&ptr, sizeof(*ptr));
__cudaFnPointerKernel<fnType, fn> << <1, 1 >> > (ptr);
volatile auto err = cudaDeviceSynchronize();
decltype(fn) ptrCopy = (decltype(fn))*ptr;
cudaFree(ptr);
return ptrCopy;
}
__device__ float reLU(float x)
{
return x > 0 ? x : 0;
}
__global__ void callFn(functionPointerType fn)
{
fn(1);
}
int main()
{
functionPointerConsumer consumer;
auto fnPtr = cudaFnPointer<decltype(reLU)*, reLU>();
std::cout << "fnPtr: " << fnPtr << "\n";
callFn<<<1, 1>>>(fnPtr);
std::cout << "Kernel Error 1: " << cudaDeviceSynchronize() << "\n";
consumer.crash(fnPtr);
std::cout << "Kernel Error 2: " << cudaDeviceSynchronize() << "\n";
}
crash.cuh:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <iostream>
typedef float(*functionPointerType)(float);
struct functionPointerConsumer
{
void crash(functionPointerType fn);
};
crash.cu:
#include "crash.cuh"
__global__ void crashKernel(functionPointerType fn)
{
fn(1);
printf("Kernel did not crash\n");
}
void functionPointerConsumer::crash(functionPointerType fn)
{
crashKernel<<<1, 1>>>(fn);
}
Expected output/output in debug mode:
fnPtr: 00000007009A2E00
Kernel Error 1: 0
Kernel did not crash
Kernel Error 2: 0
Actual output/output in release mode:
fnPtr: 0000000000000008
Kernel Error 1: 0
Kernel Error 2: 718
The error code with which the kernel crashes is 718/cudaErrorInvalidPc. I don't know why it is cudaErrorInvalidPc though.