Alea GPU up to version 2.2 (the newest at the time of writing) doesn't support allocating 2D arrays on the device yet, so you have to flatten the row/column index yourself in the kernel. On the host side, you can write extension methods that use CUDA Driver API P/Invoke functions (available from Alea.CUDA.dll) to transfer a pinned .NET array to or from the device.
So here is a quick workaround I wrote:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using Alea.CUDA;
using Alea.CUDA.IL;
using NUnit.Framework;
namespace ConsoleApplication1
{
/// <summary>
/// Host-side helpers for moving rectangular .NET arrays to and from device
/// memory. A <c>T[,]</c> is stored row-major in managed memory, so it can be
/// pinned and copied as one contiguous block of <c>rows * cols</c> elements.
/// </summary>
static class Extension
{
    /// <summary>
    /// Allocates device memory sized for <paramref name="array2D"/> and copies
    /// the array's contents to it (flattened row-major, matching the .NET
    /// layout of <c>T[,]</c>).
    /// </summary>
    /// <returns>The populated device buffer; the caller owns and must dispose it.</returns>
    public static DeviceMemory<T> Malloc<T>(this Worker worker, T[,] array2D)
    {
        var rows = array2D.GetLength(0);
        var cols = array2D.GetLength(1);
        var dmem = worker.Malloc<T>(rows*cols);
        try
        {
            // Pin the managed array so the GC cannot relocate it while the
            // CUDA driver reads from its raw address.
            var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
            try
            {
                var hostPtr = handle.AddrOfPinnedObject();
                var devicePtr = dmem.Handle;
                // The copy must run with the worker's CUDA context pushed
                // onto the current thread, so route it through EvalAction.
                worker.EvalAction(() =>
                {
                    CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyHtoD(devicePtr, hostPtr,
                        new IntPtr(Intrinsic.__sizeof<T>()*rows*cols)));
                });
            }
            finally
            {
                handle.Free();
            }
        }
        catch
        {
            // Don't leak the device allocation when pinning or the copy fails.
            dmem.Dispose();
            throw;
        }
        return dmem;
    }

    /// <summary>
    /// Allocates (uninitialized) device memory large enough to hold a
    /// <paramref name="rows"/> x <paramref name="cols"/> matrix of <typeparamref name="T"/>.
    /// </summary>
    public static DeviceMemory<T> Malloc<T>(this Worker worker, int rows, int cols)
    {
        return worker.Malloc<T>(rows*cols);
    }

    /// <summary>
    /// Copies the contents of <paramref name="dmem"/> back into
    /// <paramref name="array2D"/> (row-major). The device buffer must hold at
    /// least <c>rows * cols</c> elements.
    /// </summary>
    public static void Gather<T>(this DeviceMemory<T> dmem, T[,] array2D)
    {
        var rows = array2D.GetLength(0);
        var cols = array2D.GetLength(1);
        // Pin the destination so the GC cannot move it during the D2H copy.
        var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
        try
        {
            var hostPtr = handle.AddrOfPinnedObject();
            var devicePtr = dmem.Handle;
            // As with the upload, the worker's CUDA context must be current
            // on this thread while the driver copy executes.
            dmem.Worker.EvalAction(() =>
            {
                CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyDtoH(hostPtr, devicePtr,
                    new IntPtr(Intrinsic.__sizeof<T>() * rows * cols)));
            });
        }
        finally
        {
            handle.Free();
        }
    }
}
class Program
{
    // Maps a (row, col) pair to its linear index in a row-major buffer.
    static int FlattenIndex(int row, int col, int cols)
    {
        return col + cols*row;
    }

    // Copies every element of a row-major flattened 2D input buffer into the
    // output buffer. For simplicity, everything runs in a single GPU thread.
    [AOTCompile]
    static void Kernel(deviceptr<double> outputs, deviceptr<double> inputs, int rows, int cols)
    {
        for (var r = 0; r < rows; r++)
        {
            for (var c = 0; c < cols; c++)
            {
                var idx = FlattenIndex(r, c, cols);
                outputs[idx] = inputs[idx];
            }
        }
    }

    [Test]
    public static void Test()
    {
        // Kept deliberately small, since the kernel uses only one GPU thread.
        const int rows = 10;
        const int cols = 5;

        var worker = Worker.Default;

        // Fill the host matrix with random values in [1, 100).
        var rng = new Random();
        var inputs = new double[rows, cols];
        for (var r = 0; r < rows; ++r)
        {
            for (var c = 0; c < cols; ++c)
            {
                inputs[r, c] = rng.Next(1, 100);
            }
        }

        // Upload, run the copy kernel, and read the result back.
        var dInputs = worker.Malloc(inputs);
        var dOutputs = worker.Malloc<double>(rows, cols);
        worker.Launch(Kernel, new LaunchParam(1, 1), dOutputs.Ptr, dInputs.Ptr, rows, cols);

        var outputs = new double[rows, cols];
        dOutputs.Gather(outputs);

        // The round-tripped matrix must match the original element-for-element.
        Assert.AreEqual(inputs, outputs);
    }

    public static void Main(string[] args)
    {
    }
}
}