
I am testing AleaTK for a task I need done.

The GitHub repo is Here.

If I download the repo and test the PTB example, it works fine!

If I instead install the AleaTK NuGet package and add the classes 'Lstm.cs' and 'LstmRnnType.cs', with all the necessary extensions, I get an error:

The method or operation is not implemented.

Now, the call is coming from:

Optimizer.Forward();

and Optimizer is:

Optimizer = new GradientDescentOptimizer(ctx, Loss.Loss, cfg.LearningRate, new GlobalNormGradientClipper(cfg.MaxGradNorm));

The same code as in the PTB project.

The Forward method is overridden, as far as I can tell:

public override void Forward(Executor executor)

and

public override void Forward(Executor executor)

in both classes, Lstm and Rnn.

Looking through the code, there are a LOT of throw new NotImplementedException() statements, with zero indication of where the problem is coming from.

Is there a quick way to find exactly where this error is coming from and why, and what exactly I have missed?

at System.Threading.Tasks.Task.ThrowIfExceptional(Boolean includeTaskCanceledExceptions)
at System.Threading.Tasks.Task.Wait(Int32 millisecondsTimeout, CancellationToken cancellationToken)
at AleaTK.Assignment.Run()
at AleaTK.ML.Executor.AssignTensor[T](Variable`1 variable, Expr`1 expr)
at AleaTK.ML.Executor.Forward()
at AleaConsole.Model..ctor(Context ctx, Config cfg, Boolean isTraining, Boolean usingCuDnn) in D:\C# Projects\Machine Learning Examples\Alea TK for ML\AleaConsole\AleaConsole\Program.cs:line 693
at AleaConsole.Program.Main(String[] args) in D:\C# Projects\Machine Learning Examples\Alea TK for ML\AleaConsole\AleaConsole\Program.cs:line 116
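
Because the assignment runs inside a Task, the NotImplementedException is wrapped and the trace points at Task.Wait rather than the throw site. One quick way to surface the original origin (besides ticking NotImplementedException under Exception Settings in Visual Studio, so the debugger breaks when it is thrown) is to log first-chance exceptions. A minimal sketch, wired up at the top of Main before the model is built:

// Sketch: report every NotImplementedException at the moment it is
// thrown, before the Task machinery re-wraps it, so the stack trace
// shows the real origin inside AleaTK.
AppDomain.CurrentDomain.FirstChanceException += (sender, e) =>
{
    if (e.Exception is NotImplementedException)
        Console.WriteLine(e.Exception.StackTrace);
};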

namespace AleaConsole
{



#region Using Statements:



using System;
using System.IO;
using System.Net;
using System.Linq;
using System.Text;
using System.Diagnostics;
using System.IO.Compression;
using System.Collections.Generic;

using Alea;
using AleaTK;
using AleaTK.ML;
using AleaTK.ML.Operator;
using static AleaTK.Library;
using static AleaTK.ML.Library;
using Context = AleaTK.Context;

using NUnit.Framework;
using ICSharpCode.SharpZipLib.Tar;



#endregion


public static class Extensions
{



public static void Iter<T>(this IEnumerable<T> ie, Action<T, int> action)
{

var i = 0;

foreach (var e in ie)
{
action(e, i++);
}
}
}



class Program
{



#region Fields:



public const int TestHiddenSize = -1;



public const bool Profiling = false;



public const int TestMaxMaxEpoch = Profiling ? 1 : -1;



public const string DataPath = @"Data\PTB\simple-examples\data";



#endregion



#region Properties:



#endregion




static void Main(string[] args)
{


bool isConsole = true;
bool usingCuDnn = true;


Console.WriteLine($"UsingCUDNN({usingCuDnn}), Config: small");

var ptb = new Data(DataPath);
var ctx = Context.GpuContext(0);

Config cfg, cfgValid, cfgTest, cfgInteractive;
cfg = Config.Small(batchSize: 20);
cfgValid = Config.Small(batchSize: 20);
cfgTest = Config.Small(batchSize: 1, numSteps: 1);
cfgInteractive = Config.Small(batchSize: 1, numSteps: 10);

Assert.AreEqual(ptb.WordToIdDict.Count, cfg.VocabSize);
Assert.AreEqual(ptb.WordToIdDict.Count, cfgValid.VocabSize);
Assert.AreEqual(ptb.WordToIdDict.Count, cfgTest.VocabSize);
Assert.AreEqual(ptb.WordToIdDict.Count, cfgInteractive.VocabSize);

var model = new Model(ctx, cfg, isTraining: true, usingCuDnn: usingCuDnn);
var modelValid = new Model(ctx, cfgValid, isTraining: false, usingCuDnn: usingCuDnn);
var modelTest = new Model(ctx, cfgTest, isTraining: false, usingCuDnn: usingCuDnn);
var modelInteractive = new Model(ctx, cfgInteractive, isTraining: false, usingCuDnn: usingCuDnn);

for (var i = 0; i < cfg.MaxMaxEpoch; ++i)
{
var lrDecay = Math.Pow(cfg.LrDecay, Math.Max(i - cfg.MaxEpoch, 0.0));
var learningRate = cfg.LearningRate * lrDecay;

Console.WriteLine($"Epoch: {i + 1} Learning rate: {learningRate:F3}");
var trainPerplexity = model.RunEpoch(ptb.TrainData, learningRate: learningRate, verbose: true);
Console.WriteLine($"Epoch: {i + 1} Train Perplexity: {trainPerplexity:F3}");

if (!Profiling)
{
modelValid.CopyWeightsFrom(model);
var validPerplexity = modelValid.RunEpoch(ptb.ValidData);
Console.WriteLine($"Epoch: {i + 1} Valid Perplexity: {validPerplexity:F3}");
}
}

if (!Profiling)
{
modelTest.CopyWeightsFrom(model);
Console.WriteLine("Testing with test data, this is slow, since batch size is set to small...");
var testPerplexity = modelTest.RunEpoch(ptb.TestData, verbose: true);
Console.WriteLine($"Test Perplexity: {testPerplexity:F3}");
}

if (!Profiling && isConsole)
{
var inputs = new int[cfgInteractive.NumSteps, 1];
modelInteractive.CopyWeightsFrom(model);

// since the cross-entropy and softmax are merged, we have to allocate the target (label) tensor
modelInteractive.Optimizer.AssignTensor(modelInteractive.Targets, inputs.AsTensor());

while (true)
{
Console.WriteLine();
Console.WriteLine($"Enter some words (less than {cfgInteractive.NumSteps} words)");
var readLine = Console.ReadLine();
if (readLine == null) break;
var line = readLine.Trim(' ', '\t', '\r', '\n');
var words = line.Split(new[] { ' ', '\t', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
if (words.Length <= 0 || words.Length > cfgInteractive.NumSteps) continue;

for (var i = 0; i < cfgInteractive.NumSteps; ++i)
{
if (i < words.Length)
{
inputs[i, 0] = ptb.WordToId(words[i]);
}
else
{
inputs[i, 0] = ptb.WordToId("<unk>");
}
}

Console.WriteLine("Your inputs are:");
for (var i = 0; i < cfgInteractive.NumSteps; ++i)
{
Console.Write($"{ptb.IdToWord(inputs[i, 0])} ");
}
Console.WriteLine();

modelInteractive.ResetStates();
modelInteractive.Optimizer.AssignTensor(modelInteractive.Inputs, inputs.AsTensor());
modelInteractive.Optimizer.Forward();

var logPred = modelInteractive.Optimizer.GetTensor(modelInteractive.Loss.LogPred).ToArray2D();
var pred = new List<IndexAndProb>();
var totalProb = 0.0;
for (var i = 0; i < cfgInteractive.VocabSize; ++i)
{
var p = new IndexAndProb { Index = i, Prob = Math.Exp(logPred[words.Length - 1, i]) };
pred.Add(p);
totalProb += p.Prob;
}
Console.WriteLine($"Total probability: {totalProb:F4}");
pred.Sort();
Console.WriteLine("Candidates are:");
pred.Take(10).Iter((x, o) => { Console.WriteLine($" {x.Prob:P2} --> {ptb.IdToWord(x.Index)}"); });
}
}
}
}



public class Data
{


private static void Decompress(string src, string dst)
{
using (var originalFileStream = File.OpenRead(src))
using (var decompressedFileStream = File.Create(dst))
using (var decompressionStream = new GZipStream(originalFileStream, CompressionMode.Decompress))
{
decompressionStream.CopyTo(decompressedFileStream);
}
}



public static void EnsureDataFile()
{
const string doneFileName = @"Data\PTB.done";
const string url = @"http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz";

if (!Directory.Exists("Data"))
{
Directory.CreateDirectory("Data");
}

if (!File.Exists(doneFileName))
{
using (var client = new WebClient())
{
Console.WriteLine($"Downloading {url} ...");
client.DownloadFile(url, @"Data\PTB.tgz");
}

Decompress(@"Data\PTB.tgz", @"Data\PTB.tar");

using (var tarFile = File.OpenRead(@"Data\PTB.tar"))
using (var tarArchive = TarArchive.CreateInputTarArchive(tarFile))
{
tarArchive.ExtractContents(@"Data\PTB");
}

using (var doneFile = File.CreateText(doneFileName))
{
doneFile.WriteLine($"{DateTime.Now}");
}
}
}



public static List<string> ReadWords(string path)
{

var totalWords = new List<string>();
using (var file = File.Open(path, FileMode.Open))
using (var reader = new StreamReader(file, Encoding.UTF8))
{
while (!reader.EndOfStream)
{
var line = reader.ReadLine();
var words = line?.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
if (!(words?.Length > 0)) continue;
totalWords.AddRange(words);
totalWords.Add("<eos>");
}
}
return totalWords;
}



public static void BuildVocabulary(string path, out Dictionary<string, int> word2id, out Dictionary<int, string> id2word)
{
var data = ReadWords(path).Distinct().ToList();
data.Sort();
word2id = new Dictionary<string, int>();
id2word = new Dictionary<int, string>();
var id = 0;
foreach (var word in data)
{
word2id.Add(word, id);
id2word.Add(id, word);
id++;
}
}



public readonly Dictionary<string, int> WordToIdDict;



public readonly Dictionary<int, string> IdToWordDict;



public readonly int[] TrainData;



public readonly int[] ValidData;



public readonly int[] TestData;



public int WordToId(string word)
{
return WordToIdDict.ContainsKey(word) ? WordToIdDict[word] : WordToIdDict["<unk>"];
}



public string IdToWord(int id)
{
return IdToWordDict[id];
}



public Data(string dataPath)
{

EnsureDataFile();

var trainPath = Path.Combine(dataPath, "ptb.train.txt");
var validPath = Path.Combine(dataPath, "ptb.valid.txt");
var testPath = Path.Combine(dataPath, "ptb.test.txt");

BuildVocabulary(trainPath, out WordToIdDict, out IdToWordDict);

TrainData = ReadWords(trainPath).Select(WordToId).ToArray();
ValidData = ReadWords(validPath).Select(WordToId).ToArray();
TestData = ReadWords(testPath).Select(WordToId).ToArray();
}



public List<string> GetWords(int from, int to)
{

var words = new List<string>();

for (var i = from; i < to; ++i)
words.Add(IdToWordDict[TrainData[i]]);

return words;
}



public class Batch
{
public int[,] Inputs { get; set; }
public int[,] Targets { get; set; }
}



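// Reshape the flat word-id stream into [numSteps, batchSize] batches;
// Targets are Inputs shifted one time step ahead.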
public static IEnumerable<Batch> Iterator(int[] rawData, int numSteps, int batchSize)
{

var dataLen = rawData.Length;
var batchLen = dataLen / batchSize;
var data = new int[batchSize, batchLen];
for (var i = 0; i < batchSize; ++i)
{
for (var j = 0; j < batchLen; ++j)
{
data[i, j] = rawData[batchLen * i + j];
}
}

var epochSize = (batchLen - 1) / numSteps;

Util.EnsureTrue(epochSize != 0);

for (var i = 0; i < epochSize; ++i)
{
var x = new int[numSteps, batchSize];
var y = new int[numSteps, batchSize];

for (var t = 0; t < numSteps; ++t)
{
for (var j = 0; j < batchSize; ++j)
{
x[t, j] = data[j, numSteps * i + t];
y[t, j] = data[j, numSteps * i + t + 1];
}
}

yield return new Batch { Inputs = x, Targets = y };
}
}
}



public class IndexAndProb : IComparable
{


public int Index;



public double Prob;



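// Sort descending by probability, so the most likely words come first.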
public int CompareTo(object obj)
{
var o = (IndexAndProb)obj;
if (Prob == o.Prob) return 0;
return Prob > o.Prob ? -1 : 1;
}


public override string ToString()
{
return $"({Index}:{Prob:F2})";
}
}



public class Config
{

public double InitScale;
public double LearningRate;
public double MaxGradNorm;
public int NumLayers;
public int NumSteps;
public int HiddenSize;
public int MaxEpoch;    // learning rate start to reduce after this epoch
public int MaxMaxEpoch; // epoches to run
public double KeepProb;
public double LrDecay;
public int BatchSize;
public int VocabSize;

public const bool Profiling = false;

public const int TestMaxMaxEpoch = Profiling ? 1 : -1;

public const int TestHiddenSize = -1;

public static Config Small(int batchSize = 20, int numSteps = 20, double keepProb = 1.0)
{

return new Config
{

InitScale = 0.1,
LearningRate = 1.0,
MaxGradNorm = 5.0,
NumLayers = 2,
NumSteps = numSteps,
HiddenSize = TestHiddenSize > 0 ? TestHiddenSize : 200,
MaxEpoch = 4,
MaxMaxEpoch = TestMaxMaxEpoch > 0 ? TestMaxMaxEpoch : 13,
KeepProb = keepProb,
LrDecay = 0.5,
BatchSize = batchSize,
VocabSize = 10000
};
}



public static Config Medium(int batchSize = 20, int numSteps = 35, double keepProb = 0.5)
{

return new Config
{

InitScale = 0.05,
LearningRate = 1.0,
MaxGradNorm = 5.0,
NumLayers = 2,
NumSteps = numSteps,
HiddenSize = TestHiddenSize > 0 ? TestHiddenSize : 650,
MaxEpoch = 6,
MaxMaxEpoch = TestMaxMaxEpoch > 0 ? TestMaxMaxEpoch : 39,
KeepProb = keepProb,
LrDecay = 0.8,
BatchSize = batchSize,
VocabSize = 10000
};
}



public static Config Large(int batchSize = 20, int numSteps = 35, double keepProb = 0.35)
{

return new Config
{

InitScale = 0.04,
LearningRate = 1.0,
MaxGradNorm = 10.0,
NumLayers = 2,
NumSteps = numSteps,
HiddenSize = TestHiddenSize > 0 ? TestHiddenSize : 1500,
MaxEpoch = 14,
MaxMaxEpoch = TestMaxMaxEpoch > 0 ? TestMaxMaxEpoch : 55,
KeepProb = keepProb,
LrDecay = 1.0 / 1.15,
BatchSize = batchSize,
VocabSize = 10000
};
}
}



public class Model
{



#region Fields:



public enum ConfigType
{

Small = 0,
Medium,
Large
}



public const string DataPath = @"Data\PTB\simple-examples\data";



public const bool Profiling = false;



public const int TestMaxMaxEpoch = Profiling ? 1 : -1;



public const int TestHiddenSize = -1;



public const ConfigType CfgType = ConfigType.Small;  // ConfigType.Small, ConfigType.Large



#endregion



#region Properties:



public Config Config { get; }



public bool IsTraining { get; }



public bool UsingCuDnn { get; }



public Variable<int> Inputs { get; }



public Variable<int> Targets { get; }



public Embedding<float> Embedding { get; }



public Variable<float> EmbeddedOutput { get; }



public Lstm<float>[] RnnDirect { get; }



public Rnn<float> RnnAccelerated { get; }



public Variable<float> RnnOutput { get; }



public FullyConnected<float> FC { get; }



public SoftmaxCrossEntropySparse<float> Loss { get; }



public GradientDescentOptimizer Optimizer { get; }



#endregion



public Model(Context ctx, Config cfg, bool isTraining = true, bool usingCuDnn = true)
{

Config = cfg;
IsTraining = isTraining;
UsingCuDnn = usingCuDnn;

Inputs = Variable<int>(PartialShape.Create(cfg.NumSteps, cfg.BatchSize));
Targets = Variable<int>(PartialShape.Create(cfg.NumSteps, cfg.BatchSize));

// embedding
Embedding = new Embedding<float>(Inputs, cfg.VocabSize, cfg.HiddenSize, initScale: cfg.InitScale);

// add dropout
EmbeddedOutput = Embedding.Output;
if (isTraining && cfg.KeepProb < 1.0)
{
var dropout = new Dropout<float>(EmbeddedOutput, dropoutProb: 1.0 - cfg.KeepProb);
EmbeddedOutput = dropout.Output;
}

// rnn layer, dropout for intermediate lstm layers and for output
if (usingCuDnn)
{
RnnAccelerated = new Rnn<float>(new LstmRnnType(forgetBiasInit: 0.0), EmbeddedOutput, cfg.NumLayers, cfg.HiddenSize, isTraining: isTraining, dropout: isTraining && cfg.KeepProb < 1.0 ? 1.0 - Config.KeepProb : 0.0);
RnnOutput = RnnAccelerated.Y;
if (isTraining && cfg.KeepProb < 1.0)
{
var dropout = new Dropout<float>(RnnOutput, dropoutProb: 1.0 - cfg.KeepProb);
RnnOutput = dropout.Output;
}
}
else
{
RnnDirect = new Lstm<float>[cfg.NumLayers];
for (var i = 0; i < cfg.NumLayers; ++i)
{
var lstm = new Lstm<float>(i == 0 ? EmbeddedOutput : RnnOutput, cfg.HiddenSize, forgetBiasInit: 0.0);
RnnDirect[i] = lstm;
RnnOutput = lstm.Y;
if (isTraining && cfg.KeepProb < 1.0)
{
var dropout = new Dropout<float>(RnnOutput, dropoutProb: 1.0 - cfg.KeepProb);
RnnOutput = dropout.Output;
}
}
}

FC = new FullyConnected<float>(RnnOutput.Reshape(RnnOutput.Shape[0] * RnnOutput.Shape[1], RnnOutput.Shape[2]), cfg.VocabSize);

Loss = new SoftmaxCrossEntropySparse<float>(FC.Output, Targets.Reshape(Targets.Shape[0] * Targets.Shape[1]));

Optimizer = new GradientDescentOptimizer(ctx, Loss.Loss, cfg.LearningRate, new GlobalNormGradientClipper(cfg.MaxGradNorm));

// warmup to force JIT compilation to get timings without JIT overhead
Optimizer.Initalize();

ResetStates();

Optimizer.AssignTensor(Inputs, Fill(Shape.Create(Inputs.Shape.AsArray), 0));
Optimizer.AssignTensor(Targets, Fill(Shape.Create(Targets.Shape.AsArray), 0));

Optimizer.Forward();

if (isTraining)
{
Optimizer.Backward();
}

// now reset states
Optimizer.Initalize();
ResetStates();
}



public void CopyWeightsFrom(Model o)
{

Optimizer.AssignTensor(Embedding.Weights, o.Optimizer.GetTensor(o.Embedding.Weights));
Optimizer.AssignTensor(FC.Weights, o.Optimizer.GetTensor(o.FC.Weights));
Optimizer.AssignTensor(FC.Bias, o.Optimizer.GetTensor(o.FC.Bias));
if (UsingCuDnn)
{
Util.EnsureTrue(o.UsingCuDnn);
Optimizer.AssignTensor(RnnAccelerated.W, o.Optimizer.GetTensor(o.RnnAccelerated.W));
}
else
{
Util.EnsureTrue(!o.UsingCuDnn);
for (var i = 0; i < Config.NumLayers; ++i)
{
Optimizer.AssignTensor(RnnDirect[i].W, o.Optimizer.GetTensor(o.RnnDirect[i].W));
}
}
}



public void ResetStates()
{

if (UsingCuDnn)
{
Optimizer.AssignTensor(RnnAccelerated.CX, Fill(Shape.Create(RnnAccelerated.CX.Shape.AsArray), 0.0f));
Optimizer.AssignTensor(RnnAccelerated.HX, Fill(Shape.Create(RnnAccelerated.HX.Shape.AsArray), 0.0f));
}
else
{

for (var i = 0; i < Config.NumLayers; ++i)
{
var lstm = RnnDirect[i];
var shape = Shape.Create(Config.BatchSize, lstm.HiddenSize);
Optimizer.AssignTensor(lstm.CX, Fill(shape, 0.0f));
Optimizer.AssignTensor(lstm.HX, Fill(shape, 0.0f));
}
}
}



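// Carry the final states (CY/HY) of the previous batch over as the
// initial states (CX/HX) of the next batch (truncated BPTT).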
public void CopyStates()
{

if (UsingCuDnn)
{
Optimizer.AssignTensor(RnnAccelerated.CX, Optimizer.GetTensor(RnnAccelerated.CY));
Optimizer.AssignTensor(RnnAccelerated.HX, Optimizer.GetTensor(RnnAccelerated.HY));
}
else
{
for (var i = 0; i < Config.NumLayers; ++i)
{
var lstm = RnnDirect[i];
Optimizer.AssignTensor(lstm.CX, Optimizer.GetTensor(lstm.CY));
Optimizer.AssignTensor(lstm.HX, Optimizer.GetTensor(lstm.HY));
}
}
}



public double RunEpoch(int[] data, double learningRate = 1.0, bool verbose = false)
{

var cfg = Config;
var isTraining = IsTraining;
var epochSize = (data.Length / cfg.BatchSize - 1) / cfg.NumSteps;
var time = Stopwatch.StartNew();
var costs = 0.0;
var iters = 0;
var step = 0;
var firstBatch = true;

foreach (var batch in Data.Iterator(data, cfg.NumSteps, cfg.BatchSize))
{
Optimizer.AssignTensor(Inputs, batch.Inputs.AsTensor());
Optimizer.AssignTensor(Targets, batch.Targets.AsTensor());

if (firstBatch)
{
ResetStates();
firstBatch = false;
}
else
{
CopyStates();
}

Optimizer.Forward();

if (isTraining)
{
Optimizer.Backward();
Optimizer.Optimize(learningRate);
}

var loss = Optimizer.GetTensor(Loss.Loss).ToScalar();
var cost = loss / cfg.BatchSize;
costs += cost;
iters += cfg.NumSteps;

if (Profiling || (verbose && (step % (epochSize / 10) == 10)))
{
var perplexity = Math.Exp(costs / iters);
var wps = (iters * cfg.BatchSize) / (time.Elapsed.TotalMilliseconds / 1000.0);

Console.WriteLine($"{step:D4}: {step * 1.0 / epochSize:F3} perplexity: {perplexity:F3} speed:{wps:F0} wps cost: {cost:F3}");
}

if (Profiling && step > 5) break;

step++;
}

return Math.Exp(costs / iters);
}
}
}

The LSTM Class: Here

<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <configSections>
    <section name="aleaSettings" type="Alea.Settings, Alea" />
  </configSections>
  <aleaSettings>
    <cuBLAS version="9.0" />
    <cuRAND version="9.0" />
  </aleaSettings>
  <startup>
    <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
  </startup>
  <runtime>
    <assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
      <dependentAssembly>
        <assemblyIdentity name="FSharp.Core" publicKeyToken="b03f5f7f11d50a3a" culture="neutral" />
        <bindingRedirect oldVersion="0.0.0.0-4.7.0.0" newVersion="4.7.0.0" />
      </dependentAssembly>
    </assemblyBinding>
  </runtime>
</configuration>

The App.config file might have been enough to get it working.

@mjwills - thank you for your patience! I have quickly rewritten it as a console app. You will need the DLLs 'cudnn64_5' and 'curand64_75'. I hope this example is better now! – Rusty Nail Aug 14 '19 at 00:10

1 Answer


After quite some debugging, I believe I have found the problem.

<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <configSections>
    <section name="aleaSettings" type="Alea.Settings, Alea" />
  </configSections>
  <aleaSettings>
    <cuBLAS version="9.0" />
    <cuRAND version="9.0" />
  </aleaSettings>
  <startup>
    <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.2" />
  </startup>
  <runtime>
    <assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
      <dependentAssembly>
        <assemblyIdentity name="FSharp.Core" publicKeyToken="b03f5f7f11d50a3a" culture="neutral" />
        <bindingRedirect oldVersion="0.0.0.0-4.7.0.0" newVersion="4.7.0.0" />
      </dependentAssembly>
    </assemblyBinding>
  </runtime>
</configuration>

Where CUDA 9.0 matches the version I currently have installed.

Also, you need to set your projects to x64:

(Screenshots: the project Build settings with Platform target set to x64.)
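
If you prefer editing the project file directly, the equivalent setting in an old-style .csproj looks roughly like this (a sketch; the configuration name and output path are assumptions, match them to your own build setup):

<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
  <!-- Force 64-bit output so the 64-bit CUDA DLLs can load. -->
  <PlatformTarget>x64</PlatformTarget>
  <OutputPath>bin\x64\Debug\</OutputPath>
</PropertyGroup>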

Make sure you install FSharp.Core - it won't work without it.

You need to copy:

  • cudnn64_5
  • curand64_75

To the bin\x64\Debug folder.
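
Rather than copying by hand, a post-build event can automate this (a sketch; the libs folder is an assumption, point it at wherever the DLLs actually live):

rem Post-build event: copy the CUDA DLLs next to the executable.
xcopy /y "$(SolutionDir)libs\cudnn64_5.dll" "$(TargetDir)"
xcopy /y "$(SolutionDir)libs\curand64_75.dll" "$(TargetDir)"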

But overall, what fixed the throw new NotImplementedException() exception was the configuration in the App.config file telling Alea where to find cuBLAS.

From Here:

if (!Execute(assignment, lValue)) throw new NotImplementedException();

in the

public override void Execute(Assignment assignment)

in the

public abstract class LExpr<T> : Expr<T>

which calls this when executed:

protected override bool Execute(Assignment assignment, ILValue<T> output)
{
var a = assignment.GetInput(A).ToLValue();
var b = assignment.GetInput(B).ToLValue();

var aRows = a.Layout.Shape[0];
var aCols = a.Layout.Shape[1];
var bRows = b.Layout.Shape[0];
var bCols = b.Layout.Shape[1];

if (assignment.Context.Type == ContextType.Gpu
&& Alea.cuBLAS.Blas.IsAvailable
&& a.Layout.IsFullyPacked
&& b.Layout.IsFullyPacked
&& output.Layout.IsInnerChangeMostFullyPacked)
{

var context = assignment.Context.ToGpuContext();
var blas = context.Blas;

var aPtr = a.Buffer.Ptr;
var bPtr = b.Buffer.Ptr;
var cPtr = output.Buffer.Ptr;

var m = (int)bCols;
var n = (int)aRows;
var k = (int)aCols;
var lda = a.Layout.IsInnerChangeMost ? aCols : aRows;
var ldb = b.Layout.IsInnerChangeMost ? bCols : bRows;
var ldc = output.Layout.Shape[1];

var opa = a.Layout.IsInnerChangeMost ? Alea.cuBLAS.Operation.N : Alea.cuBLAS.Operation.T;
var opb = b.Layout.IsInnerChangeMost ? Alea.cuBLAS.Operation.N : Alea.cuBLAS.Operation.T;

if (typeof(T) == typeof(double))
{
blas.Gemm(opb, opa, m, n, k, 1.0, bPtr.Reinterpret<double>(), (int)ldb, aPtr.Reinterpret<double>(), (int)lda,
0.0, cPtr.Reinterpret<double>(), (int)ldc);
return true;
}

if (typeof(T) == typeof(float))
{
blas.Gemm(opb, opa, m, n, k, 1.0f, bPtr.Reinterpret<float>(), (int)ldb, aPtr.Reinterpret<float>(), (int)lda,
0.0f, cPtr.Reinterpret<float>(), (int)ldc);
return true;
}
}

var readA = a.Buffer.GetReader2();
var readB = b.Buffer.GetReader2();
var writeC = output.Buffer.Writer2;
var zero = Zero;
var add = Add;
var mul = Mul;

//if (assignment.Context.Type == ContextType.Gpu)
//{
//    Func<long, long, T> getA = (row, col) => row < aRows && col < aCols ? readA(row, col) : zero;
//    Func<long, long, T> getB = (row, col) => row < bRows && col < bCols ? readB(row, col) : zero;
//    Action<long, long, T> setC = (row, col, value) =>
//    {
//        if (row < aRows && col < bCols) writeC(row, col, value);
//    };
//    var blockSize = new dim3(BlockSize, BlockSize);
//    var gridSize = new dim3((int)ScalarOps.DivUp(aRows, BlockSize), (int)ScalarOps.DivUp(bCols, BlockSize));
//    var lp = new LaunchParam(gridSize, blockSize);
//    var stream = assignment.Context.ToGpuContext().Stream;
//    stream.Launch(Kernel, lp, aCols, getA, getB, setC, zero, add, mul);
//    return true;
//}

if (assignment.Context.Type == ContextType.Cpu)
{
for (var i = 0L; i < aRows; ++i)
{
for (var j = 0L; j < bCols; ++j)
{
var acc = zero;
for (var k = 0L; k < aCols; ++k)
{
acc = add(acc, mul(readA(i, k), readB(k, j)));
}
writeC(i, j, acc);
}
}
return true;
}

return false;
}

Which returns false, because:

Alea.cuBLAS.Blas.IsAvailable = false

when it should be true.
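
A quick way to catch this early is to check the flag yourself before building the model, so the failure is explicit instead of surfacing later as a NotImplementedException:

// Sketch: verify cuBLAS is loadable before any GPU work. If this prints
// False, check the <cuBLAS version="..."/> entry in App.config against
// the CUDA toolkit actually installed.
Console.WriteLine($"cuBLAS available: {Alea.cuBLAS.Blas.IsAvailable}");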

Crazy! I hope this helps others!
