I have a huge number of calculations that need to be done, and I am using C#. With plain C# (which is normally slower than OpenCL) I get about 1.25 million rounds per second, but with the OpenCL kernel this drops to about 885 thousand. Here is the kernel code:
/*
 * clac - sequential nonce search.
 *
 * header : 80 input bytes; bytes 76..79 are replaced by the candidate nonce
 *          (least-significant byte first) before each hashing round.
 * toRet  : bytes 0..3  = first nonce to try (little-endian),
 *          bytes 4..7  = last nonce to try  (little-endian),
 *          bytes 8..11 = receives the matching nonce (little-endian);
 *                        left untouched when no candidate matches.
 *
 * Each round runs the ctx_* routines over the 80-byte block, then runs them
 * again over the resulting 32-byte digest. A candidate matches when at least
 * nine of digest bytes 22..31 are zero.
 *
 * The kernel does not consult any work-item id: every work-item that runs it
 * walks the same nonce range.
 */
__kernel void clac(__global uint8_t *header,__global uint8_t *toRet)
{
    uint8_t block[80];
    uint8_t digest[32] = {0};

    /* Decode the inclusive search range from the first eight bytes of toRet. */
    const uint firstNonce = (uint)toRet[0]
                          | ((uint)toRet[1] << 8)
                          | ((uint)toRet[2] << 16)
                          | ((uint)toRet[3] << 24);
    const uint lastNonce  = (uint)toRet[4]
                          | ((uint)toRet[5] << 8)
                          | ((uint)toRet[6] << 16)
                          | ((uint)toRet[7] << 24);

    /* Work on a private copy of the header so only the nonce bytes change. */
    for (int i = 0; i < 80; i++)
    {
        block[i] = header[i];
    }

    uint nonce = firstNonce;
    for (int b = 0; b < 4; b++)
    {
        block[76 + b] = (uint8_t)(nonce >> (8 * b));
    }

    for (;;)
    {
        /* First pass over the 80-byte block. */
        ctx first;
        ctx_Init(&first);
        ctx_Update1(&first, block, 80);
        ctx_Final1(&first, digest);

        /* Second pass over the 32-byte digest of the first pass. */
        ctx second;
        ctx_Init(&second);
        ctx_Update1(&second, digest, 32);
        ctx_Final1(&second, digest);

        /* Count zero bytes among the last ten bytes of the digest. */
        int zeroBytes = 0;
        for (int i = 22; i <= 31; i++)
        {
            if (digest[i] == 0)
            {
                zeroBytes++;
            }
        }

        if (zeroBytes >= 9)
        {
            /* Match: report the nonce to the host and stop searching. */
            for (int b = 0; b < 4; b++)
            {
                toRet[8 + b] = (uint8_t)(nonce >> (8 * b));
            }
            break;
        }

        /* No match: advance to the next candidate. */
        nonce++;
        for (int b = 0; b < 4; b++)
        {
            block[76 + b] = (uint8_t)(nonce >> (8 * b));
        }

        /* Stop past the end of the range, or when the 32-bit counter wrapped. */
        if (nonce > lastNonce || nonce <= firstNonce)
        {
            break;
        }
    }
}
- CPU is Xeon E5 1607
- the opencl library is Cloo
- Windows 10 x64
CPU average benchmark results:
Integer Math 14,413 MOps/Sec
Floating Point Math 10,940 MOps/Sec
Find Prime Numbers 40 Million Primes/Sec
Random String Sorting 10 Thousand Strings/Sec
Data Encryption 1,168 MBytes/Sec
Data Compression 67.5 MBytes/Sec
Physics 497 Frames/Sec
Extended Instructions 5,726 Million Matrices/Sec
Single Thread 1,814 MOps/Sec
Given these numbers, it should manage about 20-30 million rounds per second.
Any suggestions would be appreciated.
And here is the C# code, as requested:
// Host side of the "clac" kernel: creates the OpenCL context and queue, uploads the
// header and the nonce range, builds and launches the kernel, then reads the result back.

// Unmanaged block passed to Cloo as the user-data pointer for context creation and program build.
var userDataPtr = System.Runtime.InteropServices.Marshal.AllocCoTaskMem(512);
device = ComputePlatform.Platforms[0].QueryDevices()[0];
var context = new ComputeContext(new[] { device }, new ComputeContextPropertyList(device.Platform), null, userDataPtr);
var queue = new ComputeCommandQueue(context, device, ComputeCommandQueueFlags.None);

// Result/parameter buffer layout: bytes 0..3 = startNon, bytes 4..7 = maxNon;
// the kernel writes a matching nonce into bytes 8..11.
var finalResult = new byte[16];
Buffer.BlockCopy(BitConverter.GetBytes(startNon), 0, finalResult, 0, 4);
Buffer.BlockCopy(BitConverter.GetBytes(maxNon), 0, finalResult, 4, 4);

var clBuffer0 = new ComputeBuffer<byte>(context, ComputeMemoryFlags.ReadOnly, myHeader.Length);
var clBuffer2 = new ComputeBuffer<byte>(context, ComputeMemoryFlags.ReadWrite, finalResult.Length);
// Both uploads are blocking, so the data is on the device before the kernel is enqueued.
queue.WriteToBuffer(myHeader, clBuffer0, true, null);
queue.WriteToBuffer(finalResult, clBuffer2, true, null);
int len = myHeader.Length;

var prog = new ComputeProgram(context, kernelcode);
try
{
    prog.Build(new[] { device }, " -D WORKSIZE=1", null, userDataPtr);
}
catch (ComputeException)
{
    AddLog(prog.GetBuildLog(device));

    // The cleanup further down is never reached on a failed build, so release
    // everything created so far here; otherwise each failed build leaks the
    // program, both buffers, the queue, the context and the unmanaged block.
    prog.Dispose();
    clBuffer2.Dispose();
    clBuffer0.Dispose();
    queue.Dispose();
    context.Dispose();
    System.Runtime.InteropServices.Marshal.FreeCoTaskMem(userDataPtr);
    return;
}

var kernel = prog.CreateKernel("clac");
kernel.SetMemoryArgument(0, clBuffer0);
kernel.SetMemoryArgument(1, clBuffer2);
queue.Finish();

// Global and local work sizes are both { 1 }: the kernel is launched as a single
// work-item, which then walks the whole nonce range by itself.
queue.Execute(kernel, null, new long[] { 1 }, new long[] { 1 }, null);

// Blocking read: returns once the kernel has finished and the result has been copied back.
var outp = new byte[finalResult.Length];
queue.ReadFromBuffer(clBuffer2, ref outp, true, null);

kernel.Dispose();
kernel = null;
clBuffer0.Dispose();
clBuffer0 = null;
//rest of codes to dispose kernel and opencl variables then handle result