OpenCL slow CPU performance

Question

I have a huge number of calculations that need to be done, and I am using C#. When running them in native C# (which is normally slower than OpenCL) I get about 1.25 million rounds per second, but when using the OpenCL kernel the rate drops to about 885 thousand rounds per second. Here is the kernel code:

/*
 * clac - scans a nonce range for a header whose double-processed digest ends
 * in a run of zero bytes.
 *
 * header : 80 bytes of input; bytes 76..79 are overwritten (in a private copy)
 *          with the current nonce, least-significant byte first.
 * toRet  : bytes 0..3  = first nonce to try (little-endian, read only here)
 *          bytes 4..7  = last nonce to try  (little-endian, read only here)
 *          bytes 8..11 = receives the matching nonce (little-endian); left
 *                        untouched when no nonce in the range matches.
 *
 * The digest is produced by the ctx_Init / ctx_Update1 / ctx_Final1 helpers,
 * which are defined outside this block (presumably a hash - confirm there).
 * No work-item ID is read, so every work-item that runs this kernel walks the
 * same nonce range.
 */
__kernel void clac(__global uint8_t *header,__global uint8_t *toRet)
{
    uint8_t hdr[80];
    uint8_t digest[32] = {0};

    /* Decode the inclusive search bounds from the first eight bytes of toRet. */
    const uint firstNonce = (uint)toRet[0]
                          | ((uint)toRet[1] << 8)
                          | ((uint)toRet[2] << 16)
                          | ((uint)toRet[3] << 24);
    const uint lastNonce  = (uint)toRet[4]
                          | ((uint)toRet[5] << 8)
                          | ((uint)toRet[6] << 16)
                          | ((uint)toRet[7] << 24);

    /* Work on a private copy so the nonce bytes can be patched cheaply. */
    for (int i = 0; i < 80; ++i)
    {
        hdr[i] = header[i];
    }

    uint nonce = firstNonce;
    for (;;)
    {
        /* Patch the candidate nonce into the tail of the header. */
        hdr[76] = (uint8_t)(nonce);
        hdr[77] = (uint8_t)(nonce >> 8);
        hdr[78] = (uint8_t)(nonce >> 16);
        hdr[79] = (uint8_t)(nonce >> 24);

        /* First pass: 80-byte header -> 32-byte digest. */
        ctx firstPass;
        ctx_Init(&firstPass);
        ctx_Update1(&firstPass, hdr, 80);
        ctx_Final1(&firstPass, digest);

        /* Second pass: digest -> digest (written back in place). */
        ctx secondPass;
        ctx_Init(&secondPass);
        ctx_Update1(&secondPass, digest, 32);
        ctx_Final1(&secondPass, digest);

        /* Count zero bytes among the last ten digest bytes (indices 22..31). */
        uint8_t zeroBytes = 0;
        for (int i = 22; i <= 31; ++i)
        {
            if (digest[i] < 1)
            {
                ++zeroBytes;
            }
        }

        /* At least nine of those ten bytes are zero: report the nonce and stop. */
        if (zeroBytes > 8)
        {
            toRet[8]  = (uint8_t)(nonce);
            toRet[9]  = (uint8_t)(nonce >> 8);
            toRet[10] = (uint8_t)(nonce >> 16);
            toRet[11] = (uint8_t)(nonce >> 24);
            return;
        }

        ++nonce;

        /* Stop past the upper bound, or when the 32-bit counter wrapped around. */
        if (nonce > lastNonce || nonce <= firstNonce)
        {
            return;
        }
    }
}

The CPU is a Xeon E5-1607.
The OpenCL library is Cloo.
The OS is Windows 10 x64.

CPU average benchmark results:

Integer Math             14,413 MOps/Sec
Floating Point Math      10,940 MOps/Sec
Find Prime Numbers       40 Million Primes/Sec
Random String Sorting    10 Thousand Strings/Sec
Data Encryption          1,168 MBytes/Sec
Data Compression         67.5 MBytes/Sec
Physics                  497 Frames/Sec
Extended Instructions    5,726 Million Matrices/Sec
Single Thread            1,814 MOps/Sec

This means it should be able to do about 20-30 million rounds per second.

Any suggestions would be appreciated.

And here is the C# code, as requested:

  // Host-side driver: builds the OpenCL program, uploads the header and the
  // nonce bounds, runs the "clac" kernel once and reads the result bytes back.
  // 512 bytes of unmanaged memory, passed below as the user-data pointer.
  var userDataPtr = System.Runtime.InteropServices.Marshal.AllocCoTaskMem(512);
// First device of the first platform reported by the installed OpenCL runtime.
// NOTE(review): whether this is a CPU or GPU device depends on the machine - confirm.
device = ComputePlatform.Platforms[0].QueryDevices()[0];
    
// Context and command queue for that single device (null is passed as the notify argument).
var context = new ComputeContext(new[] { device }, new ComputeContextPropertyList(device.Platform), null, userDataPtr);
var queue = new ComputeCommandQueue(context, device, ComputeCommandQueueFlags.None);
    
// 16-byte exchange buffer: bytes 0..3 = startNon, bytes 4..7 = maxNon (filled
// below); the kernel writes the matching nonce into bytes 8..11.
var finalResult = new byte[16];
    
Buffer.BlockCopy(BitConverter.GetBytes(startNon), 0, finalResult, 0, 4);
Buffer.BlockCopy(BitConverter.GetBytes(maxNon), 0, finalResult, 4, 4);
// Device buffers: clBuffer0 holds the header (read-only for the kernel),
// clBuffer2 holds the exchange buffer (read-write).
var clBuffer0 = new ComputeBuffer<byte>(context, ComputeMemoryFlags.ReadOnly, myHeader.Length);
var clBuffer2 = new ComputeBuffer<byte>(context, ComputeMemoryFlags.ReadWrite, finalResult.Length);

// Upload both host arrays; the 'true' argument presumably requests a blocking write - confirm in Cloo docs.
queue.WriteToBuffer(myHeader, clBuffer0, true, null);
queue.WriteToBuffer(finalResult, clBuffer2, true, null);
// NOTE(review): 'len' is not used anywhere in the visible part of this snippet.
int len = myHeader.Length;
// Compile the kernel source; on a build failure the build log is logged and the method returns.
var prog = new ComputeProgram(context, kernelcode);
try
  {
    prog.Build(new[] { device }, " -D WORKSIZE=1", null, userDataPtr);
  }
catch (ComputeException)
  {
    AddLog(prog.GetBuildLog(device));
    return;
  }
// Bind the two buffers to the kernel's two arguments (header, toRet).
var kernel = prog.CreateKernel("clac");
kernel.SetMemoryArgument(0, clBuffer0);
kernel.SetMemoryArgument(1, clBuffer2);
queue.Finish();
// Launch with the two work-size arrays both set to { 1 }, presumably the global and
// local work sizes - confirm in Cloo docs.
// NOTE(review): if so, this is a single work-item, so the whole nonce range is scanned
// serially by one device thread - the likely reason throughput is below the C# version.
queue.Execute(kernel, null, new long[] { 1 }, new long[] { 1 }, null);
// Read the exchange buffer back (the 'true' argument presumably makes the read blocking - confirm).
var outp = new byte[finalResult.Length];
queue.ReadFromBuffer(clBuffer2, ref outp, true, null);
// Release the kernel and the header buffer.
kernel.Dispose();
kernel = null;

clBuffer0.Dispose();
clBuffer0 = null;
//rest of codes to dispose kernel and opencl variables then handle result

Please share your equivalent C# code. – Björn Lindqvist, Aug 22 '22 at 16:59

OpenCL slow CPU performance

0 Answers