The code below performs modular multiplication of two 256-bit numbers modulo 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f (the secp256k1 field prime). It is first executed on my CPU.

#include <stdio.h>

// g++ testing.c -O3


int main()
{

    __uint128_t n[2], m[2], m1[2], d[2], q[2], temp_1, temp[8], k1[2];
    __uint64_t looper;
    // Note: only the upper halves m[1] and m1[1] are initialized here; m[0] and
    // m1[0] are read inside the loop below without ever being written.
    m1[1] = 0xffffffff1112efff; m1[1]<<=64; m1[1]+=0x0fffffffffffffff; m[1] = 0xffffffff1112efff; m[1]<<=64; m[1]+=0x0fffffffffffffff;


    for(looper = 0; looper < 0xfffffff; looper++){

    // Split each 256-bit operand into four 64-bit limbs (each kept in a __uint128_t):
    // m is (n[1], n[0], d[1], d[0]) from most to least significant, m1 is (q[1], q[0], k1[1], k1[0]).
    n[1] = m[1]>>64; n[0] = m[1]&0xffffffffffffffff; d[1] = m[0]>>64; d[0] = m[0]&0xffffffffffffffff; q[1] = m1[1]>>64; q[0] = m1[1]&0xffffffffffffffff; k1[1] = m1[0]>>64; k1[0] = m1[0]&0xffffffffffffffff; 

// Schoolbook 4x4 limb multiplication: temp[0..7] accumulates the 512-bit product,
// with carries collecting in the upper halves of the __uint128_t entries.
temp_1 = d[0]*k1[0]; temp[0] = temp_1&0xffffffffffffffff; temp[1] = temp_1>>64; temp_1 = d[1]*k1[0]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] = temp_1>>64;
temp_1 = n[0]*k1[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] = temp_1>>64; temp_1 = n[1]*k1[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] = temp_1>>64; 

temp_1 = d[0]*k1[1]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] += (temp_1>>64); temp_1 = d[1]*k1[1]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64);
temp_1 = n[0]*k1[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = n[1]*k1[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] = temp_1>>64; 

temp_1 = d[0]*q[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64); temp_1 = d[1]*q[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64);
temp_1 = n[0]*q[0]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64); temp_1 = n[1]*q[0]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] = temp_1>>64; 

temp_1 = d[0]*q[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = d[1]*q[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64);
temp_1 = n[0]*q[1]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] += (temp_1>>64); temp_1 = n[1]*q[1]; temp[6] += (temp_1&0xffffffffffffffff); temp[7] = temp_1>>64; 


// Propagate the accumulated carries through the 512-bit product.
temp[2] += (temp[1]>>64); temp[3] += (temp[2]>>64); temp[4] += (temp[3]>>64); temp[5] += (temp[4]>>64); temp[6] += (temp[5]>>64); temp[7] += (temp[6]>>64);
temp[1] &= 0xffffffffffffffff; temp[2] &= 0xffffffffffffffff; temp[3] &= 0xffffffffffffffff; temp[4] &= 0xffffffffffffffff; temp[5] &= 0xffffffffffffffff; temp[6] &= 0xffffffffffffffff;
// Reduce modulo p: 2^256 mod p = 0x1000003d1, so multiply the upper four limbs
// (temp[4..7]) by that constant and fold them back into the lower limbs.
temp[4] *= 0x1000003d1; temp[5] *= 0x1000003d1; temp[5] += (temp[4]>>64); temp[4] &= 0xffffffffffffffff; temp[6] *= 0x1000003d1; temp[6] += (temp[5]>>64); temp[5] &= 0xffffffffffffffff;
temp[7] *= 0x1000003d1; temp[7] += (temp[6]>>64); temp[6] &= 0xffffffffffffffff; temp_1 = (temp[7]>>64)*0x1000003d1; temp[7] &= 0xffffffffffffffff;
temp[0] += temp[4] + temp_1; temp[1] += (temp[0]>>64); temp[0] &= 0xffffffffffffffff; temp[1] += temp[5]; temp[2] += (temp[1]>>64); temp[1] &= 0xffffffffffffffff;
temp[2] += temp[6]; temp[3] += (temp[2]>>64); temp[2] &= 0xffffffffffffffff; temp[3] += temp[7]; temp[0] += (temp[3]>>64)*0x1000003d1; temp[1] += (temp[0]>>64); temp[2] += (temp[1]>>64);
m[1] = (temp[3]<<64) + temp[2]; m[0] = (temp[1]<<64) + temp[0]; 
}

printf("%lx ", m[1]>>64);
    printf("%lx ", m[1]);
    printf("%lx ", m[0]>>64);
    printf("%lx\n", m[0]);



    return 0;
}
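
As an aside on where the constant 0x1000003d1 in the reduction step comes from: the modulus is p = 2^256 - 0x1000003d1, so 2^256 mod p = 0x1000003d1, and the upper 256 bits of the 512-bit product can be folded back in by multiplying them with that constant. A minimal standalone check of this identity (this little program is mine and is separate from the benchmark):

#include <stdio.h>

int main()
{
    // Lowest 64-bit limb of the modulus used above; the upper three limbs are all ones.
    __uint64_t p_low = 0xfffffffefffffc2f;
    // Since p = 2^256 - 0x1000003d1, the lowest limb is 2^64 - 0x1000003d1,
    // so adding 0x1000003d1 must wrap around to exactly 0.
    printf("%lx\n", (__uint64_t)(p_low + 0x1000003d1));   // prints 0
    return 0;
}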

Note that the purpose of the looper variable is to execute the multiplication many times to get a gauge of its performance.
The measured performance is below.

g++ testing.c -O3
time ./a.out
d8523330f798a54 1c455152293ffb73 d49c580fc13a2ec0 c4600535d5dec485

real    0m5.630s
user    0m5.629s
sys 0m0.000s

Now the same code is executed in a CUDA environment, as a single-thread kernel.

#include <stdio.h>
    
// nvcc temp_cuda_testing.cu -O3



__global__ void testing() 
{
    __uint128_t n[2], m[2], m1[2], d[2], q[2], temp_1, temp[8], k1[2];
    __uint64_t looper;
    m1[1] = 0xffffffff1112efff; m1[1]<<=64; m1[1]+=0x0fffffffffffffff; m[1] = 0xffffffff1112efff; m[1]<<=64; m[1]+=0x0fffffffffffffff;


    for(looper = 0; looper < 0xfffffff; looper++){

    // The body below is identical to the CPU version above (limb split, 4x4 multiply, reduce).
    n[1] = m[1]>>64; n[0] = m[1]&0xffffffffffffffff; d[1] = m[0]>>64; d[0] = m[0]&0xffffffffffffffff; q[1] = m1[1]>>64; q[0] = m1[1]&0xffffffffffffffff; k1[1] = m1[0]>>64; k1[0] = m1[0]&0xffffffffffffffff; 

temp_1 = d[0]*k1[0]; temp[0] = temp_1&0xffffffffffffffff; temp[1] = temp_1>>64; temp_1 = d[1]*k1[0]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] = temp_1>>64;
temp_1 = n[0]*k1[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] = temp_1>>64; temp_1 = n[1]*k1[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] = temp_1>>64; 

temp_1 = d[0]*k1[1]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] += (temp_1>>64); temp_1 = d[1]*k1[1]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64);
temp_1 = n[0]*k1[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = n[1]*k1[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] = temp_1>>64; 

temp_1 = d[0]*q[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64); temp_1 = d[1]*q[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64);
temp_1 = n[0]*q[0]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64); temp_1 = n[1]*q[0]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] = temp_1>>64; 

temp_1 = d[0]*q[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = d[1]*q[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64);
temp_1 = n[0]*q[1]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] += (temp_1>>64); temp_1 = n[1]*q[1]; temp[6] += (temp_1&0xffffffffffffffff); temp[7] = temp_1>>64; 


temp[2] += (temp[1]>>64); temp[3] += (temp[2]>>64); temp[4] += (temp[3]>>64); temp[5] += (temp[4]>>64); temp[6] += (temp[5]>>64); temp[7] += (temp[6]>>64);
temp[1] &= 0xffffffffffffffff; temp[2] &= 0xffffffffffffffff; temp[3] &= 0xffffffffffffffff; temp[4] &= 0xffffffffffffffff; temp[5] &= 0xffffffffffffffff; temp[6] &= 0xffffffffffffffff;
temp[4] *= 0x1000003d1; temp[5] *= 0x1000003d1; temp[5] += (temp[4]>>64); temp[4] &= 0xffffffffffffffff; temp[6] *= 0x1000003d1; temp[6] += (temp[5]>>64); temp[5] &= 0xffffffffffffffff;
temp[7] *= 0x1000003d1; temp[7] += (temp[6]>>64); temp[6] &= 0xffffffffffffffff; temp_1 = (temp[7]>>64)*0x1000003d1; temp[7] &= 0xffffffffffffffff;
temp[0] += temp[4] + temp_1; temp[1] += (temp[0]>>64); temp[0] &= 0xffffffffffffffff; temp[1] += temp[5]; temp[2] += (temp[1]>>64); temp[1] &= 0xffffffffffffffff;
temp[2] += temp[6]; temp[3] += (temp[2]>>64); temp[2] &= 0xffffffffffffffff; temp[3] += temp[7]; temp[0] += (temp[3]>>64)*0x1000003d1; temp[1] += (temp[0]>>64); temp[2] += (temp[1]>>64);
m[1] = (temp[3]<<64) + temp[2]; m[0] = (temp[1]<<64) + temp[0]; 
}

printf("%lx ", m[1]>>64);
    printf("%lx ", m[1]);
    printf("%lx ", m[0]>>64);
    printf("%lx\n", m[0]);
}

int main() 
{
    int a=1, b=1;
    testing<<<a,b>>>();   // launch a single block with a single thread
    cudaDeviceSynchronize();
    
    return 0;
}

The performance of this code is below.

nvcc temp_cuda_testing.cu -O3
time ./a.out
1c45b976433d1870 362f4270e1932374 3b8eff154c95d0 f1c66d89b4457958

real    1m1.902s
user    1m1.734s
sys 0m0.160s

Never mind the different outputs: m[0] and m1[0] are never initialized before the loop, so the numeric result is not well defined anyway; I'll take care of that later.
I expected the GPU's performance to be lower than the CPU's. CPUs are far better at handling sequential tasks than GPUs, and I know the GPU can only surpass the CPU when multiple blocks and threads are used.
My questions are as follows.
1 - There is a speed difference of about 12x here. Is this normal and expected? (Again, I do know that using multiple threads can overcome this difference; I'm asking whether the gap is normal for one CPU thread vs. one GPU thread.) I have not used any branching here, simply because GPUs slow down when branches such as if statements are used.
2 - The code above is just a basic framework. I have a project (which I am not allowed to post here, sorry) that does point addition on the secp256k1 curve and uses the code above heavily. There the speed difference between CPU and GPU is 32x! Each thread needs about 0.6 KB of memory for its variables. I also have a multiplicative-inverse function containing two nested ifs inside two nested while loops, and perf analysis showed that 98% of the total time is spent in that function. Why is the CPU-vs-GPU gap so much larger for that project than for the code above? What might be the reason? Is the nested structure of the code the culprit? Is this information not enough, and if not, what else is needed to make a judgement? The code size is 20 KB; will that cause slowness?
3 - The execution time increases significantly when I change <<<28,128>>> to <<<28,256>>> and then to <<<56,256>>> and so on; it remained the same up to <<<28,128>>>. What might be the reason for this? I have a requirement to achieve maximum parallelism without the execution time growing, otherwise it is no better than running sequentially. What factors are causing this? (A rough sketch of the kind of launch I mean follows below.)
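
For reference, this is roughly how I picture exposing the parallelism discussed above: each thread runs its own independent chain of iterations and writes its result to global memory. This is only a sketch; the kernel name, the simplified 128-bit loop body (a stand-in for the real multiply-and-reduce), the iteration count and the output buffer are placeholders of mine, not my actual project code.

#include <stdio.h>

// nvcc parallel_sketch.cu -O3   (hypothetical file)

// Each thread works on its own independent chain, so the total work scales with
// the launch configuration. Writing the result to global memory keeps the
// compiler from optimizing the loop away.
__global__ void chains(unsigned long long *out, unsigned long long iters)
{
    unsigned long long tid = (unsigned long long)blockIdx.x*blockDim.x + threadIdx.x;
    __uint128_t m = ((__uint128_t)0xffffffff1112efff<<64) + (0x0fffffffffffffff ^ tid);

    for(unsigned long long i = 0; i < iters; i++){
        m = m*m + tid;                                                 // stand-in for the real field multiplication
        m += (__uint128_t)(unsigned long long)(m>>64) * 0x1000003d1;   // partial fold, same idea as the reduction above
    }
    out[tid] = (unsigned long long)m;   // keep the result alive
}

int main()
{
    int blocks = 28, threads = 128;     // vary these to study question 3
    unsigned long long *out, result = 0;

    cudaMalloc(&out, sizeof(unsigned long long)*blocks*threads);
    chains<<<blocks,threads>>>(out, 0xfffffff);
    cudaDeviceSynchronize();
    cudaMemcpy(&result, out, sizeof(result), cudaMemcpyDeviceToHost);
    printf("%llx\n", result);
    cudaFree(out);
    return 0;
}
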
My hardware specs:
CPU Processor - Intel® Core™ i7-9700K CPU @ 3.60GHz × 8
GPU Processor - NVIDIA Corporation TU117 [GeForce GTX 1650]

    You have written code which exposes no parallelism. It is completely serial. When you run more threads, you are just running more instances of the same serial code doing the same thing. To expect that anything in your question should result in improvement in performance is, frankly, nonsensical – talonmies Aug 08 '23 at 00:16
  • @talonmies this is just an example to compare speeds. I have already said twice in the question that I know I can divide the total iterations across threads and get a net speed greater than the CPU's – Knm Aug 08 '23 at 00:39
  • @talonmies I'm asking the question for one CPU thread vs. one GPU thread. Is the speed difference that drastic? – Knm Aug 08 '23 at 00:40
  • @talonmies and also, please check out questions 1, 2 and 3. Any insight is appreciated – Knm Aug 08 '23 at 00:41
  • GPUs are optimized for large streams of embarrassingly parallel floating point operations, in single or half precision. You have shown a serial code using 128 bit integers and are wondering why it is slow.... – talonmies Aug 08 '23 at 01:33
  • Further to that, the kernel code in your question doesn't modify global or shared memory, meaning that the compiler can and will optimize all the code away, leaving you with an empty stub which will run in a few microseconds -- https://godbolt.org/z/xE8T6ae79 . So either the code you have posted isn't the code you are benchmarking **or** you are compiling your code for debugging, meaning that all compiler optimizations are disabled, and all registers are spilled to local memory https://godbolt.org/z/cafaP9sbr . Either way, asking about performance with the information you posted makes no sense – talonmies Aug 08 '23 at 02:36
  • @talonmies And I'll say it again: I am not wondering why it is slow. I know that it is slower than the CPU. One of my questions was: since it is showing as 12x slower, is that expected? Because there is a huge difference between 2x slower, 4x slower, 8x slower and so on – Knm Aug 08 '23 at 09:47
  • @talonmies my code has the print statement, which makes sure the code does run instead of being reduced to an empty stub – Knm Aug 08 '23 at 09:49
  • @Homer512 yeah it does.. Thank you – Knm Aug 08 '23 at 09:56
  • Whether it's 12 times slower or 24 times doesn't really matter. Your GPU is designed to run 14*1024 = 14336 parallel, independent computations at the same time (number of SM times max. resident threads). The moment you start using it, all the latency and individually slow performance will be hidden by massively parallel execution. If it's then still slower than the CPU doing a similar number of computations, we can look at the details – Homer512 Aug 08 '23 at 09:57
  • As for question 2: That sounds like you make use of local memory that cannot be held in registers. As the [best practices guide writes](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#local-memory), this has long latencies and performs similarly to global memory. A CPU might have a relative advantage here with is stronger L1+L2 cache. But this needs benchmarking; the GPU may still come out ahead overall. Again, this depends on many threads being in-flight to hide the latency. – Homer512 Aug 08 '23 at 10:07
  • @Homer512 yeah, that does make sense about the local memory. I made sure each block uses < 65535 registers, which is the limit for each block. Each SM is loaded with only one block since the limit of total registers per SM is also 65535. The only thing I can guess is that my program uses __uint128_t, which is not standard since my GPU uses 64-bit processors; it is a feature of the gcc compiler. So I imagine the compiler is putting those values in local memory – Knm Aug 08 '23 at 22:24
  • @Homer512 compiling with --ptxas-options=-v gave me: Used 128 registers, 328 bytes cmem[0], 88 bytes cmem[2]. Does this mean that I am using registers instead of local memory? The total size of the variables in my project for each thread is around 108*32 bits, which means 108 registers; 65535/108 ~= 606 threads per block. That is why I'm running only 512 threads per block. But still the time doubles from <<<1,1>>> to <<<14,512>>>, and I don't know why, even though my calculations show I'm well within the limits. (A sketch of how the resident-block count could be queried directly is below these comments.) – Knm Aug 08 '23 at 22:38
  • Try running it with Nvidia's visual profiler ([Nsight Systems](https://developer.nvidia.com/nsight-systems)). Should be part of the toolkit. It's very good at identifying bottlenecks – Homer512 Aug 09 '23 at 06:49
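
Following up on the register arithmetic in the comments above: instead of dividing the per-SM register limit by the per-thread register count by hand, the occupancy API can report how many blocks of a given size are actually resident per SM for a specific kernel. A minimal sketch, assuming it replaces main() in the CUDA file above so that it can reference the testing kernel (the block sizes tried here are just examples):

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("SMs: %d, max resident threads per SM: %d, registers per SM: %d\n",
           prop.multiProcessorCount, prop.maxThreadsPerMultiProcessor,
           prop.regsPerMultiprocessor);

    // Ask the runtime how many blocks of each size can be resident on one SM for
    // the testing kernel, given its actual register and local-memory usage.
    for(int blockSize = 128; blockSize <= 512; blockSize *= 2){
        int blocksPerSM = 0;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, testing, blockSize, 0);
        printf("block size %d: %d resident block(s) per SM (%d threads per SM)\n",
               blockSize, blocksPerSM, blocksPerSM*blockSize);
    }
    return 0;
}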
