The code below performs modular multiplication of two 256-bit numbers modulo 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f (the secp256k1 field prime). It is first executed on my CPU.
#include <stdio.h>
// g++ testing.c -O3
int main()
{
__uint128_t n[2], m[2], m1[2], d[2], q[2], temp_1, temp[8], k1[2];
__uint64_t looper;
// m and m1 are the two 256-bit operands, each stored as two 128-bit halves
m1[1] = 0xffffffff1112efff; m1[1]<<=64; m1[1]+=0x0fffffffffffffff; m[1] = 0xffffffff1112efff; m[1]<<=64; m[1]+=0x0fffffffffffffff;
m1[0] = 0; m[0] = 0;
for(looper = 0; looper < 0xfffffff; looper++){
// split the operands into 64-bit limbs: m -> d (low half), n (high half); m1 -> k1 (low half), q (high half)
n[1] = m[1]>>64; n[0] = m[1]&0xffffffffffffffff; d[1] = m[0]>>64; d[0] = m[0]&0xffffffffffffffff; q[1] = m1[1]>>64; q[0] = m1[1]&0xffffffffffffffff; k1[1] = m1[0]>>64; k1[0] = m1[0]&0xffffffffffffffff;
// 4x4-limb schoolbook multiplication; partial products accumulate into temp[0..7]
temp_1 = d[0]*k1[0]; temp[0] = temp_1&0xffffffffffffffff; temp[1] = temp_1>>64; temp_1 = d[1]*k1[0]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] = temp_1>>64;
temp_1 = n[0]*k1[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] = temp_1>>64; temp_1 = n[1]*k1[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] = temp_1>>64;
temp_1 = d[0]*k1[1]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] += (temp_1>>64); temp_1 = d[1]*k1[1]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64);
temp_1 = n[0]*k1[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = n[1]*k1[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] = temp_1>>64;
temp_1 = d[0]*q[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64); temp_1 = d[1]*q[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64);
temp_1 = n[0]*q[0]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64); temp_1 = n[1]*q[0]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] = temp_1>>64;
temp_1 = d[0]*q[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = d[1]*q[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64);
temp_1 = n[0]*q[1]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] += (temp_1>>64); temp_1 = n[1]*q[1]; temp[6] += (temp_1&0xffffffffffffffff); temp[7] = temp_1>>64;
// propagate carries and mask each limb back down to 64 bits
temp[2] += (temp[1]>>64); temp[3] += (temp[2]>>64); temp[4] += (temp[3]>>64); temp[5] += (temp[4]>>64); temp[6] += (temp[5]>>64); temp[7] += (temp[6]>>64);
temp[1] &= 0xffffffffffffffff; temp[2] &= 0xffffffffffffffff; temp[3] &= 0xffffffffffffffff; temp[4] &= 0xffffffffffffffff; temp[5] &= 0xffffffffffffffff; temp[6] &= 0xffffffffffffffff;
// reduction: fold the upper four limbs back down using 2^256 ≡ 0x1000003d1 (mod p)
temp[4] *= 0x1000003d1; temp[5] *= 0x1000003d1; temp[5] += (temp[4]>>64); temp[4] &= 0xffffffffffffffff; temp[6] *= 0x1000003d1; temp[6] += (temp[5]>>64); temp[5] &= 0xffffffffffffffff;
temp[7] *= 0x1000003d1; temp[7] += (temp[6]>>64); temp[6] &= 0xffffffffffffffff; temp_1 = (temp[7]>>64)*0x1000003d1; temp[7] &= 0xffffffffffffffff;
temp[0] += temp[4] + temp_1; temp[1] += (temp[0]>>64); temp[0] &= 0xffffffffffffffff; temp[1] += temp[5]; temp[2] += (temp[1]>>64); temp[1] &= 0xffffffffffffffff;
temp[2] += temp[6]; temp[3] += (temp[2]>>64); temp[2] &= 0xffffffffffffffff; temp[3] += temp[7]; temp[0] += (temp[3]>>64)*0x1000003d1; temp[1] += (temp[0]>>64); temp[2] += (temp[1]>>64);
// reassemble the reduced 256-bit result into m
m[1] = (temp[3]<<64) + temp[2]; m[0] = (temp[1]<<64) + temp[0];
}
printf("%lx ", m[1]>>64);
printf("%lx ", m[1]);
printf("%lx ", m[0]>>64);
printf("%lx\n", m[0]);
return 0;
}
Note that the purpose of the looper variable is to execute this code many times to get a gauge of its performance. Its performance is as below.
g++ testing.c -O3
time ./a.out
d8523330f798a54 1c455152293ffb73 d49c580fc13a2ec0 c4600535d5dec485
real 0m5.630s
user 0m5.629s
sys 0m0.000s
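For reference, the folding by 0x1000003d1 in the reduction step above works because the modulus is p = 2^256 - 0x1000003d1 (that is, 2^256 - 2^32 - 977), so 2^256 ≡ 0x1000003d1 (mod p): any limb at position 4 or higher can be multiplied by 0x1000003d1 and added back into limbs 0-3. A minimal single-limb sketch of that identity, separate from the benchmark above:
#include <stdio.h>
// A value hi*2^256 + lo is congruent to hi*0x1000003d1 + lo (mod p),
// because 2^256 ≡ 0x1000003d1 (mod p). The result is kept in 128 bits
// since it can exceed 64 bits; any remaining overflow folds the same way.
static __uint128_t fold_high_limb(__uint64_t hi, __uint64_t lo)
{
    return (__uint128_t)hi*0x1000003d1 + lo;
}
int main()
{
    __uint128_t r = fold_high_limb(0xffffffffffffffff, 0xffffffffffffffff);
    printf("%lx %lx\n", (__uint64_t)(r>>64), (__uint64_t)r);
    return 0;
}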
Now, the same code is executed in a CUDA environment.
#include <stdio.h>
// nvcc temp_cuda_testing.cu -O3
__global__ void testing()
{
__uint128_t n[2], m[2], m1[2], d[2], q[2], temp_1, temp[8], k1[2];
__uint64_t looper;
m1[1] = 0xffffffff1112efff; m1[1]<<=64; m1[1]+=0x0fffffffffffffff; m[1] = 0xffffffff1112efff; m[1]<<=64; m[1]+=0x0fffffffffffffff;
m1[0] = 0; m[0] = 0;
for(looper = 0; looper < 0xfffffff; looper++){
n[1] = m[1]>>64; n[0] = m[1]&0xffffffffffffffff; d[1] = m[0]>>64; d[0] = m[0]&0xffffffffffffffff; q[1] = m1[1]>>64; q[0] = m1[1]&0xffffffffffffffff; k1[1] = m1[0]>>64; k1[0] = m1[0]&0xffffffffffffffff;
temp_1 = d[0]*k1[0]; temp[0] = temp_1&0xffffffffffffffff; temp[1] = temp_1>>64; temp_1 = d[1]*k1[0]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] = temp_1>>64;
temp_1 = n[0]*k1[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] = temp_1>>64; temp_1 = n[1]*k1[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] = temp_1>>64;
temp_1 = d[0]*k1[1]; temp[1] += (temp_1&0xffffffffffffffff); temp[2] += (temp_1>>64); temp_1 = d[1]*k1[1]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64);
temp_1 = n[0]*k1[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = n[1]*k1[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] = temp_1>>64;
temp_1 = d[0]*q[0]; temp[2] += (temp_1&0xffffffffffffffff); temp[3] += (temp_1>>64); temp_1 = d[1]*q[0]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64);
temp_1 = n[0]*q[0]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64); temp_1 = n[1]*q[0]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] = temp_1>>64;
temp_1 = d[0]*q[1]; temp[3] += (temp_1&0xffffffffffffffff); temp[4] += (temp_1>>64); temp_1 = d[1]*q[1]; temp[4] += (temp_1&0xffffffffffffffff); temp[5] += (temp_1>>64);
temp_1 = n[0]*q[1]; temp[5] += (temp_1&0xffffffffffffffff); temp[6] += (temp_1>>64); temp_1 = n[1]*q[1]; temp[6] += (temp_1&0xffffffffffffffff); temp[7] = temp_1>>64;
temp[2] += (temp[1]>>64); temp[3] += (temp[2]>>64); temp[4] += (temp[3]>>64); temp[5] += (temp[4]>>64); temp[6] += (temp[5]>>64); temp[7] += (temp[6]>>64);
temp[1] &= 0xffffffffffffffff; temp[2] &= 0xffffffffffffffff; temp[3] &= 0xffffffffffffffff; temp[4] &= 0xffffffffffffffff; temp[5] &= 0xffffffffffffffff; temp[6] &= 0xffffffffffffffff;
temp[4] *= 0x1000003d1; temp[5] *= 0x1000003d1; temp[5] += (temp[4]>>64); temp[4] &= 0xffffffffffffffff; temp[6] *= 0x1000003d1; temp[6] += (temp[5]>>64); temp[5] &= 0xffffffffffffffff;
temp[7] *= 0x1000003d1; temp[7] += (temp[6]>>64); temp[6] &= 0xffffffffffffffff; temp_1 = (temp[7]>>64)*0x1000003d1; temp[7] &= 0xffffffffffffffff;
temp[0] += temp[4] + temp_1; temp[1] += (temp[0]>>64); temp[0] &= 0xffffffffffffffff; temp[1] += temp[5]; temp[2] += (temp[1]>>64); temp[1] &= 0xffffffffffffffff;
temp[2] += temp[6]; temp[3] += (temp[2]>>64); temp[2] &= 0xffffffffffffffff; temp[3] += temp[7]; temp[0] += (temp[3]>>64)*0x1000003d1; temp[1] += (temp[0]>>64); temp[2] += (temp[1]>>64);
m[1] = (temp[3]<<64) + temp[2]; m[0] = (temp[1]<<64) + temp[0];
}
printf("%lx ", m[1]>>64);
printf("%lx ", m[1]);
printf("%lx ", m[0]>>64);
printf("%lx\n", m[0]);
}
int main()
{
int a=1, b=1;
// launch one block with one thread, to compare a single GPU thread against a single CPU thread
testing<<<a,b>>>();
cudaDeviceSynchronize();    // wait for the kernel (and its printf) to complete
return 0;
}
The performance of this code is as below.
nvcc temp_cuda_testing.cu -O3
time ./a.out
1c45b976433d1870 362f4270e1932374 3b8eff154c95d0 f1c66d89b4457958
real 1m1.902s
user 1m1.734s
sys 0m0.160s
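(For what it's worth, the numbers above come from wall-clock time, which also includes process startup and CUDA context creation. A minimal sketch of how the kernel alone could be timed with CUDA events, by replacing main() in the .cu file above:)
int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    testing<<<1,1>>>();                 // same single-thread launch as before
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);         // wait for the kernel and the stop event
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel time: %f ms\n", ms);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}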
Never mind the different outputs; that is because printf behaves differently here, which I'll take care of later.
I expected the GPU's performance to be lower than the CPU's: CPUs are far better at handling sequential tasks than GPUs, and I know the GPU's speed can surpass the CPU's only when multiple blocks and threads are used.
My questions are as follows.
1 - There is a speed difference of about 12x here. Is this normal and expected? (Again, I do know that using multiple threads can overcome this speed difference, but I am asking whether the gap is normal for one CPU thread vs one GPU thread.) I have not used any branching here for the simple reason that GPUs are slow when branches such as if are used.
2 - The code above is just a basic framework. I have a project (which I am not allowed to post here, sorry) on point addition on the secp256k1 curve that uses the above code heavily. The speed difference there, when I run it on the CPU and on the GPU, is 32x! Each thread requires about 0.6 KB of memory for its variables. I also have a multiplicative inverse function with two nested ifs inside two nested while loops (roughly the shape sketched below), and perf analysis revealed that 98% of the total time is spent executing this function. Why is the speed difference so much larger when the same code is run on the GPU? What might be the reason? Is the nested structure of the code the culprit? Is this information not enough? What other information is needed to make a judgement from your end?
The code size is 20KB. Will that cause slowness?
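To give a sense of the control flow in question 2 (I cannot post the real function), the inverse follows the usual binary extended-GCD pattern. A simplified 64-bit sketch of that shape, not my actual code:
#include <stdio.h>
#include <stdint.h>
// Simplified sketch of the control-flow shape only (NOT the real 256-bit code):
// binary extended GCD computing a^-1 mod p on 64-bit values, assuming p is odd
// and gcd(a, p) = 1. Note the two nested ifs inside two nested while loops;
// every branch and every loop trip count depends on the data.
static uint64_t inverse_mod(uint64_t a, uint64_t p)
{
    uint64_t u = a, v = p, x1 = 1, x2 = 0;
    while (u != 1 && v != 1) {
        while ((u & 1) == 0) {                      // strip factors of two from u
            u >>= 1;
            if ((x1 & 1) == 0) x1 >>= 1;
            else               x1 = x1/2 + p/2 + 1; // (x1 + p)/2 without overflow
        }
        while ((v & 1) == 0) {                      // strip factors of two from v
            v >>= 1;
            if ((x2 & 1) == 0) x2 >>= 1;
            else               x2 = x2/2 + p/2 + 1;
        }
        if (u >= v) { u -= v; x1 = (x1 >= x2) ? x1 - x2 : x1 + (p - x2); }
        else        { v -= u; x2 = (x2 >= x1) ? x2 - x1 : x2 + (p - x1); }
    }
    return (u == 1) ? x1 : x2;
}
int main()
{
    uint64_t p = 0xffffffffffffffc5;                // 2^64 - 59, a prime, just for the demo
    uint64_t a = 123456789;
    printf("%lx\n", inverse_mod(a, p));             // prints a^-1 mod p
    return 0;
}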
3 - The execution time increases significantly when I change <<<28,128>>> to <<<28,256>>>, and again to <<<56,256>>> and so on; it stayed the same up to <<<28,128>>>. What might be the reason for this? I need to achieve maximum parallelism without increasing the execution time, otherwise it is no better than running sequentially. What factors are causing this?
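For question 3, here is a minimal sketch of how the occupancy API could be used to see what the hardware can actually keep resident for a kernel; it assumes it is dropped into the .cu file above in place of main(), with testing being the no-argument kernel from earlier.
int main()
{
    int minGridSize = 0, blockSize = 0;
    // Block size that maximizes occupancy for this kernel, given its register usage.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, testing, 0, 0);
    printf("suggested block size = %d, minimum grid size = %d\n", blockSize, minGridSize);
    int blocksPerSM = 0;
    // How many 256-thread blocks can be resident per SM for this kernel.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, testing, 256, 0);
    printf("resident 256-thread blocks per SM = %d\n", blocksPerSM);
    return 0;
}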
My hardware specs
CPU Processor - Intel® Core™ i7-9700K CPU @ 3.60GHz × 8
GPU Processor - NVIDIA Corporation TU117 [GeForce GTX 1650]