To practice CUDA programming in C++, I did an exercise that consists of displaying the prime numbers less than N. In each program I commented out the final display loop so that only the computation times are compared.
The Makefile:
# Build rules for the prime-number exercise.
#   nothing : premier.cpp compiled without OpenMP (sequential reference)
#   cpu     : premier.cpp compiled with OpenMP
#   gpu     : gpu_premier.cu compiled and linked with nvcc
#   clear   : remove every generated executable and object file
# None of the targets names a file that its recipe produces, so they are all
# declared phony; otherwise a stray file called "cpu" or "gpu" would stop the
# rule from running.
.PHONY: all nothing cpu gpu clear

# Default target: build the three variants. (The previous prerequisites,
# "sum" and "sum_cpu", have no rule in this Makefile, so a bare `make` failed.)
all: nothing cpu gpu

# Without -fopenmp the "#pragma omp" line in premier.cpp is ignored.
nothing:
	g++ -O3 -std=c++17 -o premier.exe premier.cpp -Wall

cpu:
	g++ -O3 -std=c++17 -o cpu_premier.exe premier.cpp -Wall -fopenmp

# Two-step build: compile to an object for compute capability 5.0, then link.
gpu:
	nvcc --compile --compiler-options -O3 -o gpu_premier.o gpu_premier.cu -gencode arch=compute_50,code=sm_50
	nvcc --link --compiler-options -O3 -o gpu_premier.exe gpu_premier.o

# -f keeps the rule from failing when there is nothing to delete.
clear:
	rm -f *.exe *.o
Here is my code parallelized with OpenMP, which runs in 1.306 s:
#include <math.h>
#include <iostream>
// Exclusive upper bound of the integers that are tested: 2^23 = 8388608.
const int N = 1 << 23;
// Result table, one flag per integer; allocated and released in main().
bool * premiers = nullptr;
// Primality test by trial division.
//
// nbr     : integer to test; any int value is accepted.
// returns : true when nbr is a prime number, false otherwise. Everything
//           below 2 (including negative numbers) is reported as not prime.
bool est_premier(int nbr){
    // 0, 1 and every negative number are not prime. Without this guard a
    // negative odd number would skip the loop (sqrt of a negative is NaN)
    // and be reported as prime.
    if (nbr < 2) return false;
    if (nbr == 2) return true;
    if (nbr % 2 == 0) return false;

    // The bound does not change inside the loop, so compute it once instead
    // of calling sqrt() on every iteration.
    const int limite = static_cast<int>(sqrt(static_cast<double>(nbr)));

    // nbr is odd here, so only odd divisors can divide it: step by 2.
    for (int diviseur = 3; diviseur <= limite; diviseur += 2){
        if (nbr % diviseur == 0){
            return false;
        }
    }
    return true;
}
// Fills the global table `premiers` with the primality of every integer in
// [0, N) and releases it again. The loop iterations are independent of one
// another, so they are shared between the OpenMP threads (when the program
// is built with -fopenmp).
int main(){
    premiers = new bool[N+1];

    #pragma omp parallel for
    for (int n = 0; n < N; n++){
        const bool verdict = est_premier(n);
        premiers[n] = verdict;
    }

    /* Display loop, disabled so that only the computation is timed:
    for (int i = 0;i<N;++i){
    if (premiers[i])
    std::cout<<i<<",";
    } std::cout<<std::endl;
    */

    delete[] premiers;
    return 0;
}
Here is the corresponding CUDA code, which runs in 1.613 s:
#include <cuda.h>
#include <iostream>
// Exclusive upper bound of the integers that are tested: 2^23 = 8388608.
const int N = 1 << 23;
// Host-side copy of the result table (one flag per integer).
bool * premiers_cpu = nullptr;
// Device-side result table; main() obtains it from cudaMalloc.
bool * premiers_gpu = nullptr;
__device__
bool est_premier(int nbr){
if ( nbr==1 || nbr == 0) return false;
else if (nbr == 2) return true;
else if (nbr % 2 == 0) return false;
else {
for (int i=3;i * i <= nbr ;++i){
if (nbr % i == 0){
return false;
}
}
}
return true;
}
// Writes premiers[i] = est_premier(i) for every i in [0, size).
//
// Launch layout: only the x dimension of the grid and of the block is used.
// Each thread starts at its global x index and advances by the total number
// of launched threads, so every launch configuration covers the whole range.
// No shared memory is used.
//
// premiers : device buffer holding at least `size` elements.
// size     : number of integers to test; nothing is written when size <= 0.
//
// NOTE(review): neighbouring threads test neighbouring integers, and the
// trial-division loop in est_premier runs for a very different number of
// iterations from one integer to the next (even numbers return at once), so
// threads of the same warp probably spend much of the time waiting for the
// slowest one. Confirm with a profiler before tuning the grid size.
__global__ void kernel_premier(bool * premiers, int size){
    // 64-bit index and stride: with an int index, `index += stride` can
    // overflow once `size` gets close to INT_MAX and wrap to a negative
    // value that still satisfies `index < size`.
    const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
    long long index = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (; index < size; index += stride){
        // index < size <= INT_MAX here, so the narrowing cast is lossless.
        premiers[index] = est_premier(static_cast<int>(index));
    }
}
int main(){
bool * premiers_cpu = new bool[N];
dim3 block (256,1);
dim3 grid (2048,1,1);
cudaMalloc(( void **) &premiers_gpu, N * sizeof(bool));
cudaMemcpy(premiers_gpu,premiers_cpu,N * sizeof(bool),cudaMemcpyHostToDevice);
kernel_premier<<<grid,block>>>(premiers_gpu,N);
cudaMemcpy(premiers_cpu,premiers_gpu,N * sizeof(bool),cudaMemcpyDeviceToHost);
/*
for (int i = 0;i<N;++i){
if (premiers_cpu[i])
std::cout<<i<<",";
} std::cout<<std::endl;
*/
delete[] premiers_cpu;
cudaFree(premiers_gpu);
}
Intuitively, I thought that lowering the size of the grid and increasing the number of blocks per grid would make the program more efficient, but the opposite happens. As it stands, my CUDA program is slower than my OpenMP program. How can this be explained, and how can I fix it?