I'm new to CUDA. To get my hands dirty, I tried writing a Sieve of Eratosthenes (for finding all the primes up to some number n).
There are a number of things I had to do to get it to work that it seems shouldn't have been necessary. I'm curious whether anyone knows of a more natural (and still CUDA-optimized) approach.
- To take the entries marked as prime in the isPrime array, I had to do two separate kernel calls. The first counts the number of primes in each threadblock and assigns to each entry i the number of primes in that block less than i. Then I have to make a second call to add in the number of primes in all the previous blocks in order to get the final index.
- But it's even worse than that, because to avoid heaps of concurrent reads, I had to store the number of primes in the block in a separate array at each of THREADS_PER_BLOCK indices effectively doubling the required memory for the algorithm. It seems like there should be a way to have all the threads read the same value for each block rather than have to copy it so many times.
- Despite all this, there's still the problem of concurrent reads in the clearMultiples method. Especially for small primes like 2 and 3, every thread has to read the value in. Isn't there any way to deal with this?
Could anyone look at my code and tell me if there's anything obvious I could do that would be simpler or more efficient?
Is there anything I'm doing that's particularly inefficient (besides printing out all the primes at the end of course)?
Is it necessary to call synchronize after every kernel call?
Do I need to synchronize after memcpy's as well?
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Thank you
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
#include <math.h>
#define MAX_BLOCKS 256
#define THREADS_PER_BLOCK 256 //Must be a power of 2
#define BLOCK_SPACE 2 * THREADS_PER_BLOCK
__global__ void initialize(int* isPrime, int n) {
int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
int step = gridDim.x * THREADS_PER_BLOCK;
int i;
for (i = idx; i <= 1; i += step) {
isPrime[i] = 0;
}
for (; i < n; i += step) {
isPrime[i] = 1;
}
}
__global__ void clearMultiples(int* isPrime, int* primeList, int startInd,
int endInd, int n) {
int yidx = blockIdx.y * blockDim.y + threadIdx.y;
int xidx = blockIdx.x * blockDim.x + threadIdx.x;
int ystep = gridDim.y * blockDim.y;
int xstep = gridDim.x * blockDim.x;
for (int pnum = startInd + yidx; pnum < endInd; pnum += ystep) {
int p = primeList[pnum];
int pstart = p * (p + xidx);
int pstep = p * xstep;
for (int i = pstart; i < n; i += pstep) {
isPrime[i] = 0;
}
}
}
__device__ void makeCounts(int* isPrime, int* addend, int start, int stop) {
__shared__ int tmpCounts[BLOCK_SPACE];
__shared__ int dumbCounts[BLOCK_SPACE];
int idx = threadIdx.x;
tmpCounts[idx] = ((start + idx) < stop) ? isPrime[start + idx] : 0;
__syncthreads();
int numEntries = THREADS_PER_BLOCK;
int cstart = 0;
while (numEntries > 1) {
int prevStart = cstart;
cstart += numEntries;
numEntries /= 2;
if (idx < numEntries) {
int i1 = idx * 2 + prevStart;
tmpCounts[idx + cstart] = tmpCounts[i1] + tmpCounts[i1 + 1];
}
__syncthreads();
}
if (idx == 0) {
dumbCounts[cstart] = tmpCounts[cstart];
tmpCounts[cstart] = 0;
}
while (cstart > 0) {
int prevStart = cstart;
cstart -= numEntries * 2;
if (idx < numEntries) {
int v1 = tmpCounts[idx + prevStart];
int i1 = idx * 2 + cstart;
tmpCounts[i1 + 1] = tmpCounts[i1] + v1;
tmpCounts[i1] = v1;
dumbCounts[i1] = dumbCounts[i1 + 1] = dumbCounts[idx + prevStart];
}
numEntries *= 2;
__syncthreads();
}
if (start + idx < stop) {
isPrime[start + idx] = tmpCounts[idx];
addend[start + idx] = dumbCounts[idx];
}
}
__global__ void createCounts(int* isPrime, int* addend, int lb, int ub) {
int step = gridDim.x * THREADS_PER_BLOCK;
for (int i = lb + blockIdx.x * THREADS_PER_BLOCK; i < ub; i += step) {
int start = i;
int stop = min(i + step, ub);
makeCounts(isPrime, addend, start, stop);
}
}
__global__ void sumCounts(int* isPrime, int* addend, int lb, int ub,
int* totalsum) {
int idx = blockIdx.x;
int s = 0;
for (int i = lb + idx; i < ub; i += THREADS_PER_BLOCK) {
isPrime[i] += s;
s += addend[i];
}
if (idx == 0) {
*totalsum = s;
}
}
__global__ void condensePrimes(int* isPrime, int* primeList, int lb, int ub,
int primeStartInd, int primeCount) {
int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
int step = gridDim.x * THREADS_PER_BLOCK;
for (int i = lb + idx; i < ub; i += step) {
int term = isPrime[i];
int nextTerm = i + 1 == ub ? primeCount : isPrime[i + 1];
if (term < nextTerm) {
primeList[primeStartInd + term] = i;
}
}
}
int main(void) {
printf("Enter upper bound:\n");
int n;
scanf("%d", &n);
int *isPrime, *addend, *numPrimes, *primeList;
cudaError_t t = cudaMalloc((void**) &isPrime, n * sizeof(int));
assert(t == cudaSuccess);
t = cudaMalloc(&addend, n * sizeof(int));
assert(t == cudaSuccess);
t = cudaMalloc(&numPrimes, sizeof(int));
assert(t == cudaSuccess);
int primeBound = 2 * n / log(n);
t = cudaMalloc(&primeList, primeBound * sizeof(int));
assert(t == cudaSuccess);
int numBlocks = min(MAX_BLOCKS,
(n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
int bound = (int) ceil(sqrt(n));
int lb;
int ub = 2;
int primeStartInd = 0;
int primeEndInd = 0;
while (ub < n) {
if (primeEndInd > primeStartInd) {
int lowprime;
t = cudaMemcpy(&lowprime, primeList + primeStartInd, sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
int numcols = n / lowprime;
int numrows = primeEndInd - primeStartInd;
int threadx = min(numcols, THREADS_PER_BLOCK);
int thready = min(numrows, THREADS_PER_BLOCK / threadx);
int blockx = min(numcols / threadx, MAX_BLOCKS);
int blocky = min(numrows / thready, MAX_BLOCKS / blockx);
dim3 gridsize(blockx, blocky);
dim3 blocksize(threadx, thready);
clearMultiples<<<gridsize, blocksize>>>(isPrime, primeList,
primeStartInd, primeEndInd, n);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
}
lb = ub;
ub *= 2;
if (lb >= bound) {
ub = n;
}
numBlocks = min(MAX_BLOCKS,
(ub - lb + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
createCounts<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, addend, lb, ub);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
sumCounts<<<THREADS_PER_BLOCK, 1>>>(isPrime, addend, lb, ub, numPrimes);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
int primeCount;
t = cudaMemcpy(&primeCount, numPrimes, sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
assert(primeCount > 0);
primeStartInd = primeEndInd;
primeEndInd += primeCount;
condensePrimes<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, primeList, lb,
ub, primeStartInd, primeCount);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
}
int finalprimes[primeEndInd];
t = cudaMemcpy(finalprimes, primeList, primeEndInd * sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
t = cudaFree(isPrime);
assert(t == cudaSuccess);
t = cudaFree(addend);
assert(t == cudaSuccess);
t = cudaFree(numPrimes);
assert(t == cudaSuccess);
t = cudaFree(primeList);
assert(t == cudaSuccess);
for (int i = 0; i < primeEndInd; i++) {
if (i % 16 == 0)
printf("\n");
else
printf(" ");
printf("%4d", finalprimes[i]);
}
printf("\n");
return 0;
}