-2

My problem is to find out the number of integer points in n dimensional sphere using CUDA. I dont understand what is wrong with the below code but it is giving 0 output all the time. CUDA compute capability is 2.0 and tool kit version is 3.10. Thanks for all the help.

__global__ void count_in(int pow_rad, int ndim,int *digit,int w,unsigned int *count,double radius)
{


long int i,j;
int rem,idx,sq,num;
int iy=blockDim.y * blockIdx.y + threadIdx.y;
int ix=blockDim.x * blockIdx.x + threadIdx.x;
int width=gridDim.x*blockDim.x;
int h=2*w+1;
i=iy*width+ix;
if(i>pow_rad) return;

    sq=0;
    idx=0;
    num=i;
    for(j=0;j<ndim;j++)
        {digit[j]=0;}
    while(num!=0)
    {
        rem=num%w;
        num/=w;
        digit[idx]=rem;
        idx++;
    }
    for(j=0;j<ndim;j++)
        {sq+=(digit[j]-h)*(digit[j]-h);}
    if(sq<(radius*radius))
        atomicInc(count,(unsigned int)1);
    __syncthreads();
}

int main(int argc, char* argv[]) 
{
const long ntrials = 5;
int i;
for (int n = 0; n < ntrials; ++n) {
    int *digit;
    unsigned int *count;
    std::cout<<n<<std::endl;
    int pow_rad;
    unsigned int num;
    // Select radius and number of dimensions at random
    const double r = drand48() * (RMAX - RMIN) + RMIN;
    const int   nd = lrand48() % (MAXDIM - 1) + 1;
    cudaMalloc((void**) &digit,sizeof(int)*nd);
    cudaMalloc((void**) &count,sizeof(unsigned int));
    cudaMemset(count,0,sizeof(unsigned int));
    int h=(int)floor(r);
    int w=2*h+1;
    std::cout << "###"<< r <<" "<< nd<< std::endl;
    for(i=1;i<=nd;i++)
        pow_rad*=w;
    int width=(int)sqrt(pow_rad);
    // Call your function
    dim3 dimBlock(32,32);
    dim3 dimGrid((width/32)+1,(width/32)+1);

count_in<<<dimGrid,dimBlock>>>(pow_rad, nd,digit,w,count,r);
    cudaMemcpy(&num,count,sizeof(unsigned int),cudaMemcpyDeviceToHost);
std::cout << "-->"<<num << std::endl;
}
}
einpoklum
  • 118,144
  • 57
  • 340
  • 684
NIDIA LAL
  • 23
  • 4
  • 2
    Maybe a litte more explanation on your code would be advised. Stackoverflow is a community [not a debugger](http://stackoverflow.com/help/on-topic). – Kryptos Jun 19 '15 at 08:02
  • Sorry about that. I am new to the community and didn't realise. What I am trying to do here is convert the number to base 2H+1 so that I can check if the point is inside the sphere. My sequential version of the program is giving results, but when I do this all I get is zero. I am new to CUDA as well. Thanks for the help – NIDIA LAL Jun 19 '15 at 08:57
  • no problem. My comment was just to help you improve your question ;) The more precise, the more detailed answers you will get. – Kryptos Jun 19 '15 at 09:07
  • 2
    The code you have posted is not complete and will not compile. It is very hard to diagnose problems when there isn't the *actual* code you are compiling and running in the question – talonmies Jun 19 '15 at 09:22
  • Also, are you in the habit of ignoring compiler warnings? Because I see at least one uninitialised variable being passed to your kernel as an argument – talonmies Jun 19 '15 at 09:34

1 Answers1

1

I didn't look at all of your code, but the lines

    atomicInc(count,(unsigned int)1);

seems to show a common misunderstanding of the atomicInc function. The second argument is not the amount to increment, but the modulus; when the global variable reaches that amount, it resets to zero. With the value you specified, each time the statement executes the variable count is reset to 0.

If you change atomicInc to atomicAdd, or if you change the modulus to something large enough that it will never be reached, it should work better.

ime
  • 126
  • 4
  • 1
    While this is true and a very valid point, I'm fairly certain that the `atomicInc` statement in the kernel never gets executed because the kernel blows up with out of bounds memory access before it is reached. – talonmies Jun 19 '15 at 20:50