0

I want to implement MonteCarlo using CUDA.

I wrote my code on a Windows 8 PC using Visual Studio 2012 / CUDA 5.5 / GT 720M, and it runs well.

Then I tried to compile my code on RHEL 5.3 / Tesla C1060 / CUDA 2.3, but the result turned out to be wrong.

So I wanted to use cuda-gdb to debug it,

but, when I compile my code like this:

nvcc -arch=sm_13 -o my_program my_program.cu

the result is wrong. However, I can't debug it because the binary is built without debug information.

When I compile it like this:

nvcc -g -G -arch=sm_13 -o my_program my_program.cu

This time the result is correct... so I still can't find my bug by debugging it.

The code looks like this. The function __device__ float monte_carlo_try() shown here is only a placeholder; it is not the one in the real code. The problem is that when I check the values of test[], they are all correct, so there should be some error in the reduction part.

#include<stdio.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cuda.h>
#include<malloc.h>
#include<time.h>

// Compile-time launch and workload configuration.
#define B 4 // number of blocks in the grid (first launch argument in main)
#define T 4 // threads per block; also the length of the kernel's shared cache[] array
#define P 4 // number of monte_carlo_try() samples accumulated by each thread
/*
 * Placeholder for a single Monte Carlo trial.
 *
 * Always returns the constant 3.0f so that the surrounding reduction can be
 * checked against a known answer. The literal carries the `f` suffix so the
 * value is a float from the start instead of a double that is narrowed on
 * return.
 */
__device__ float monte_carlo_try()
{
    return 3.0f;
}
/*
 * Monte Carlo kernel: every thread accumulates P samples from
 * monte_carlo_try(), then the per-thread totals of a block are summed through
 * shared memory.
 *
 * Launch layout: 1D grid of 1D blocks. blockDim.x must not exceed T, because
 * the shared scratch array is statically sized to T floats. Any block size in
 * [1, T] is handled, including sizes that are not a power of two.
 *
 * test   - output, one float per launched thread (gridDim.x * blockDim.x);
 *          receives each thread's own total before the reduction, so the host
 *          can inspect the un-reduced values.
 * result - output, one float per block (gridDim.x); receives the block total.
 */
__global__ void monte_carlo(float *test, float *result)
{
    const unsigned int lane = threadIdx.x;
    const unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ float cache[T];

    // Per-thread accumulation lives in a register; no need to pre-zero cache[].
    float temp = 0.0f;
    for (int i = 0; i < P; i++)
    {
        temp += monte_carlo_try();
    }

    cache[lane] = temp;
    test[gid]   = temp;
    // Every thread's partial sum must be visible before any thread reads a
    // neighbour's slot.
    __syncthreads();

    // Tree reduction. `active` is the number of slots still holding data; the
    // upper part is folded onto the lower part each round. Rounding `half` up
    // keeps the middle element when `active` is odd, so no value is dropped
    // for non-power-of-two block sizes. For power-of-two sizes this performs
    // exactly the same additions as a plain halving loop.
    for (unsigned int active = blockDim.x; active > 1; )
    {
        const unsigned int half = (active + 1) / 2;
        if (lane + half < active)
            cache[lane] += cache[lane + half];
        // The barrier sits outside the branch so all threads reach it.
        __syncthreads();
        active = half;
    }

    // One writer per block is enough; cache[0] now holds the block total.
    if (lane == 0)
        result[blockIdx.x] = cache[0];
}

/*
 * Host driver: allocates the two device output buffers, launches
 * monte_carlo with B blocks of T threads, copies the results back and prints
 * the average sample value followed by every per-thread total.
 *
 * Every CUDA runtime call has its status passed to check_err(), which only
 * reports the failure; execution continues afterwards.
 */
int main()
{
    void check_err(cudaError_t );   // defined after main in this file

    cudaError_t s_flag;
    float *dev_v = NULL;      // per-block sums, B floats
    float *dev_test = NULL;   // per-thread sums, B*T floats

    s_flag = cudaSetDevice(0);
    check_err(s_flag);

    s_flag = cudaMalloc((void**)&dev_v, B*sizeof(float));
    check_err(s_flag);

    // The status must be captured here; otherwise the check below would
    // re-examine the result of the previous allocation.
    s_flag = cudaMalloc((void**)&dev_test, B*T*sizeof(float));
    check_err(s_flag);

    monte_carlo<<<B,T>>>(dev_test, dev_v);
    // A kernel launch returns nothing; launch errors are fetched explicitly.
    s_flag = cudaGetLastError();
    check_err(s_flag);

    // Zero-initialised so that a failed copy prints zeros, not stack garbage.
    float v[B] = {0.0f};
    float test[B*T] = {0.0f};

    // Blocking copies: they wait for the kernel and report its execution errors.
    s_flag = cudaMemcpy(v, dev_v, B*sizeof(float), cudaMemcpyDeviceToHost);
    check_err(s_flag);

    s_flag = cudaMemcpy(test, dev_test, B*T*sizeof(float), cudaMemcpyDeviceToHost);
    check_err(s_flag);

    float sum = 0;
    for (int i = 0; i < B; i++)
    {
        sum += v[i];
    }
    // Average over all B*T*P samples.
    printf("result:%f\n", sum/(B*T*P));
    for (int i = 0; i < B*T; i++)
    {
        printf("test[%d]=%f\n", i, test[i]);
    }

    s_flag = cudaFree(dev_v);
    check_err(s_flag);
    s_flag = cudaFree(dev_test);
    check_err(s_flag);
    return 0;
}
/*
 * Print the runtime's description of a CUDA status code.
 *
 * f - status returned by a CUDA runtime call. Nothing is printed for
 *     cudaSuccess; any other value is reported on stdout. The function
 *     always returns to the caller and never aborts.
 */
void check_err(cudaError_t f)
{
    if (f == cudaSuccess)
        return;

    const char *description = cudaGetErrorString(f);
    printf("error msg:%s\n", description);
}
Alaya
  • 3,287
  • 4
  • 27
  • 39
  • This code will not even compile. If you want help debugging, post the shortest possible code that someone else could compile and run and explain exactly how the output is incorrect. Otherwise it will be impossible to answer your question. Vote to close – talonmies Apr 17 '14 at 04:19
  • You keep saying that the results are "wrong" without any explanation of what that means. How are they wrong? What is the answer you expect? – talonmies Apr 17 '14 at 08:52
  • The sum of test[] and the sum of v[] should be equal as the code shows, but if I compile the code with `nvcc -arch=sm_13 -o my_prog my_prog.cu`, they are not equal, values in test[] are correct, while compiled with `nvcc -arch=sm_13 -g -G -o my_prog my_prog.cu` they are equal and correct. – Alaya Apr 17 '14 at 09:09
  • 1
    Your code prints out the sum of `v[]` divided by the total number of operations, followed by each entry of `test[]`. When I compile and run it in release mode, I get result=3 and 12 for each entry in `test[]`, which is correct. Do you get something different? – talonmies Apr 17 '14 at 09:20

1 Answers1

1

You probably mean for this line in main():

cudaMalloc((void**)*dev_test,B*T*sizeof(float));

to read like this instead:

cudaMalloc((void**)&dev_test,B*T*sizeof(float));

Additionally, you call

monte_carlo(dev_test,dev_v);

Since monte_carlo is a CUDA kernel, you probably should be setting the number of blocks and threads the kernel should launch with:

monte_carlo<<<num_blocks, threads_per_block>>>(dev_test, dev_v);
cklin
  • 900
  • 4
  • 16
  • If that error is actually in your code you probably will get segfaults or at least undefined behavior. – cklin Apr 17 '14 at 03:57
  • No, the two errors you mentioned above are not in my code...all typing errors... – Alaya Apr 17 '14 at 04:57
  • 2
    Then perhaps just cut and paste your code instead of typing. It is very hard to help otherwise. Also, please make a SSCCE -- e.g., `monte_carlo_try` isn't defined anywhere in what you show. – cklin Apr 17 '14 at 05:05
  • I have updated the description of the problem , the code now should be OK to be compiled – Alaya Apr 17 '14 at 05:34