3

I just started experimenting cuda with the following cude

#include "macro.hpp"
#include <algorithm>
#include <iostream>
#include <cstdlib>

//#define double float
//#define double int

int RandomNumber(){return static_cast<double>(rand() % 1000);}

__global__ void sum3(double const* a,
             double const* b,
             double const* c,
             double * result, 
             unsigned const* n)
{    
   unsigned i = blockIdx.x;
   while(i < (*n))
{
  result[i] = (a[i] + b[i] + c[i]);
}
};


int main()
{

  static unsigned size = 1e2;
  srand(0);
  double* a = new double[size];
  double* b = new double[size];
  double* c = new double[size];
  double* result = new double[size];

  std::generate(a, a+size, RandomNumber);
  std::generate(b, b+size, RandomNumber);
  std::generate(c, c+size, RandomNumber);

  double* ad, *bd,* cd;
  double* resultd;

  unsigned * sized;
  std::cout << cudaMalloc((void**) &ad, size*sizeof(double)) << std::endl;
  std::cout << cudaMalloc((void**) &bd, size*sizeof(double)) << std::endl;
  std::cout << cudaMalloc((void**) &cd, size*sizeof(double)) << std::endl;
  std::cout << cudaMalloc((void**) &resultd, size*sizeof(double)) << std::endl;
  std::cout << cudaMalloc((void**) &sized, sizeof(unsigned)) << std::endl;

  cudaMemcpy((void**) &sized, &size, sizeof(unsigned), cudaMemcpyHostToDevice);

  //  print_array(a, size);
  for(int i = 0; i < 1000; ++i)
    {
      cudaMemcpy(ad, a, size*sizeof(double), cudaMemcpyHostToDevice);
      cudaMemcpy(bd, b, size*sizeof(double), cudaMemcpyHostToDevice);
      cudaMemcpy(cd, c, size*sizeof(double), cudaMemcpyHostToDevice);      
      sum3<<<size, 1>>>(ad, bd, cd, resultd, sized);
      cudaMemcpy(result, resultd, size*sizeof(double), cudaMemcpyDeviceToHost);
    }

#ifdef PRINT
  for( int i = 0; i < size; ++i)
    {
      std::cout << a[i] << ", "<< b[i] <<"," << c[i] << "," << result[i]<< std::endl;
    }
#endif

  cudaFree(ad);
  cudaFree(bd);
  cudaFree(cd);
  cudaFree(resultd);

  delete[] a;
  delete[] b;
  delete[] c;
  delete[] result;

  return 0;
}

Compile this on mac book pro without any problem. However when I try to run this I get

930, 22,538,899
691, 832,205,23
415, 655,148,120
872, 876,481,985
761, 909,583,619
841, 104,466,917
610, 635,911,52
//more useless numbers

I have compared my samples with the one in Cuda By Example and I dont see any major difference except type. Any pointer on this problem is appreciated.

leon
  • 4,931
  • 7
  • 39
  • 37
  • What do you expect this line to do? std::generate(a, a+size, RandomNumber); It would expect that a+size is wrong, as adding an array pointer to a number is usually wrong, as you are going past the end of the array. – James Black Sep 04 '10 at 23:26
  • doesnt *(a+size) give the address space right after the element a[size-1]? – leon Sep 05 '10 at 02:41
  • @James: Going one past the array is fine and quite common. @leon: Your pointer arithmetic is fine. Though you really need to use Scope-bound Resource Management concepts (also known as RAII.) Use `std::vector` for dynamic arrays, *always*. And you should probably wrap the CUDA memory into a container as well. If you're in a position to have to free something, you've done it wrong. It should happen automatically. – GManNickG Sep 05 '10 at 04:07
  • @Gman In ideal world, all the code above should all be in straight C will `Malloc()` and `Free`. I am using `new` because it is few char less than `malloc()` in the prototype:) – leon Sep 05 '10 at 05:12
  • @leon: I'm not sure I follow. You use a `std::vector`, it's safer and easier. What do you lose by using it? It takes 2 minutes to make basic a wrapper around some memory. – GManNickG Sep 05 '10 at 05:20
  • @Gman Vector is not raw array afterall. This is a prototype so I want to model it as close to C as possible so I can switch to C very easily later. For example, if I use Vector, then I have to pass &(a[0]) instead of a into cudaMemcpy because not all c++ std::vector name points to the first element of the array. – leon Sep 05 '10 at 05:23
  • @leon: Yes, `&a[0]` or `&a.front()` is quite common. Switching to C isn't going to be any harder, but your C++ will sure be easier. – GManNickG Sep 05 '10 at 06:52

2 Answers2

1
while(i < (*n))
{
  result[i] = (a[i] + b[i] + c[i]);
}

is wrong (infinite)

this is wrong

cudaMemcpy((void**) &sized, &size, sizeof(unsigned), cudaMemcpyHostToDevice);

&sized is address of pointer variable, not pointer value

Single number can be passed to device on the stack, so use

unsigned size

check return status of your cuda functions, http://www.drdobbs.com/high-performance-computing/207603131

Anycorn
  • 50,217
  • 42
  • 167
  • 261
-1

you wrote:

double* a = new double[size];

so, "a" is a pointer to an array of doubles, then you say

  std::generate(a, a+size, RandomNumber);
  std::generate(b, b+size, RandomNumber);
  std::generate(c, c+size, RandomNumber);

which is wrong, you should say

  std::generate(*a, (*a)+size, RandomNumber);
  std::generate(*b, (*b)+size, RandomNumber);
  std::generate(*c, (*c)+size, RandomNumber);

Would be easier to help you if you state what do you want your program to do.

Also, you put

 unsigned * sized;
 std::cout << cudaMalloc((void**) &ad, size*sizeof(double)) << std::endl;

but you could do

 unsigned * sized;
 std::cout << cudaMalloc((void*) ad, size*sizeof(double)) << std::endl;

depending on what you are trying to do.

user290149
  • 99
  • 1
  • 2
  • In C/C++ the name `a` in `a[]` is the pointer that points to the first element of the array, and not pointing to an array of double. Second, `std::generate` takes pointer(iterator) that points to the beginning of the range and the memory address right after the last element in the range. `(*a)+size` = a[0]+size does not make sense for the generate argument. lastly , `cudaMalloc()` takes pointer of pointer so I am passing `&ad`. `(void**) &ad` is not the same as `(void*)ad`. What is the same is `*&ad` and `ad` but this is not what I am doing. – leon Sep 05 '10 at 05:05