0

i am tring to build a cuda program to do ray-tracing, and i have some code below:

void build_world(World *w, RGBAColor* buffer){  
w->vp = (ViewPlane*) malloc(sizeof(ViewPlane));

w->vp->hres = 512;
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;

ViewPlane *viewplane;
cudaMalloc(&viewplane,sizeof(ViewPlane)); //return cudaSuccess but pointer still NULL
cudaMemcpy(viewplane,w->vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
free(w->vp);
w->vp = viewplane;

cudaMalloc(&(w->background_color),sizeof(RGBAColor)); //return cudaSuccess but pointer still NULL
*(w->background_color) = black;  //Memory access error

cudaMalloc(&(w->sphere),sizeof(Sphere));  //return cudaSuccess but pointer still NULL
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}

World *w is a static global pointer, and it is in the global memory. My problem is that i can not allocate memory in device memory, all "cudaMalloc" calls do not work for most of the time.


i do what @RobertCrovella suggested in comment, like this:

void build_world(World *w, RGBAColor* buffer){

    checkCudaErrors( cudaMalloc(&(w->vp),sizeof(ViewPlane)));
    getLastCudaError("viewplane allocate failed");

    w->vp->hres = 512;  //memory access errors occurs here
    w->vp->vres = 512;
    w->vp->buffer = buffer;
    w->vp->s = 1;       

    checkCudaErrors( cudaMalloc(&(w->background_color),sizeof(RGBAColor)));
    getLastCudaError("background allocate failed");
    *(w->background_color) = black;

    checkCudaErrors( cudaMalloc(&(w->sphere),sizeof(Sphere)));
    getLastCudaError("sphere allocate failed");

    w->sphere->center = Point3D(0.0,0.0,0.0);
    w->sphere->radius = 300;
}

and it works once...the cudaMalloc API still returns "cudaSuccess" when it's not.

here is the definitions of structure:

typedef float3 Point3D;
typedef uchar4 RGBAColor;
struct Sphere{
    Point3D center;
    float radius;
};
struct ViewPlane{
public:
    int hres;
    int vres;
    float s;
    //float gamma;
    //float inv_gamma;

    RGBAColor *buffer;

};
struct World{
public:

    ViewPlane *vp;
    RGBAColor *background_color;
    Sphere *sphere;

};

after considering the issues that @RobertCrovella mentions in the answer below, here is the third version of build_world:

struct World{
public:

    ViewPlane *vp;
    RGBAColor background_color;
    Sphere *sphere;

};
void build_world(World *w, RGBAColor* buffer){  
    World *h_world;
    h_world = (World*)malloc(sizeof(World));

    ViewPlane *h_vp = (ViewPlane*)malloc(sizeof(ViewPlane));
    h_vp->hres = 512;
    h_vp->vres = 512;
    h_vp->buffer = buffer;
    h_vp->s = 1;        
    checkCudaErrors( cudaMalloc(&(h_world->vp),sizeof(ViewPlane)));
    getLastCudaError("viewplane allocate failed");
    checkCudaErrors( cudaMemcpy(h_world->vp,h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice));
    getLastCudaError("viewplane memory copy failed");

    h_world->background_color = black;

    Sphere *h_sphere = (Sphere*)malloc(sizeof(Sphere));
    h_sphere->center = Point3D(0.0,0.0,0.0);
    h_sphere->radius = 300;
    checkCudaErrors( cudaMalloc(&(h_world->sphere),sizeof(Sphere)));
    getLastCudaError("sphere allocate failed");
    checkCudaErrors( cudaMemcpy(h_world->sphere,h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice));
    getLastCudaError("sphere memory copy failed");

    checkCudaErrors( cudaMalloc( &w , sizeof(World)));
    getLastCudaError( "world allocate failed" );
    checkCudaErrors( cudaMemcpy(w,h_world,sizeof(World),cudaMemcpyHostToDevice));
    getLastCudaError("world memory copy failed");

    free(h_world);free(h_vp);free(h_sphere);    
}

this time, all cudaMemcpy calls don't work: when running to the end of this function, the value of h_vp and h_sphere is good; h_world->vp and h_world->sphere do point to an area of device momery but contains wrong value;w does not have correct value, all pointer it contains is 0x00000000...

Clones1201
  • 333
  • 3
  • 17
  • 1. do [cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) on all cuda calls and kernel calls 2. report the specific line that the error(s)are occurring on and the specific error message that is displayed when you do cuda error checking on that line 3. provide all relevant structure/class definitions, such as `World` in this case. – Robert Crovella Mar 23 '13 at 02:27
  • it works when i do the "checkCudaErrors" to all the "cudaMalloc", why? – Clones1201 Mar 23 '13 at 02:49
  • @RobertCrovella it only works once... – Clones1201 Mar 23 '13 at 05:39
  • 1
    The two versions of `build_world` you posted differ in very important ways, apart from just the error checking I asked you to add. You have made significant mistakes in the second code posting that were not present in the first code posting. So I started to try to answer this but found the code too confusing to try to refer to in my answer. Also, I need to understand how you are calling `build_world` and the actual allocation of the pointers you are passing to it. If `*w` is a pointer to device global memory, none of this will work as written. – Robert Crovella Mar 23 '13 at 15:17

1 Answers1

0

This question has officially become "a mess" because you have posted two substantially different versions of build_world which differ in important ways, apart from just the error checking I asked you to add. I will try and address some issues as I see them, however my understanding is clouded by the confusion in your posting.

  • If the pointer *w that you are passing to build_world is already a device pointer (i.e. allocated with cudaMalloc) which seems to be what you are saying, then none of this will work. Creating data structures on the device, which also contain pointers to other data structures that are also on the device, is a somewhat non-intuitive process. You cannot pass a pointer to cudaMalloc that already lives on the device (i.e. is already part of a region created with cudaMalloc. Instead it's necessary to create a parallel set of pointers on the host, cudaMalloc these pointers individually, then copy the pointer values to the appropriate regions in the device data structure, using cudaMemcpy. To see another example of what I am referring to, take a look here.
  • You cannot dereference device pointers in host code. For example:

    w->vp->hres = 512;
    

    If w or w->vp is a pointer set up with cudaMalloc, then the above operation is invalid. Instead it's necessary to create a parallel data structure on the host, set the values there, then cudaMemcpy from host to device:

    h_vp->hres = 512;
    cudaMemcpy(d_vp, h_vp, sizeof(vp_struct), cudaMemcpyHostToDevice);
    

    Note that in this simplified description I'm glossing over the issue I mentioned in the first point above.

  • If you are calling build_world over and over again, you need to make sure that you are properly using cudaFree if you are passing the same *w pointer.

EDIT: In response to the additional posting of the 3rd version of build_world I elected to create a sample code which should have the remaining issues fixed:

#include <stdio.h>
#include <vector_functions.h>

#define black make_uchar4(4,3,2,1)
#define white make_uchar4(0,1,2,3)

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef float3 Point3D;
typedef uchar4 RGBAColor;
struct Sphere{
    Point3D center;
    float radius;
};
struct ViewPlane{
public:
    int hres;
    int vres;
    float s;
    //float gamma;
    //float inv_gamma;

    RGBAColor *buffer;

};

struct World{
public:

    ViewPlane *vp;
    RGBAColor background_color;
    Sphere *sphere;

};

__global__ void my_kernel(World *w){

  printf("w->vp->hres = %d\n", w->vp->hres);
  printf("w->background_color.y = %d\n", w->background_color.y);
  printf("w->sphere->radius = %f\n", w->sphere->radius);
  printf("w->vp->buffer->y = %d\n", w->vp->buffer->y);

}


void build_world(World **w, RGBAColor* buffer){
    World *h_world;
    h_world = (World*)malloc(sizeof(World));

    ViewPlane *h_vp = (ViewPlane*)malloc(sizeof(ViewPlane));
    h_vp->hres = 512;
    h_vp->vres = 512;
    h_vp->s = 1;
    cudaMalloc((void **)&(h_vp->buffer), sizeof(RGBAColor));
    cudaCheckErrors("viewplane RGBAColor allocate failed");
    cudaMemcpy(h_vp->buffer, buffer, sizeof(RGBAColor), cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane RGBAColor copy failed");

    cudaMalloc((void **)&(h_world->vp),sizeof(ViewPlane));
    cudaCheckErrors("viewplane allocate failed");
    cudaMemcpy(h_world->vp,h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane memory copy failed");

    h_world->background_color = black;

    Sphere *h_sphere = (Sphere*)malloc(sizeof(Sphere));
    h_sphere->center = (Point3D) make_float3(0.0,0.0,0.0);
    h_sphere->radius = 300;
    cudaMalloc((void **)&(h_world->sphere),sizeof(Sphere));
    cudaCheckErrors("sphere allocate failed");
    cudaMemcpy(h_world->sphere,h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice);
    cudaCheckErrors("sphere memory copy failed");

    cudaMalloc((void **)w , sizeof(World));
    cudaCheckErrors( "world allocate failed" );
    cudaMemcpy(*w,h_world,sizeof(World),cudaMemcpyHostToDevice);
    cudaCheckErrors("world memory copy failed");

    free(h_world);free(h_vp);free(h_sphere);
}



int main(){

   World *d_w;
   RGBAColor my_buffer = white;

   build_world(&d_w, &my_buffer);
   my_kernel<<<1,1>>>(d_w);
   cudaDeviceSynchronize();
   cudaCheckErrors("kernel fail");
   return 0;
}

You can compile this code with nvcc -arch=sm_20 -o t98 t98.cu

When I compile and run this code, I get no errors and the following output:

$ ./t98
w->vp->hres = 512
w->background_color.y = 3
w->sphere->radius = 300.000000
w->vp->buffer->y = 1
$ 
Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • `World *w` the pointer itself i think is in host memory, and it should point to an area of device memory. i am trying to allocate device memory for pointer `*w` within the `build_world` ,so i have to call `cudaMalloc` for `*w` within the `build_world`... i post a new version of `build_world` and it brings another problem... – Clones1201 Mar 24 '13 at 16:29
  • Your 3rd posting of `build_world` was pretty close to being correct. Rather than try to explain the final issues, I elected to update my answer with a working sample code, which should have the remaining issues addressed. – Robert Crovella Mar 24 '13 at 21:08