1

when I try to copy array of Nodes from device back to host I get zeros in Node.m[...] instead of values, even though when I print node in kernel it shows that values are set properly. Unfortunately I am not able to detect any mistake on my own, so I kindly ask you for help. I compile code with visual studio compiler and compute capability 3. Code from this answer is working for me though.

I paste whole code, but only meaningful parts are

__global__ void divideLeft(Node * nodes,float * leftSide){...}

and

divideLeft<<<1,1>>>(dNodes,dLeftSide);
ERRCHECK(cudaDeviceSynchronize());
ERRCHECK(cudaGetLastError());
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
printNode(nodes[3]);

 #include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <conio.h>
#include <new>
#include <cmath>

#define ERRCHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true,bool wait=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (wait) getch();
      if (abort) exit(code);
   }
}

#define MSIZE 36
#define INPUT_SIZE(N) N*5 - 3*2
#define PARENT(i) (i-1)/2
#define LEFT(i) 2*i + 1
#define RIGHT(i) 2*i + 2
#define BOTTOM_HEAP_NODES_COUNT(N) (N-2)/3 //size of input must be 2+3n,n>1
#define HEAP_SIZE(N) 2*BOTTOM_HEAP_NODES_COUNT(N)-1 
#define FIRST_LEVEL_SIZE 19
#define ROW_LENGTH 5
#define FIRST_LVL_MAT_SIZE 5
#define XY(x,y) x*6+y

__constant__ int dHigherTreeLevelThreshold;
__constant__ int dNodesCount;
__constant__ int dLeftSize;
__constant__ int dHeapSize;
__constant__ int dBottomNodes;
__constant__ int dRemainingNodes;
__constant__ int dRightCols;
__constant__ int dInputCount;

struct Node
{
    float m[MSIZE];
    float *x;
};

__device__ __host__ void printNode(Node node);
__global__ void divideLeft(Node * nodes,float * leftSide)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx>=dBottomNodes)
        return;
    int nodeIdx = idx + dRemainingNodes - (idx >= dHigherTreeLevelThreshold)*dBottomNodes;
//  printf("%d %d\n",idx,nodeIdx);
    Node node = nodes[nodeIdx];
    idx*=5*3;
    node.m[XY(3,3)] = leftSide[idx+2]/3;
    node.m[XY(3,2)] = leftSide[idx+3]/2;
    node.m[XY(3,1)] = leftSide[idx+4];

    node.m[XY(2,3)] = leftSide[idx+6]/2;
    node.m[XY(2,2)] = leftSide[idx+7]*2/3;
    node.m[XY(2,1)] = leftSide[idx+8];
    node.m[XY(2,4)] = leftSide[idx+9];

    node.m[XY(1,3)] = leftSide[idx+10];
    node.m[XY(1,2)] = leftSide[idx+11];
    node.m[XY(1,1)] = leftSide[idx+12];
    node.m[XY(1,4)] = leftSide[idx+13];
    node.m[XY(1,5)] = leftSide[idx+14];

    node.m[XY(4,2)] = leftSide[idx+15];
    node.m[XY(4,1)] = leftSide[idx+16];
    node.m[XY(4,4)] = leftSide[idx+17]*2/3;
    node.m[XY(4,5)] = leftSide[idx+18]/2;

    node.m[XY(5,1)] = leftSide[idx+20];
    node.m[XY(5,4)] = leftSide[idx+21]/2;
    node.m[XY(5,5)] = leftSide[idx+22]/3;
    printNode(node);
}

void leftSideInit(float * leftSide,int size)
{
    for(int i = 0;i<size;i++)
    {
        leftSide[i] = 1;//(i+1)%26;
    }
}

int main(){
    ERRCHECK(cudaSetDevice(0));

    int leftCount = 11;
    int leftSize = leftCount*5;
    int rightSize = 10;
    int heapSize = HEAP_SIZE(leftCount);
    int bottomNodes = BOTTOM_HEAP_NODES_COUNT(leftCount);
    int greatestPowerOfTwo = pow(2,(int)log2(bottomNodes));
    int remainingNodes = heapSize - greatestPowerOfTwo;

    ERRCHECK(cudaMemcpyToSymbol(dBottomNodes,&bottomNodes,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dHigherTreeLevelThreshold,&greatestPowerOfTwo,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dRemainingNodes,&remainingNodes,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dRightCols,&rightSize,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dHeapSize,&heapSize,sizeof(int)));

    float * leftSide = new float[leftSize];
    float * rightSide = new float[rightSize];
    Node * nodes = new Node[heapSize];
    Node * dNodes = nullptr;
    float * dLeftSide =nullptr;
    leftSideInit(leftSide,leftSize);

    ERRCHECK(cudaMalloc(&dNodes,sizeof(Node)* heapSize));
    ERRCHECK(cudaMemset(dNodes,0,sizeof(Node)*heapSize));
    ERRCHECK(cudaMalloc(&dLeftSide,leftSize*sizeof(float)));
    ERRCHECK(cudaMemcpy(dLeftSide,leftSide,leftSize*sizeof(float),cudaMemcpyHostToDevice));
    divideLeft<<<1,1>>>(dNodes,dLeftSide);
    ERRCHECK(cudaDeviceSynchronize());
    ERRCHECK(cudaGetLastError());
    ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
    printNode(nodes[3]);
    delete [] nodes;
    cudaFree(dNodes);

    ERRCHECK(cudaDeviceReset());

    getch();
    return 0;
}

__device__ __host__ void printNode(Node node)
{   
    for(int i= 0;i<6;i++)
        printf("%.3f %.3f %.3f %.3f %.3f %.3f\n",node.m[XY(i,0)],node.m[XY(i,1)],node.m[XY(i,2)],node.m[XY(i,3)],node.m[XY(i,4)],node.m[XY(i,5)]);

}
Community
  • 1
  • 1
quirell
  • 245
  • 1
  • 18

1 Answers1

0

In your kernel you made a local copy of the Node that you are working on:

Node node = nodes[nodeIdx];

the remainder of the kernel proceeds to modify elements of node, your local copy.

But after all the modifications are done, you never copy the local copy back to the global copy, so the global copy remains unmodified.

To fix this, one possibility would be to add this line at the end of your kernel:

nodes[nodeIdx] = node;

As an aside, I note that your struct Node contains a pointer variable:

struct Node
{
    float m[MSIZE];
    float *x;
};

You should be aware of the fact that using an array of structs with embedded pointers may have some special complexities. You're not actually using that variable (x) yet, so I merely mention this as a comment. You may want to refer to the cuda tag info page for a canonical question on this concept ("Using arrays of pointers in CUDA").

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257