-5

Can someone explain why my kernel doesn't work when my shared memory array of pointers, TMS, is accessed at some index other than the 0th index (happens in the last line)? If TMS[0] is used in the last line, everything works as expected. When I change TMS[0] to any other index, I get a CUDA unexpected error. Assume 64 threads on one block.

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include <cuda_runtime.h>
#include <curand_kernel.h>

__global__ void myKern(float *masterForces)
{
    int globalIdx = ...// set global thread id

    volatile __shared__ float uniques[64];

    {
        uniques[globalIDx] = 0;
    }

    __syncthreads();


    volatile __shared__ float *TMS[64]; 

    {
       TMS[globalIdx] = (&uniques[globalIdx]);
    }

    __syncthreads();

    masterForces[globalIdx] = *TMS[1];
}

Original context if you're curious: (You really don't need to look at this to address my problem)

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>


__global__ void myKern(const float *transMatrix, const int *pointerMatrix, float *masterForces, const double *rands, const int r_max)
{




int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + (threadIdx.x + (threadIdx.y * blockDim.x));

volatile __shared__ float uniques[51];

uniques[0] = transMatrix[0]; uniques[1] = transMatrix[1]; uniques[2] = transMatrix[2]; // 1
uniques[3] = transMatrix[3]; uniques[4] = transMatrix[4]; uniques[5] = transMatrix[12]; // 2
uniques[6] = transMatrix[14]; uniques[7] = transMatrix[15]; uniques[8] = transMatrix[24]; // 3
uniques[9] = transMatrix[26]; uniques[10] = transMatrix[27]; uniques[11] = transMatrix[28]; // 4
uniques[12] = transMatrix[40]; uniques[13] = transMatrix[50]; uniques[14] = transMatrix[60]; // 5
uniques[15] = transMatrix[62]; uniques[16] = transMatrix[146]; uniques[17] = transMatrix[156]; // 6
uniques[18] = transMatrix[158]; uniques[19] = transMatrix[168]; uniques[20] = transMatrix[170]; // 7
uniques[21] = transMatrix[172]; uniques[22] = transMatrix[184]; uniques[23] = transMatrix[290]; // 8
uniques[24] = transMatrix[300]; uniques[25] = transMatrix[302]; uniques[26] = transMatrix[312]; // 9
uniques[27] = transMatrix[314]; uniques[28] = transMatrix[316]; uniques[29] = transMatrix[328]; // 10
uniques[30] = transMatrix[1010]; uniques[31] = transMatrix[1020]; uniques[32] = transMatrix[1022]; // 11
uniques[33] = transMatrix[1032]; uniques[34] = transMatrix[1034]; uniques[35] = transMatrix[1036]; // 12
uniques[36] = transMatrix[1048]; uniques[37] = transMatrix[1154]; uniques[38] = transMatrix[1164]; // 13
uniques[39] = transMatrix[1166]; uniques[40] = transMatrix[1176]; uniques[41] = transMatrix[1178]; // 14
uniques[42] = transMatrix[1180]; uniques[43] = transMatrix[1192]; uniques[44] = transMatrix[2018]; // 15
uniques[45] = transMatrix[2028]; uniques[46] = transMatrix[2030]; uniques[47] = transMatrix[2040]; // 16
uniques[48] = transMatrix[2042]; uniques[49] = transMatrix[2044]; uniques[50] = transMatrix[2056]; // 17

__syncthreads();


volatile __shared__ float *TMS[2592]; 

  for (int t=0; t<2592; t++)    
  {
    for (int m=0; m< 51; m++){
       if (pointerMatrix[t] == m)
       {
        TMS[t] = (&uniques[m]);
       }
  }
__syncthreads();


int b0 = 0;
int c0 = 0;
int d0 = 0;
int e0 = 0;
int f0 = 0;
int g0 = 0;
int h0 = 0;
int i0 = 0;
int j0 = 0;
int k0 = 0;
int l0 = 0;
int m0 = 0;
int n0 = 0;
int o0 = 0;
int p0 = 0;
int q0 = 0;
int r0 = 0;
int s0 = 0;
int t0 = 0;
int u0 = 0;
int v0 = 0;
int w0 = 0;
int x0 = 0;
int y0 = 0;




int index = 0;
float r = 0.0;
float temp = 0;

int RUsnapshot = 0; 
int leftsnap = 0;

curandState s;
curand_init (rands[globalIdx] , 0, 0, &s);



for (int i =0; i < 160000; i++) //@@@depends on iterations @@@@@
{
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = b0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((0 * 6 + c0) * 6  + b0) * 2) * 6) ;

            b0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = b0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = c0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + d0) * 6  + c0) * 2) * 6) ;

            c0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = c0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = d0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + e0) * 6  + d0) * 2) * 6) ;

            d0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = d0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = e0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + f0) * 6  + e0) * 2) * 6) ;

            e0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = e0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = f0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + g0) * 6  + f0) * 2) * 6) ;

            f0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = f0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = g0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + h0) * 6  + g0) * 2) * 6) ;

            g0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = g0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = h0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + i0) * 6  + h0) * 2) * 6) ;

            h0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = h0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = i0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + j0) * 6  + i0) * 2) * 6) ;

            i0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = i0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = j0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + k0) * 6  + j0) * 2) * 6) ;

            j0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = j0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = k0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + l0) * 6  + k0) * 2) * 6) ;

            k0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = k0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = l0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + m0) * 6  + l0) * 2) * 6) ;

            l0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = l0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = m0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + n0) * 6  + m0) * 2) * 6) ;

            m0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = m0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = n0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + o0) * 6  + n0) * 2) * 6) ;

            n0 += ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = n0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = o0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + p0) * 6  + o0) * 2) * 6) ;

            o0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = o0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = p0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + q0) * 6  + p0) * 2) * 6) ;

            p0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = p0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = q0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + r0) * 6  + q0) * 2) * 6) ;

            q0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = q0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = r0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + s0) * 6  + r0) * 2) * 6) ;

            r0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = r0;
        ///////////////////////////////////////////////////////////    
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = s0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + t0) * 6  + s0) * 2) * 6) ;

            s0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = s0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = t0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + u0) * 6  + t0) * 2) * 6) ;

            t0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = t0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot =u0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + v0) * 6  + u0) * 2) * 6) ;

            u0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap =u0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = v0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + w0) * 6  + v0) * 2) * 6) ;

            v0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = v0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = w0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + x0) * 6  + w0) * 2) * 6) ;

            w0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = w0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = x0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + y0) * 6  + x0) * 2) * 6) ;

            x0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;

            leftsnap = x0;
        ///////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////
            r = curand_uniform(&s);

            RUsnapshot = y0;

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + 0) * 6  + y0) * 2) * 6) ;

            y0+= ( r < *TMS[index]) * (*TMS[index + 1]) +
                (! (r < *TMS[index])) * ( r < *TMS[index + 2]) * (*TMS[index + 3]) +
                (! ( r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ;


        ///////////////////////////////////////////////////////////   






            temp = (b0 ==4) + (b0 ==5) + (c0 ==4) + (c0 ==5) + (d0 ==4) + (d0 ==5) + (e0 ==4) + (e0 ==5) + (f0 ==4) + (f0 ==5) + 
                   (g0 ==4) + (g0 ==5) + (h0 ==4) + (h0 ==5) + (i0 ==4) + (i0 ==5) + (j0 ==4) + (j0 ==5) + (k0 ==4) + (k0 ==5) + 
                   (l0 ==4) + (l0 ==5) + (m0 ==4) + (m0 ==5) + (n0 ==4) + (n0 ==5) + (o0 ==4) + (o0 ==5) + (p0 ==4) + (p0 ==5) + 
                   (q0 ==4) + (q0 ==5) + (r0 ==4) + (r0 ==5) + (s0 ==4) + (s0 ==5) + (t0 ==4) + (t0 ==5) + (u0 ==4) + (u0 ==5) + 
                   (v0 ==4) + (v0 ==5) + (w0 ==4) + (w0 ==5) + (x0 ==4) + (x0 ==5) + (y0 ==4) + (y0 ==5);


        masterForces[globalIdx + (r_max * i)] = *TMS[1]; 
        temp = 0.0;

}



}

}
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
Jordan
  • 305
  • 3
  • 13
  • 1
    Post a complete code that demonstrates the error please. SO expects this. – Robert Crovella Jul 07 '14 at 17:54
  • Using a *global* index as an offset in a *shared memory* array usually is not a good sign... – user703016 Jul 07 '14 at 17:54
  • Okay, but I warn you, it's pretty complicated. The code above should capture the problem. – Jordan Jul 07 '14 at 17:55
  • 4
    The objective is to provide the *shortest* code that demonstrates the problem, just as I have provided a short code that demonstrates the solution. Please note this for future postings, and don't just dump your code into the question. Yes, it requires effort on your part, but SO expects this, and it will help others help you. – Robert Crovella Jul 07 '14 at 18:13

2 Answers2

4

Assuming your global thread index (globalIdx) is in fact a global thread index, such as something like:

int globalIdx = threadIdx.x+blockDim.x*blockIdx.x; // e.g. for 1D grid/threadblock

then you can't index into a 64 entry shared memory array when your global thread index exceeds 63:

    uniques[globalIDx] = 0;

or

   TMS[globalIdx] = (&uniques[globalIdx]);

If instead you use just the thread index:

    uniques[threadIdx.x] = 0;

and similarly elsewhere in your code, you should be OK at least for indexing into the shared memory arrays, assuming you have 64 threads per block.

Here's a fully worked example:

$ cat t463.cu
#include <stdio.h>
#define DSIZE 128
#define nTPB 64
__global__ void myKern(float *masterForces)
{
    int globalIdx = threadIdx.x+blockDim.x*blockIdx.x;

    volatile __shared__ float uniques[nTPB];

    {
        uniques[threadIdx.x] = 0;
    }

    __syncthreads();


    volatile __shared__ float *TMS[nTPB];

    {
       TMS[threadIdx.x] = &(uniques[threadIdx.x]);
    }

    __syncthreads();

    masterForces[globalIdx] = *TMS[1];
}

int main(){

  float *d_data, *h_data;
  h_data=(float *)malloc(DSIZE*sizeof(float));
  cudaMalloc(&d_data, DSIZE*sizeof(float));
  for (int i = 0; i< DSIZE; i++)
    h_data[i] = 1.0f;
  cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  myKern<<<DSIZE/nTPB, nTPB>>>(d_data);
  cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i<DSIZE; i++)
    if (h_data[i] != 0.0f) {printf("mismatch at %d, was: %f should be: %f\n", i, h_data[i], 0.0f); return 1;}
  printf("Success\n");
  return 0;
}
$ nvcc -arch=sm_20 -o t463 t463.cu
$ cuda-memcheck ./t463
========= CUDA-MEMCHECK
Success
========= ERROR SUMMARY: 0 errors
$

In the future, please provide a fully worked example demonstrating the problem, for questions like this. SO expects this, and it's a valid close reason for a question if you don't provide it.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Sorry for late reply, was walking my dog. Wicked reply, thanks Robert. There's just one thing I do not understand. Why doesn't globalIdx work? If I have 64 threads, globalIdx should never exceed 63. Can you please help me understand. – Jordan Jul 07 '14 at 19:00
  • `globalIdx` does work if you limit your *entire kernel* to 64 threads. To prove that, just change the `#define DSIZE` in my code to `64`, and then you can use `globalIdx` everywhere that I have `threadIdx.x`. (the net effect of this would be only launching one block) But if your entire kernel launch (i.e. grid) involves more than 64 threads, you cannot use the global index to index into a shared memory array. As soon as you go to the second block, the global indices start at 64 and go to 127 (in this example). – Robert Crovella Jul 07 '14 at 20:05
0

Your example code should not give any errors as was proved by Robert Crovella. I looked at your original code and found a misaligned parenthesis cause by that first "for" loop. Good luck...

Jordan
  • 305
  • 3
  • 13