I am currently learning OpenCL and I have this kernel that works just fine when directly accessing the global array, but gives wrong results when using an intermediate value on the private memory, for example, aux on the code below.
__kernel void kernel_cte(__global float *U0,__global float *U1,__constant float *VP0, uint stride, uint nnoi, __constant float *g_W, uint k0, uint k1, float FATMDFX, float FATMDFY, float FATMDFZ) {
uint index = get_global_id(1)*nnoi + get_global_id(0) + k0 * stride;
uint k;
float aux;
aux = U0[index+1];
for(k=k0;k<k1;++k) {
if(VP0[index] > 0.0f){
U1[index] = 2.0f * U0[index] - U1[index]
+ FATMDFX * VP0[index] * VP0[index] * (
+ g_W[6] * (U0[index - 6] + U0[index + 6])
+ g_W[5] * (U0[index - 5] + U0[index + 5])
+ g_W[4] * (U0[index - 4] + U0[index + 4])
+ g_W[3] * (U0[index - 3] + U0[index + 3])
+ g_W[2] * (U0[index - 2] + U0[index + 2])
+ g_W[1] * (U0[index - 1] + aux)
+ g_W[0] * U0[index]
)
+ FATMDFY * VP0[index] * VP0[index] * (
+ g_W[6] * (U0[index - 6 * nnoi] + U0[index + 6 * nnoi])
+ g_W[5] * (U0[index - 5 * nnoi] + U0[index + 5 * nnoi])
+ g_W[4] * (U0[index - 4 * nnoi] + U0[index + 4 * nnoi])
+ g_W[3] * (U0[index - 3 * nnoi] + U0[index + 3 * nnoi])
+ g_W[2] * (U0[index - 2 * nnoi] + U0[index + 2 * nnoi])
+ g_W[1] * (U0[index - nnoi] + U0[index + nnoi])
+ g_W[0] * U0[index]
)
+ FATMDFZ * VP0[index] * VP0[index] * (
+ g_W[6] * (U0[index + 6 * stride] + U0[index - 6 * stride])
+ g_W[5] * (U0[index + 5 * stride] + U0[index - 5 * stride])
+ g_W[4] * (U0[index + 4 * stride] + U0[index - 4 * stride])
+ g_W[3] * (U0[index + 3 * stride] + U0[index - 3 * stride])
+ g_W[2] * (U0[index + 2 * stride] + U0[index - 2 * stride])
+ g_W[1] * (U0[index + stride] + U0[index - stride])
+ g_W[0] * U0[index]
);
} // end if
index += stride;
}
}
I would like to use vectors to perform these calculations but I can't understand why the correct value isn't copied to the private memory when I do aux = U0[index+1].