#define dimG 16
#define dimB 64
#define eps 1e-6f   // assumed value; the real define was in the code I deleted from the post

// solve one step on the GPU
__global__ void SloveStepGPU(float* X, float* Y, int* iCons, int* jCons, int* dCons, float* wCons, int cnt, float c)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = id; i < cnt; i += dimG * dimB) {
        int I = iCons[i];
        int J = jCons[i];
        int d = dCons[i];
        float wc = wCons[i] * c;
        if (wc > 1.0f) wc = 1.0f;
        // read the current positions via atomicAdd(..., 0)
        float XI = atomicAdd(&(X[I]), 0);
        float XJ = atomicAdd(&(X[J]), 0);
        float YI = atomicAdd(&(Y[I]), 0);
        float YJ = atomicAdd(&(Y[J]), 0);
        float pqx = XI - XJ;
        float pqy = YI - YJ;
        float mag = sqrtf(pqx * pqx + pqy * pqy);
        float r = (d - mag) / 2.0f;
        float mx = wc * r * pqx / (mag + eps);
        float my = wc * r * pqy / (mag + eps);
        if (d == 1) {
            atomicAdd(&(X[I]), mx);
            atomicAdd(&(Y[I]), my);
        }
        atomicAdd(&(X[J]), -mx);
        atomicAdd(&(Y[J]), -my);
    }
}
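The host-side launch is part of the code I removed; it presumably uses dimG blocks of dimB threads, matching the grid-stride step of dimG*dimB inside the loop. Something like this (d_X, d_Y, d_iCons, d_jCons, d_dCons, d_wCons are the device pointers):

// Assumed launch configuration, not shown above
SloveStepGPU<<<dimG, dimB>>>(d_X, d_Y, d_iCons, d_jCons, d_dCons, d_wCons, cnt, c);
cudaDeviceSynchronize();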
In this code, I know that X and Y can have data races. My original assumption was that the reads of XI, XJ, YI, YJ might simply return stale values. However, I found that during the data race XI, XJ, YI, YJ can end up reading what look like random memory values, i.e. something like a memory access violation. Even if I add a lock around the reads and writes, I still get the same result. Only when I reduce dimB and dimG, so that there is almost no contention, do I get a correct result. Is there any solution?
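For illustration, here is a sketch of one workaround I could try (just a sketch, not the code I am currently using): a two-pass, Jacobi-style update that accumulates the displacements into separate scratch buffers DX and DY (assumed to be zeroed before each step) and applies them in a second kernel, so the positions are never read and written in the same pass.

// Sketch of a two-pass variant. DX/DY are hypothetical scratch arrays, one entry per
// vertex, zeroed before launch; eps is the same define as above.
__global__ void AccumulateStepGPU(const float* X, const float* Y,
                                  const int* iCons, const int* jCons, const int* dCons,
                                  const float* wCons, int cnt, float c,
                                  float* DX, float* DY)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = id; i < cnt; i += gridDim.x * blockDim.x) {
        int I = iCons[i];
        int J = jCons[i];
        int d = dCons[i];
        float wc = wCons[i] * c;
        if (wc > 1.0f) wc = 1.0f;
        float pqx = X[I] - X[J];                 // plain reads: X/Y are not written in this pass
        float pqy = Y[I] - Y[J];
        float mag = sqrtf(pqx * pqx + pqy * pqy);
        float r = (d - mag) / 2.0f;
        float mx = wc * r * pqx / (mag + eps);
        float my = wc * r * pqy / (mag + eps);
        if (d == 1) {
            atomicAdd(&DX[I], mx);               // races happen only on the accumulators,
            atomicAdd(&DY[I], my);               // which atomicAdd handles correctly
        }
        atomicAdd(&DX[J], -mx);
        atomicAdd(&DY[J], -my);
    }
}

// Second pass: apply the accumulated displacements (n = number of vertices).
__global__ void ApplyStepGPU(float* X, float* Y, const float* DX, const float* DY, int n)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = id; i < n; i += gridDim.x * blockDim.x) {
        X[i] += DX[i];
        Y[i] += DY[i];
    }
}

I am not sure whether changing the update scheme like this is acceptable for my solver, so I would still like to understand the original behavior.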
I compile for 64-bit under Windows with VS2015 and CUDA 9.1.
However, the same code runs under Linux without any problem.
There is also no problem when running under the Nsight CUDA debugger on Windows; the reason is probably that execution under the debugger is slow enough that the data race does not occur.
----- update: deleted the other code -----