I wrote OpenCL code to solve the advection equation using two different schemes: the upstream-bias scheme and the leapfrog scheme.
The code runs fine, but I would like to know whether I could use OpenCL local memory to optimize it. From my understanding, local memory is useful when there is data that can be shared within a work-group. But in my OpenCL kernels, other than the indices, I do not have (or at least could not think of) anything that should or could be shared.
Kernel for upstream bias scheme
/*
 * One explicit step of the upstream-bias advection scheme on an
 * nx * ny * nz grid stored x-fastest (index = x + y*nx + z*nx*ny).
 *
 * Each work-item handles one cell, taken from its 3-D global id folded
 * into the grid with a modulo, so the domain is periodic in every axis.
 *
 *   nx, ny, nz           grid extents
 *   in_p_tf              output: field at the new time level
 *   in_p_tn              input:  field at the current time level
 *   u_vel, v_vel, w_vel  advection velocity components
 *   C                    common scale factor applied to every difference
 *                        (presumably dt/dx -- confirm against the host code)
 *
 * The stencil always differences against the "minus" neighbour; that is
 * the upwind side only for non-negative velocity components.
 */
kernel void upstream3d(
    const int nx,
    const int ny,
    const int nz,
    global float *in_p_tf,
    global float *in_p_tn,
    const float u_vel,
    const float v_vel,
    const float w_vel,
    const float C
)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);
    const int gz = get_global_id(2);

    /* This cell's coordinates, wrapped into the grid. */
    const int x = gx % nx;
    const int y = gy % ny;
    const int z = gz % nz;

    /* Periodic "minus one" neighbour along each axis. */
    const int x_prev = (gx + nx - 1) % nx;
    const int y_prev = (gy + ny - 1) % ny;
    const int z_prev = (gz + nz - 1) % nz;

    /* Offsets of the current row and layer, shared by several indices. */
    const int row_off   = y * nx;
    const int layer_off = z * nx * ny;

    const int cell   = x + row_off + layer_off;
    const int from_x = x_prev + row_off + layer_off;
    const int from_y = x + y_prev * nx + layer_off;
    const int from_z = x + row_off + z_prev * nx * ny;

    /* The centre value feeds all three backward differences. */
    const float here = in_p_tn[cell];

    in_p_tf[cell] = here
        - u_vel * C * (here - in_p_tn[from_x])
        - v_vel * C * (here - in_p_tn[from_y])
        - w_vel * C * (here - in_p_tn[from_z]);
}
Kernel for Leapfrog scheme
/*
 * One leapfrog advection step on a periodic nx * ny * nz grid stored
 * x-fastest (index = x + y*nx + z*nx*ny), followed by an in-place time
 * filter of the current level.
 *
 * Each work-item handles the cell given by its 3-D global id.
 *
 *   nx, ny, nz           grid extents
 *   in_p_tf              output: field at the new time level
 *   in_p_tn              in/out: field at the current time level; read for
 *                        the centred differences, then overwritten with
 *                        its filtered value
 *   in_p_tp              input:  field at the previous time level
 *   u_vel, v_vel, w_vel  advection velocity components
 *   C                    common scale factor applied to every difference
 *                        (presumably dt/(2*dx) or similar -- confirm
 *                        against the host code)
 *
 * Work-items whose global id lies outside the grid (e.g. when the global
 * size is rounded up to a multiple of the work-group size) do nothing.
 */
kernel void leapfrog3d(
    const int nx,
    const int ny,
    const int nz,
    global float *in_p_tf,
    global float *in_p_tn,
    global float *in_p_tp,
    const float u_vel,
    const float v_vel,
    const float w_vel,
    const float C
)
{
    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int k = get_global_id(2);

    /*
     * Padded work-items must not touch memory: previously they wrapped for
     * the advection store but used the unwrapped index for the filter,
     * hitting a different cell or running past the end of the buffer.
     */
    if (i >= nx || j >= ny || k >= nz) {
        return;
    }

    /* Periodic neighbours on both sides of every axis. */
    const int i_next = (i + 1) % nx;
    const int j_next = (j + 1) % ny;
    const int k_next = (k + 1) % nz;
    const int i_prev = (i + nx - 1) % nx;
    const int j_prev = (j + ny - 1) % ny;
    const int k_prev = (k + nz - 1) % nz;

    const int row_off   = j * nx;
    const int layer_off = k * nx * ny;

    const int cell   = i + row_off + layer_off;
    const int x_plus  = i_next + row_off + layer_off;
    const int x_minus = i_prev + row_off + layer_off;
    const int y_plus  = i + j_next * nx + layer_off;
    const int y_minus = i + j_prev * nx + layer_off;
    const int z_plus  = i + row_off + k_next * nx * ny;
    const int z_minus = i + row_off + k_prev * nx * ny;

    /* Keep this cell's values in registers instead of re-reading globals. */
    const float prev = in_p_tp[cell];
    const float curr = in_p_tn[cell];

    /* Leapfrog: new level = previous level - centred differences of current. */
    const float next = prev
        - u_vel * C * (in_p_tn[x_plus] - in_p_tn[x_minus])
        - v_vel * C * (in_p_tn[y_plus] - in_p_tn[y_minus])
        - w_vel * C * (in_p_tn[z_plus] - in_p_tn[z_minus]);

    in_p_tf[cell] = next;

    /*
     * Time filter: nudge the current level toward the mean of its two
     * temporal neighbours. Float literals keep the arithmetic in single
     * precision, so the kernel does not depend on fp64 support.
     *
     * NOTE(review): this store races with the neighbour loads above.
     * Other work-items read in_p_tn at this cell as part of their stencil
     * during the same launch, and nothing orders those reads against this
     * write, so they may see either the unfiltered or the filtered value.
     * A race-free version needs the filter in a separate kernel launch or
     * a separate output buffer, which would change the host-side contract.
     */
    in_p_tn[cell] = curr + 0.80f * (next - 2.0f * curr + prev);
}
Is this all I can get from OpenCL, or am I missing something?
Thanks.