I implemented 2D Gaussian Filter on OpenCL using two 1D gaussian filters(gaussian separability).
I implemented 2 version of my convolution: - the first one uses one kernel that applies 1D filter on rows and then it transposes the image (it takes about 20 ms)
- the second one uses two kernel: one applies convolution and one transposes the image (it takes about 7ms - convolution and about 1ms - to transpose the image )
I evaluated the computation time of both implementation and I figure out that the implementation using two kernel is faster than one that uses one kernel (notice that the transpose kernel has to wait the convolution kernel ).
Could you help me to figure out why the implementation using one kernel is slower, even if the setup time of just one kernel should be faster than the two kernels setup time.
Find below the OpenCL source code of the both implementations:
One kernel Implementation
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,__constant int4 *par, __constant float *filter,__local float4 *cache, __local float4 *temp,__write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int2 location; float4 focus,set; float *s = &set; int m,start; int widthGM = widthG + (margin<<1); const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); cache[(iG+margin) + jG*widthGM] = focus; if(iG == 0) { if(i == 0) { cache[0 + jG*widthGM].x = focus.x; cache[0 + jG*widthGM].y = focus.x; cache[0 + jG*widthGM].z = focus.x; cache[0 + jG*widthGM].w = focus.x; cache[1 + jG*widthGM].x = focus.x; cache[1 + jG*widthGM].y = focus.x; cache[1 + jG*widthGM].z = focus.x; cache[1 + jG*widthGM].w = focus.x; } else { location.x = i-margin; location.y = j; cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i-(margin-1); location.y = j; cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } if(iG == ((widthGM-1)-(margin<<1))) { if(i == (width-1)) { cache[(widthGM-2) + jG*widthGM].x = focus.w; cache[(widthGM-2) + jG*widthGM].y = focus.w; cache[(widthGM-2) + jG*widthGM].z = focus.w; cache[(widthGM-2) + jG*widthGM].w = focus.w; cache[(widthGM-1) + jG*widthGM].x = focus.w; cache[(widthGM-1) + jG*widthGM].y = focus.w; cache[(widthGM-1) + jG*widthGM].z = focus.w; cache[(widthGM-1) + jG*widthGM].w = focus.w; } else { location.x = i+margin; location.y = j; cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i+(margin-1); location.y = j; cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } barrier(CLK_LOCAL_MEM_FENCE); float4 bar[10],barX[10],barY[10],barZ[10],barW[10]; float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW; float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW; float4 gauss[4]; float *gf = &gauss; float4 acc; gf[0] = filter[0]; gf[1] = filter[1]; gf[2] = filter[2]; gf[3] = filter[3]; gf[4] = filter[4]; gf[5] = filter[5]; gf[6] = filter[6]; gf[7] = filter[7]; gf[8] = filter[8]; gf[9] = filter[9]; gf[10]= filter[10]; gf[11]= filter[11]; gf[12]= filter[12]; gf[13]= filter[13]; gf[14]= filter[14]; gf[15]= 0.0f; start = iG + jG*widthGM; fX[0] = cache[start+0].y; fX[1] = fY[0] = cache[start+0].z; fX[2] = fY[1] = fZ[0] = cache[start+0].w; fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x; fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y; fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z; fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w; fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x; fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y; fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z; fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w; fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x; fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y; fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z; fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w; fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x; fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y; fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z; fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f; acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15]; acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15]; acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15]; acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15]; temp[iG + jG*widthG] = acc; barrier(CLK_LOCAL_MEM_FENCE); int I,S; I = j >> 2; S = j & 3; if(S == 0) { set.x = temp[iG + (jG+0)*widthG].x; set.y = temp[iG + (jG+1)*widthG].x; set.z = temp[iG + (jG+2)*widthG].x; set.w = temp[iG + (jG+3)*widthG].x; } if(S == 1) { set.x = temp[iG + (jG-1)*widthG].y; set.y = temp[iG + (jG+0)*widthG].y; set.z = temp[iG + (jG+1)*widthG].y; set.w = temp[iG + (jG+2)*widthG].y; } if(S == 2) { set.x = temp[iG + (jG-2)*widthG].z; set.y = temp[iG + (jG-1)*widthG].z; set.z = temp[iG + (jG+0)*widthG].z; set.w = temp[iG + (jG+1)*widthG].z; } if(S == 3) { set.x = temp[iG + (jG-3)*widthG].w; set.y = temp[iG + (jG-2)*widthG].w; set.z = temp[iG + (jG-1)*widthG].w; set.w = temp[iG + (jG+0)*widthG].w; } location.x = I; location.y = (i*4 + S); write_imagef(dstBuffer, location , set); }
Two Kernels implementation
Convolution:
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer, __constant int4 *par, __constant float *filter, __local float4 *cache, __local float4 *temp, __write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int2 location; float4 focus,set; float *s = &set; int m,start; int widthGM = widthG + (margin<<1); const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); cache[(iG+margin) + jG*widthGM] = focus; if(iG == 0) { if(i == 0) { cache[0 + jG*widthGM].x = focus.x; cache[0 + jG*widthGM].y = focus.x; cache[0 + jG*widthGM].z = focus.x; cache[0 + jG*widthGM].w = focus.x; cache[1 + jG*widthGM].x = focus.x; cache[1 + jG*widthGM].y = focus.x; cache[1 + jG*widthGM].z = focus.x; cache[1 + jG*widthGM].w = focus.x; } else { location.x = i-margin; location.y = j; cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i-(margin-1); location.y = j; cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } if(iG == ((widthGM-1)-(margin<<1))) { if(i == (width-1)) { cache[(widthGM-2) + jG*widthGM].x = focus.w; cache[(widthGM-2) + jG*widthGM].y = focus.w; cache[(widthGM-2) + jG*widthGM].z = focus.w; cache[(widthGM-2) + jG*widthGM].w = focus.w; cache[(widthGM-1) + jG*widthGM].x = focus.w; cache[(widthGM-1) + jG*widthGM].y = focus.w; cache[(widthGM-1) + jG*widthGM].z = focus.w; cache[(widthGM-1) + jG*widthGM].w = focus.w; } else { location.x = i+margin; location.y = j; cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); location.x = i+(margin-1); location.y = j; cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location); } } barrier(CLK_LOCAL_MEM_FENCE); float4 bar[10],barX[10],barY[10],barZ[10],barW[10]; float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW; float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW; float4 gauss[4]; float *gf = &gauss; float4 acc; gf[0] = filter[0]; gf[1] = filter[1]; gf[2] = filter[2]; gf[3] = filter[3]; gf[4] = filter[4]; gf[5] = filter[5]; gf[6] = filter[6]; gf[7] = filter[7]; gf[8] = filter[8]; gf[9] = filter[9]; gf[10]= filter[10]; gf[11]= filter[11]; gf[12]= filter[12]; gf[13]= filter[13]; gf[14]= filter[14]; gf[15]= 0.0f; start = iG + jG*widthGM; fX[0] = cache[start+0].y; fX[1] = fY[0] = cache[start+0].z; fX[2] = fY[1] = fZ[0] = cache[start+0].w; fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x; fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y; fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z; fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w; fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x; fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y; fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z; fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w; fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x; fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y; fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z; fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w; fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x; fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y; fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z; fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f; acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15]; acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15]; acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15]; acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15]; location.x = i; location.y = j; write_imagef(dstBuffer, location , acc); }
Transpose:
__kernel void TransponseKernel(__read_only image2d_t srcBuffer, __constant int4 *par, __local float4 *temp, __write_only image2d_t dstBuffer) { int width = par[0].x; int height = par[0].y; int widthG = par[0].z; int heightG = par[0].w; int gaussFilterWidth = par[1].x; int margin = 1 + (gaussFilterWidth>>2); int widthGM = widthG + (margin<<1); int2 location; float4 focus,set; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST; int i = get_global_id(0); int j = get_global_id(1); int iG = get_local_id(0); int jG = get_local_id(1); location.x = i; location.y = j; focus = read_imagef(srcBuffer, sampler, location); temp[iG + jG*widthG] = focus; barrier(CLK_LOCAL_MEM_FENCE); int I,S; I = j >> 2; S = j & 3; if(S == 0) { set.x = temp[iG + (jG+0)*widthG].x; set.y = temp[iG + (jG+1)*widthG].x; set.z = temp[iG + (jG+2)*widthG].x; set.w = temp[iG + (jG+3)*widthG].x; } if(S == 1) { set.x = temp[iG + (jG-1)*widthG].y; set.y = temp[iG + (jG+0)*widthG].y; set.z = temp[iG + (jG+1)*widthG].y; set.w = temp[iG + (jG+2)*widthG].y; } if(S == 2) { set.x = temp[iG + (jG-2)*widthG].z; set.y = temp[iG + (jG-1)*widthG].z; set.z = temp[iG + (jG+0)*widthG].z; set.w = temp[iG + (jG+1)*widthG].z; } if(S == 3) { set.x = temp[iG + (jG-3)*widthG].w; set.y = temp[iG + (jG-2)*widthG].w; set.z = temp[iG + (jG-1)*widthG].w; set.w = temp[iG + (jG+0)*widthG].w; } location.x = I; location.y = (i*4 + S); write_imagef(dstBuffer, location , set); }