OpenCL gaussian convolution using one Kernel is slower

Question

I implemented 2D Gaussian Filter on OpenCL using two 1D gaussian filters(gaussian separability).

I implemented 2 version of my convolution: - the first one uses one kernel that applies 1D filter on rows and then it transposes the image (it takes about 20 ms)

the second one uses two kernel: one applies convolution and one transposes the image (it takes about 7ms - convolution and about 1ms - to transpose the image )

I evaluated the computation time of both implementation and I figure out that the implementation using two kernel is faster than one that uses one kernel (notice that the transpose kernel has to wait the convolution kernel ).

Could you help me to figure out why the implementation using one kernel is slower, even if the setup time of just one kernel should be faster than the two kernels setup time.

Find below the OpenCL source code of the both implementations:

One kernel Implementation

__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,__constant int4 *par, __constant float *filter,__local float4 *cache, __local float4 *temp,__write_only image2d_t dstBuffer)
{
    int width = par[0].x;
    int height = par[0].y;
    int widthG = par[0].z;
    int heightG = par[0].w;
    int gaussFilterWidth = par[1].x;
    int margin = 1 + (gaussFilterWidth>>2);
    int2 location;
    float4 focus,set;
    float *s = &set;
    int m,start;
    int widthGM = widthG + (margin<<1);

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;

    int i = get_global_id(0);
    int j = get_global_id(1);

    int iG = get_local_id(0);
    int jG = get_local_id(1);

    location.x = i;
    location.y = j;

    focus = read_imagef(srcBuffer, sampler, location);

    cache[(iG+margin) + jG*widthGM] = focus;

    if(iG == 0)
    {
        if(i == 0)
        {
            cache[0 + jG*widthGM].x = focus.x;
            cache[0 + jG*widthGM].y = focus.x;
            cache[0 + jG*widthGM].z = focus.x;
            cache[0 + jG*widthGM].w = focus.x;
            cache[1 + jG*widthGM].x = focus.x;
            cache[1 + jG*widthGM].y = focus.x;
            cache[1 + jG*widthGM].z = focus.x;
            cache[1 + jG*widthGM].w = focus.x;
        }
        else
        {
            location.x = i-margin;
            location.y = j;
            cache[jG*widthGM] =  read_imagef(srcBuffer, sampler, location);

            location.x = i-(margin-1);
            location.y = j;
            cache[1 + jG*widthGM] =  read_imagef(srcBuffer, sampler, location);
        }
    }
    if(iG == ((widthGM-1)-(margin<<1)))
    {
        if(i == (width-1))
        {
            cache[(widthGM-2) + jG*widthGM].x = focus.w;
            cache[(widthGM-2) + jG*widthGM].y = focus.w;
            cache[(widthGM-2) + jG*widthGM].z = focus.w;
            cache[(widthGM-2) + jG*widthGM].w = focus.w;
            cache[(widthGM-1) + jG*widthGM].x = focus.w;
            cache[(widthGM-1) + jG*widthGM].y = focus.w;
            cache[(widthGM-1) + jG*widthGM].z = focus.w;
            cache[(widthGM-1) + jG*widthGM].w = focus.w;
        }
        else
        {
            location.x = i+margin;
            location.y = j;
            cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);

            location.x = i+(margin-1);
            location.y = j;
            cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    float4  bar[10],barX[10],barY[10],barZ[10],barW[10];
    float4  *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW;
    float   *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW;
    float4  gauss[4];
    float   *gf = &gauss;
    float4  acc;


    gf[0] = filter[0];
    gf[1] = filter[1];
    gf[2] = filter[2];
    gf[3] = filter[3];
    gf[4] = filter[4];
    gf[5] = filter[5];
    gf[6] = filter[6];
    gf[7] = filter[7];
    gf[8] = filter[8];
    gf[9] = filter[9];
    gf[10]= filter[10];
    gf[11]= filter[11];
    gf[12]= filter[12];
    gf[13]= filter[13];
    gf[14]= filter[14];
    gf[15]= 0.0f;


    start = iG + jG*widthGM;


    fX[0]                                             = cache[start+0].y;
    fX[1] = fY[0]                                                     = cache[start+0].z;
    fX[2] = fY[1] = fZ[0]                                                 = cache[start+0].w;
    fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x;
    fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y;
    fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z;
    fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w;
    fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x;
    fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y;
    fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z;
    fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w;
    fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x;
    fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y;
    fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z;
    fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w;
    fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x;
    fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y;
    fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z;
    fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f;

    acc.x =  fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15];
    acc.y =  fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15];
    acc.z =  fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15];
    acc.w =  fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15];


    temp[iG + jG*widthG] = acc;

    barrier(CLK_LOCAL_MEM_FENCE);

    int I,S;

    I = j >> 2;
    S = j & 3;

    if(S == 0)
    {
        set.x = temp[iG + (jG+0)*widthG].x;
        set.y = temp[iG + (jG+1)*widthG].x;
        set.z = temp[iG + (jG+2)*widthG].x;
        set.w = temp[iG + (jG+3)*widthG].x;
    }
    if(S == 1)
    {
        set.x = temp[iG + (jG-1)*widthG].y;
        set.y = temp[iG + (jG+0)*widthG].y;
        set.z = temp[iG + (jG+1)*widthG].y;
        set.w = temp[iG + (jG+2)*widthG].y;
    }
    if(S == 2)
    {
        set.x = temp[iG + (jG-2)*widthG].z;
        set.y = temp[iG + (jG-1)*widthG].z;
        set.z = temp[iG + (jG+0)*widthG].z;
        set.w = temp[iG + (jG+1)*widthG].z;
    }
    if(S == 3)
    {
        set.x = temp[iG + (jG-3)*widthG].w;
        set.y = temp[iG + (jG-2)*widthG].w;
        set.z = temp[iG + (jG-1)*widthG].w;
        set.w = temp[iG + (jG+0)*widthG].w;
    }


    location.x = I;
    location.y = (i*4 + S);

    write_imagef(dstBuffer, location , set);

}

Two Kernels implementation

Convolution:

__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,
                                __constant int4 *par,
                                __constant float *filter,
                                __local float4 *cache,
                                __local float4 *temp,
                                __write_only image2d_t dstBuffer)
{
    int width = par[0].x;
    int height = par[0].y;
    int widthG = par[0].z;
    int heightG = par[0].w;
    int gaussFilterWidth = par[1].x;
    int margin = 1 + (gaussFilterWidth>>2);

    int2 location;

    float4 focus,set;
    float *s = &set;
    int m,start;
    int widthGM = widthG + (margin<<1);

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;

    int i = get_global_id(0);
    int j = get_global_id(1);

    int iG = get_local_id(0);
    int jG = get_local_id(1);



    location.x = i;
    location.y = j;

    focus = read_imagef(srcBuffer, sampler, location);


    cache[(iG+margin) + jG*widthGM] = focus;


    if(iG == 0)
    {
        if(i == 0)
        {


            cache[0 + jG*widthGM].x = focus.x;
            cache[0 + jG*widthGM].y = focus.x;
            cache[0 + jG*widthGM].z = focus.x;
            cache[0 + jG*widthGM].w = focus.x;
            cache[1 + jG*widthGM].x = focus.x;
            cache[1 + jG*widthGM].y = focus.x;
            cache[1 + jG*widthGM].z = focus.x;
            cache[1 + jG*widthGM].w = focus.x;
        }
        else
        {


            location.x = i-margin;
            location.y = j;
            cache[jG*widthGM] =  read_imagef(srcBuffer, sampler, location);

            location.x = i-(margin-1);
            location.y = j;
            cache[1 + jG*widthGM] =  read_imagef(srcBuffer, sampler, location);
        }
    }

    if(iG == ((widthGM-1)-(margin<<1)))
    {
        if(i == (width-1))
        {


            cache[(widthGM-2) + jG*widthGM].x = focus.w;
            cache[(widthGM-2) + jG*widthGM].y = focus.w;
            cache[(widthGM-2) + jG*widthGM].z = focus.w;
            cache[(widthGM-2) + jG*widthGM].w = focus.w;
            cache[(widthGM-1) + jG*widthGM].x = focus.w;
            cache[(widthGM-1) + jG*widthGM].y = focus.w;
            cache[(widthGM-1) + jG*widthGM].z = focus.w;
            cache[(widthGM-1) + jG*widthGM].w = focus.w;
        }
        else
        {


            location.x = i+margin;
            location.y = j;
            cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);

            location.x = i+(margin-1);
            location.y = j;
            cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
        }
    }


    barrier(CLK_LOCAL_MEM_FENCE);

    float4  bar[10],barX[10],barY[10],barZ[10],barW[10];
    float4  *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW;
    float   *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW;
    float4  gauss[4];
    float   *gf = &gauss;
    float4  acc;


    gf[0] = filter[0];
    gf[1] = filter[1];
    gf[2] = filter[2];
    gf[3] = filter[3];
    gf[4] = filter[4];
    gf[5] = filter[5];
    gf[6] = filter[6];
    gf[7] = filter[7];
    gf[8] = filter[8];
    gf[9] = filter[9];
    gf[10]= filter[10];
    gf[11]= filter[11];
    gf[12]= filter[12];
    gf[13]= filter[13];
    gf[14]= filter[14];
    gf[15]= 0.0f;


    start = iG + jG*widthGM;


    fX[0]                         = cache[start+0].y;
    fX[1] = fY[0]                 = cache[start+0].z;
    fX[2] = fY[1] = fZ[0]         = cache[start+0].w;
    fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x;
    fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y;
    fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z;
    fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w;
    fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x;
    fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y;
    fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z;
    fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w;
    fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x;
    fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y;
    fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z;
    fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w;
    fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x;
    fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y;
    fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z;
    fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f;


    acc.x =  fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15];
    acc.y =  fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15];
    acc.z =  fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15];
    acc.w =  fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15];

    location.x = i;
    location.y = j;

    write_imagef(dstBuffer, location , acc);

}

Transpose:

__kernel void TransponseKernel(__read_only image2d_t srcBuffer,
                               __constant int4 *par,
                               __local float4 *temp,
                               __write_only image2d_t dstBuffer)
{
    int width = par[0].x;
    int height = par[0].y;
    int widthG = par[0].z;
    int heightG = par[0].w;
    int gaussFilterWidth = par[1].x;
    int margin = 1 + (gaussFilterWidth>>2);

    int widthGM = widthG + (margin<<1);

    int2 location;
    float4 focus,set;

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;

    int i = get_global_id(0);
    int j = get_global_id(1);

    int iG = get_local_id(0);
    int jG = get_local_id(1);

    location.x = i;
    location.y = j;

    focus = read_imagef(srcBuffer, sampler, location);


    temp[iG + jG*widthG] = focus;

    barrier(CLK_LOCAL_MEM_FENCE);

    int I,S;


    I = j >> 2;
    S = j & 3;

    if(S == 0)
    {
        set.x = temp[iG + (jG+0)*widthG].x;
        set.y = temp[iG + (jG+1)*widthG].x;
        set.z = temp[iG + (jG+2)*widthG].x;
        set.w = temp[iG + (jG+3)*widthG].x;
    }
    if(S == 1)
    {
        set.x = temp[iG + (jG-1)*widthG].y;
        set.y = temp[iG + (jG+0)*widthG].y;
        set.z = temp[iG + (jG+1)*widthG].y;
        set.w = temp[iG + (jG+2)*widthG].y;
    }
    if(S == 2)
    {
        set.x = temp[iG + (jG-2)*widthG].z;
        set.y = temp[iG + (jG-1)*widthG].z;
        set.z = temp[iG + (jG+0)*widthG].z;
        set.w = temp[iG + (jG+1)*widthG].z;
    }
    if(S == 3)
    {
        set.x = temp[iG + (jG-3)*widthG].w;
        set.y = temp[iG + (jG-2)*widthG].w;
        set.z = temp[iG + (jG-1)*widthG].w;
        set.w = temp[iG + (jG+0)*widthG].w;
    }


    location.x = I;
    location.y = (i*4 + S);

    write_imagef(dstBuffer, location , set);

}

The performance difference is very huge, I hope you have verified the output? — Meluha, Apr 02 '14 at 12:26
The kernels are very complex, and it's hard to point out "THE" exact reason. The first possible reason that came to my mind was that it might run out of registers for the larger kernel, but that's only a first guess. — Marco13, Apr 02 '14 at 12:58
Just a comment, I think you don't need the ifs in the first kernel. The read image will take care of the borders returning a copy of the nearest pixel of the image. Even if you read out of boundaries. — DarkZeros, Apr 02 '14 at 14:30
for Sagar: we verified the output and it is the same in both cases. — user3488738, Apr 03 '14 at 09:09
for DarkZeros: in the input image in each float4 we stored 4 different gray level pixels, so we must use if condition in order to put the right pixels (avoiding automatic padding) in the border (left -> .x , right -> .w) — user3488738, Apr 03 '14 at 09:14
for Marco13: we tried to reduce the vectors size (10 to 4) but the result it is the same. We guess that probably the issue regards the second barrier in the larger kernel. We also tried to comment the assignments below the first barrier (starting from gf[0] = filter[0]; and ending to fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f;). In this case the results is wrong but the performance is equal to one of the two separated kernel implementation. It seems that the threads are waiting too much time to join (in the second barrier) — user3488738, Apr 03 '14 at 09:24

OpenCL gaussian convolution using one Kernel is slower

0 Answers0