binDCT algorithm for 8x8 matrix

Question

I've been searching for implementations of a fast DCT. I found the Loeffler algorithm and implemented it in C++ and in ARM assembly with NEON. Moving on, I found the binDCT, which avoids floating-point calculations. My reference paper/schema is this one:

That said, I've tried to implement it in C++ with the following code, just as a test:

void my_binDCT(int in[8][8], int data[8][8],const int xpos, const int ypos)
{
    int i;
    int row[8][8];

    int x0, x1, x2, x3, x4, x5, x6, x7;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;

    // transform rows 
    for (i = 0; i < 8; i++) {
        x0 = in[xpos + 0][ypos + i];
        x1 = in[xpos + 1][ypos + i];
        x2 = in[xpos + 2][ypos + i];
        x3 = in[xpos + 3][ypos + i];
        x4 = in[xpos + 4][ypos + i];
        x5 = in[xpos + 5][ypos + i];
        x6 = in[xpos + 6][ypos + i];
        x7 = in[xpos + 7][ypos + i];

        //stage 1
        tmp0 = x0 + x7;
        tmp7 = x0 - x7;
        tmp1 = x1 + x6; 
        tmp6 = x1 - x6;
        tmp2 = x2 + x5;
        tmp5 = x2 - x5;
        tmp3 = x3 + x4;
        tmp4 = x3 - x4;

        //stage 2
        tmp16 = ((tmp5*3)>>3) + tmp6;
        tmp15 = ((tmp16*5)>>3) - tmp5;

        //stage 3
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;

        tmp14 = tmp4 + tmp15;
        tmp15 = tmp4 - tmp15;

        auto z = tmp16;
        tmp16 = tmp7 - tmp16;
        tmp17 = z + tmp7;

        //stage 4
        tmp14 = (tmp17 >> 3) - tmp14;

        tmp10 = tmp10 + tmp11;
        tmp11 = (tmp10 >> 1) - tmp11;

        tmp12 = ((tmp13*3)>>3) - tmp12;
        tmp13 = ((tmp12*3)>>3) + tmp13;

        tmp15 = ((tmp16*7)>>3) + tmp15;
        tmp16 = (tmp15>>1) - tmp16;


        //stage 5
        row[i][0] = tmp10;
        row[i][4] = tmp11;
        row[i][6] = tmp12;
        row[i][2] = tmp13;
        row[i][7] = tmp14;
        row[i][5] = tmp15;
        row[i][3] = tmp16;
        row[i][1] = tmp17;
    }

    //rotate columns
    /* transform columns */
    for (i = 0; i < 8; i++) {

        x0 = row[0][i];
        x1 = row[1][i];
        x2 = row[2][i];
        x3 = row[3][i];
        x4 = row[4][i];
        x5 = row[5][i];
        x6 = row[6][i];
        x7 = row[7][i];

        //stage 1
        tmp0 = x0 + x7;
        tmp7 = x0 - x7;
        tmp1 = x1 + x6; 
        tmp6 = x1 - x6;
        tmp2 = x2 + x5;
        tmp5 = x2 - x5;
        tmp3 = x3 + x4;
        tmp4 = x3 - x4;

        //stage 2
        tmp16 = ((tmp5*3)>>3) + tmp6;
        tmp15 = ((tmp16*5)>>3) - tmp5;

        //stage 3
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;

        tmp14 = tmp4 + tmp15;
        tmp15 = tmp4 - tmp15;

        auto z = tmp16;
        tmp16 = tmp7 - tmp16;
        tmp17 = z + tmp7;

        //stage 4
        tmp14 = (tmp17 >> 3) - tmp14;

        tmp10 = tmp10 + tmp11;
        tmp11 = (tmp10 >> 1) - tmp11;

        tmp12 = ((tmp13*3)>>3) - tmp12;
        tmp13 = ((tmp12*3)>>3) + tmp13;

        tmp15 = ((tmp16*7)>>3) + tmp15;
        tmp16 = (tmp15>>1) - tmp16;

        //stage 5
        data[0][i] = tmp10 >> 3;
        data[4][i] = tmp11 >> 3;
        data[6][i] = tmp12 >> 3;
        data[2][i] = tmp13 >> 3;
        data[7][i] = tmp14 >> 3;
        data[5][i] = tmp15 >> 3;
        data[3][i] = tmp16 >> 3;
        data[1][i] = tmp17 >> 3;
    }
}

I've coded the first DCT pass by rows and the second one by columns, and I assumed the results should be normalized by dividing by 8 (as per the DCT formula with N=8).

I've tested on a 8x8 matrix:

int matrix_a[8][8] = {
                        12, 16, 19, 12, 12, 27, 51, 47,

                        16, 24, 12, 19, 12, 20, 39, 51,

                        24, 27, 8,  39, 35, 34, 24, 44,

                        40, 17, 28, 32, 24, 27, 8,  32,

                        34, 20, 28, 20, 12, 8,  19, 34,

                        19, 39, 12, 27, 27, 12, 8,  34,

                        8,  28, -5, 39, 34, 16, 12, 19,

                        20, 27, 8,  27, 24, 19, 19, 8,
};

And I got this outcome:

MYBINDCT-2: 

186 13 -3 4 -2 4 6 0 
-13 -20 -10 1 2 -2 1 -4 
1 19 -10 -3 7 -12 -2 -4 
5 2 -4 -3 -1 -4 -2 -1 
11 -5 -7 1 -3 4 -1 0 
-13 8 -3 0 10 -4 -6 3 
-11 6 -11 1 6 0 -1 -4 
-13 4 -1 -3 5 -5 -1 0

that is quite far from the (rounded) real dct:

186 20 -11 -9 -4 3 8 -1 
-18 -35 -24 -5 9 -3 0 -8 
14 26 -2 14 7 -19 -3 -3 
-9 -10 5 -15 1 8 3 1 
23 -11 -19 -9 -11 8 -2 1 
-10 10 3 -3 17 -4 -8 4 
-14 13 -21 -4 18 0 -1 -7 
-19 7 -1 8 15 -7 -3 0

I've applied the algorithm, done a lot of tests, but I still don't understand where I made mistakes.

Can anybody with more experience than me explain the mistakes I've made? The strange thing is that I've implemented Loeffler, as I wrote, and it works very well. And the procedure, apart from the coefficients and the floating-point numbers, is quite similar (butterfly scheme, floating-point scale factors, normalization). I'm stuck with it. Thanks to everyone who can suggest an answer.

EDIT: A brief call is:

int main(int argc, char **argv)
{
    int MYBINDCT[8][8];
    my_binDCT(matrix_a, MYBINDCT, 0, 0);


    cout << "\nMYBINDCT: \n";
    for (int i = 0; i < 8; i++)
    {
        cout << '\n;
        for (int j = 0; j < 8; j++)
        {
            cout << MYBINDCT[i][j] << " ";
        }
    }
    return 0;
}

*I've tested on a 8x8 matrix* I do not see any code with `my_binDCT` calls. — 273K, Mar 28 '18 at 14:49
@S.M.: do you mean a call to the function as the one I have edited? — Chris, Mar 28 '18 at 15:07

score 1 · Answer 1 · answered Mar 28 '18 at 15:54

A calculation scheme that doesn't have multipliers (or has such crude ones as 3 or 5) cannot be very precise; I think your result is actually OK.

If your paper is any good, it should specify the expected precision of the results. Otherwise, 42 is a pretty universal answer to the 8x8 DCT problem, with an unspecified precision.

When doing approximations to DCT, it's pretty common to replace the definition of the DCT by something that is easier to implement. If you use DCT for image compression, then changing the definition of DCT to any transform will work, as long as you also change the IDCT (inverse transform) accordingly. For example, H.264 (the video coding standard) does this.

Dear @anatolyg, thanks for your reply. I think that an approximation is acceptable, but in this case the result is far from the DCT. The paper is from the study by Mr. Tran published in IEEE SIGNAL PROCESSING LETTERS, VOL. 7, NO. 6, JUNE 2000, so I assumed it is a good paper. I'm sure I'm making mistakes. — Chris, Mar 28 '18 at 17:35

Vladimir Poslavskiy · Answer 2 · 2022-08-25T13:55:01.343

Я думаю, вы неправильно интерпретируете «-» на схеме. Там, где стоит знак «-», нужно изменить знак соответствующего операнда, а потом сложить: -A+B или A+(-B) => B-A или A-B.

/* Chris */
void my_binDCT(int x[8])
{
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;

        //stage 1
        tmp0 = x[0] + x[7];
        tmp7 = x[0] - x[7];
        tmp1 = x[1] + x[6]; 
        tmp6 = x[1] - x[6];
        tmp2 = x[2] + x[5];
        tmp5 = x[2] - x[5];
        tmp3 = x[3] + x[4];
        tmp4 = x[3] - x[4];

        //stage 2
        tmp16 = ((tmp5*3)>>3) + tmp6;
        tmp15 = ((tmp16*5)>>3) - tmp5;

        //stage 3
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;

        tmp14 = tmp4 + tmp15;
        tmp15 = tmp4 - tmp15;

        int z = tmp16;
        tmp16 = tmp7 - tmp16;
        tmp17 = (z + tmp7);

        //stage 4
        tmp14 = tmp14 - (tmp17 >> 3); //fix A+-B (tmp17 >> 3) - tmp14

        tmp10 = tmp10 + tmp11;
        tmp11 = (tmp10 >> 1) - tmp11;

        tmp12 = tmp12 - ((tmp13*3)>>3); //fix A+-B  ((tmp13*3)>>3) - tmp12;
        tmp13 = ((tmp12*3)>>3) + tmp13;

        tmp15 = (((tmp16*7)>>3) + tmp15); 
        tmp16 = tmp16 - (tmp15>>1); //fix A+-B (tmp15>>1) - tmp16


        //stage 5
        x[0] = tmp10;
        x[4] = tmp11;
        x[6] = tmp12;
        x[2] = tmp13;
        x[7] = tmp14;
        x[5] = tmp15;
        x[3] = tmp16;
        x[1] = tmp17;
}


  186    28   -14   -10    -4     3     4     0
  -27   -66   -43    -9    13    -3     0    -3
   18    47    -4    22    10   -19    -2    -1
   -9   -15     9   -20     1     7     2     0
   23   -16   -24   -10   -11     6    -1     0
   -8    11     3    -3    13    -3    -3     1
   -8    10   -15    -2     9     0    -1    -1
   -5     2    -1     3     4    -2    -1     0

-----------
  186    13    -7    -5    -2     4    -7     0
  -13   -20   -11    -2     2    -2    -2     3
    9    14    -1     4     2   -12     1     0
   -6    -4     2    -3     0     3    -2    -1
   11    -5    -6    -2    -3     4     0    -1
  -12     8     1    -2    10    -4     6    -3
   11    -7    10     2    -7    -1    -1    -4
   12    -5     0    -3    -5     5    -1    -1

  row_fdct  my_binDCT
---------- ----------
  72796704   72545773 (rows per second)

Посмотрите на intDCT (row_fdct). На x86 нет никакого прироста производительности! Использовать binDCT имеет смысл только на оборудовании, которое не умеет умножать или которое экономит энергию.

#define FIX_0_382683433 98
#define FIX_0_541196100 139 
#define FIX_0_707106781 181 
#define FIX_1_306562965 334

void row_fdct(int dataptr[]){
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5, z11, z13;

  /* Pass 1: process rows. */

    tmp0 = dataptr[0] + dataptr[7];
    tmp7 = dataptr[0] - dataptr[7];
    tmp1 = dataptr[1] + dataptr[6];
    tmp6 = dataptr[1] - dataptr[6];
    tmp2 = dataptr[2] + dataptr[5];
    tmp5 = dataptr[2] - dataptr[5];
    tmp3 = dataptr[3] + dataptr[4];
    tmp4 = dataptr[3] - dataptr[4];

    /* Even part */

    tmp10 = tmp0 + tmp3;        /* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    dataptr[0] = tmp10 + tmp11; /* phase 3 */
    dataptr[4] = tmp10 - tmp11;

    z1 = (tmp12 + tmp13) * FIX_0_707106781 >> 8; /* c4 */
    dataptr[2] = tmp13 + z1;    /* phase 5 */
    dataptr[6] = tmp13 - z1;

    /* Odd part */

    tmp10 = tmp4 + tmp5;        /* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
    z5 = (tmp10 - tmp12) * FIX_0_382683433 >> 8; /* c6 */
    z2 = (tmp10 * FIX_0_541196100 >> 8) + z5;    /* c2-c6 */
    z4 = (tmp12 * FIX_1_306562965 >> 8) + z5;    /* c2+c6 */
    z3 = tmp11 * FIX_0_707106781 >> 8;         /* c4 */

    z11 = tmp7 + z3;            /* phase 5 */
    z13 = tmp7 - z3;

    dataptr[5] = z13 + z2;      /* phase 6 */
    dataptr[3] = z13 - z2;
    dataptr[1] = z11 + z4;
    dataptr[7] = z11 - z4;

}

Я погуглил по поводу binDCT и нашёл ещё один документ, где есть схема binDCT C7. Я поиграл с ней и подогнал выходные умножения, чтобы приблизить результаты к каноническому fastDCT (но я всё же буду использовать intDCT вместо binDCT):

void row_bdct_c7_scale(int dataptr[8]){
   int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,z1;
   tmp0 = dataptr[0] + dataptr[7];
   tmp7 = dataptr[0] - dataptr[7];
   tmp1 = dataptr[1] + dataptr[6];
   tmp6 = dataptr[1] - dataptr[6];
   tmp2 = dataptr[2] + dataptr[5];
   tmp5 = dataptr[2] - dataptr[5];
   tmp3 = dataptr[3] + dataptr[4];
   tmp4 = dataptr[3] - dataptr[4];
   tmp5 = tmp5 - tmp6/2;
   tmp6 = tmp5*3/4 + tmp6;
   tmp5 = tmp6/2   - tmp5;
   tmp0 = (z1=tmp0) + tmp3;
   tmp3 = z1-tmp3;
   tmp1 = (z1=tmp1) + tmp2;
   tmp2 = z1-tmp2;
   dataptr[0] = tmp0 = tmp0+tmp1;
   dataptr[4] = (tmp0/2 - tmp1)*2;
   dataptr[6] = tmp2 = tmp3/2-tmp2;
   dataptr[2] = (tmp3 - tmp2/2)*2;
   tmp4 = (z1=tmp4)+tmp5;
   tmp5 = z1-tmp5;
   tmp6 = tmp7 - (z1=tmp6);
   tmp7 = tmp7 + z1;
   dataptr[7] = tmp4 = (tmp7/4-tmp4)>>1;
   dataptr[1] = (tmp7 - tmp4/4)*2; //scale x2
   dataptr[5] = tmp5 = tmp6 + tmp5;
   dataptr[3] = (tmp6 - tmp5/2)*2; //scale x2
}

  186    28   -14   -10    -4     3     4     0
  -27   -66   -43    -9    13    -3     0    -3
   18    47    -4    22    10   -19    -2    -1
   -9   -15     9   -20     1     7     2     0
   23   -16   -24   -10   -11     6    -1     0
   -8    11     3    -3    13    -3    -3     1
   -8    10   -15    -2     9     0    -1    -1
   -5     2    -1     3     4    -2    -1     0

-----------
  186    28   -16    -8    -4     1     6    -1
  -27   -63   -41    -7    16    -5    -2    -4
   21    38     4    25     5   -19    -3    -1
   -7   -18     4   -16    -3     6     4     0
   22   -14   -23   -11   -11     6    -2     0
  -11    13     8    -6    17    -3    -5     1
  -11    15   -21    -1    15    -1    -2    -3
   -8     4    -1     3     5    -2    -1     0

  row_fdct row_bdct_c
---------- ----------
  72404388   62906263 (rows per second)

binDCT c7 8x8 version : 17m blocks/sec https://godbolt.org/z/xnvc4qjvK — Vladimir Poslavskiy, Aug 29 '22 at 10:18

binDCT algorithm for 8x8 matrix

2 Answers2