I've googled about the implementation of a fast DCT. I've found the Loeffler algorithm and I have implemented in C++ and in ARM assembly with NEON. Moving ahead, I've found the binDCT that avoid floating calculation. My reference paper/schema is this one:
That said, I've tried to implement in C++ with the following code, just to test:
void my_binDCT(int in[8][8], int data[8][8],const int xpos, const int ypos)
{
int i;
int row[8][8];
int x0, x1, x2, x3, x4, x5, x6, x7;
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
// transform rows
for (i = 0; i < 8; i++) {
x0 = in[xpos + 0][ypos + i];
x1 = in[xpos + 1][ypos + i];
x2 = in[xpos + 2][ypos + i];
x3 = in[xpos + 3][ypos + i];
x4 = in[xpos + 4][ypos + i];
x5 = in[xpos + 5][ypos + i];
x6 = in[xpos + 6][ypos + i];
x7 = in[xpos + 7][ypos + i];
//stage 1
tmp0 = x0 + x7;
tmp7 = x0 - x7;
tmp1 = x1 + x6;
tmp6 = x1 - x6;
tmp2 = x2 + x5;
tmp5 = x2 - x5;
tmp3 = x3 + x4;
tmp4 = x3 - x4;
//stage 2
tmp16 = ((tmp5*3)>>3) + tmp6;
tmp15 = ((tmp16*5)>>3) - tmp5;
//stage 3
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
tmp14 = tmp4 + tmp15;
tmp15 = tmp4 - tmp15;
auto z = tmp16;
tmp16 = tmp7 - tmp16;
tmp17 = z + tmp7;
//stage 4
tmp14 = (tmp17 >> 3) - tmp14;
tmp10 = tmp10 + tmp11;
tmp11 = (tmp10 >> 1) - tmp11;
tmp12 = ((tmp13*3)>>3) - tmp12;
tmp13 = ((tmp12*3)>>3) + tmp13;
tmp15 = ((tmp16*7)>>3) + tmp15;
tmp16 = (tmp15>>1) - tmp16;
//stage 5
row[i][0] = tmp10;
row[i][4] = tmp11;
row[i][6] = tmp12;
row[i][2] = tmp13;
row[i][7] = tmp14;
row[i][5] = tmp15;
row[i][3] = tmp16;
row[i][1] = tmp17;
}
//rotate columns
/* transform columns */
for (i = 0; i < 8; i++) {
x0 = row[0][i];
x1 = row[1][i];
x2 = row[2][i];
x3 = row[3][i];
x4 = row[4][i];
x5 = row[5][i];
x6 = row[6][i];
x7 = row[7][i];
//stage 1
tmp0 = x0 + x7;
tmp7 = x0 - x7;
tmp1 = x1 + x6;
tmp6 = x1 - x6;
tmp2 = x2 + x5;
tmp5 = x2 - x5;
tmp3 = x3 + x4;
tmp4 = x3 - x4;
//stage 2
tmp16 = ((tmp5*3)>>3) + tmp6;
tmp15 = ((tmp16*5)>>3) - tmp5;
//stage 3
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
tmp14 = tmp4 + tmp15;
tmp15 = tmp4 - tmp15;
auto z = tmp16;
tmp16 = tmp7 - tmp16;
tmp17 = z + tmp7;
//stage 4
tmp14 = (tmp17 >> 3) - tmp14;
tmp10 = tmp10 + tmp11;
tmp11 = (tmp10 >> 1) - tmp11;
tmp12 = ((tmp13*3)>>3) - tmp12;
tmp13 = ((tmp12*3)>>3) + tmp13;
tmp15 = ((tmp16*7)>>3) + tmp15;
tmp16 = (tmp15>>1) - tmp16;
//stage 5
data[0][i] = tmp10 >> 3;
data[4][i] = tmp11 >> 3;
data[6][i] = tmp12 >> 3;
data[2][i] = tmp13 >> 3;
data[7][i] = tmp14 >> 3;
data[5][i] = tmp15 >> 3;
data[3][i] = tmp16 >> 3;
data[1][i] = tmp17 >> 3;
}
}
I've coded the first DCT by rows and the second one by columns and I've supposed to normalize the results dividing by 8 (as per DCT formula with N=8).
I've tested on a 8x8 matrix:
int matrix_a[8][8] = {
12, 16, 19, 12, 12, 27, 51, 47,
16, 24, 12, 19, 12, 20, 39, 51,
24, 27, 8, 39, 35, 34, 24, 44,
40, 17, 28, 32, 24, 27, 8, 32,
34, 20, 28, 20, 12, 8, 19, 34,
19, 39, 12, 27, 27, 12, 8, 34,
8, 28, -5, 39, 34, 16, 12, 19,
20, 27, 8, 27, 24, 19, 19, 8,
};
And I got this outcome:
MYBINDCT-2:
186 13 -3 4 -2 4 6 0
-13 -20 -10 1 2 -2 1 -4
1 19 -10 -3 7 -12 -2 -4
5 2 -4 -3 -1 -4 -2 -1
11 -5 -7 1 -3 4 -1 0
-13 8 -3 0 10 -4 -6 3
-11 6 -11 1 6 0 -1 -4
-13 4 -1 -3 5 -5 -1 0
that is quite far from the (rounded) real dct:
186 20 -11 -9 -4 3 8 -1
-18 -35 -24 -5 9 -3 0 -8
14 26 -2 14 7 -19 -3 -3
-9 -10 5 -15 1 8 3 1
23 -11 -19 -9 -11 8 -2 1
-10 10 3 -3 17 -4 -8 4
-14 13 -21 -4 18 0 -1 -7
-19 7 -1 8 15 -7 -3 0
I've applied the algorithm, done a lot of tests, but I still don't understand where I made mistakes.
Does anybody with much better experience than me can explain me the mistakes I've done? The strange thing is that I've implemented Loeffler,as I wrote, and it works very well. And the procedure, apart for the coefficients and the floating numbers, is quite similar (butterfly schema, floating scaled factors, normalization). I'm stuck with it. Thanks to everyone can suggest me the answer.
EDIT: A brief call is:
int main(int argc, char **argv)
{
int MYBINDCT[8][8];
my_binDCT(matrix_a, MYBINDCT, 0, 0);
cout << "\nMYBINDCT: \n";
for (int i = 0; i < 8; i++)
{
cout << '\n;
for (int j = 0; j < 8; j++)
{
cout << MYBINDCT[i][j] << " ";
}
}
return 0;
}