I originally have this function, and I am trying to optimize it further using loop unrolling - which I am having a trouble with - flipping the for loops increase the efficiency, as well as getting the calls outside the loops. However, when it comes to applying loop unrolling as I did, it misses up what the function should be doing.
double
applyFilter(class Filter *filter, cs1300bmp *input, cs1300bmp *output)
{
long long cycStart, cycStop;
cycStart = rdtscll();
output -> width = input -> width;
output -> height = input -> height;
for(int col = 1; col < (input -> width) - 1; col = col + 1) {
for(int row = 1; row < (input -> height) - 1 ; row = row + 1) {
for(int plane = 0; plane < 3; plane++) {
output -> color[plane][row][col] = 0;
for (int j = 0; j < filter -> getSize(); j++) {
for (int i = 0; i < filter -> getSize(); i++) {
output -> color[plane][row][col]
= output -> color[plane][row][col]
+ (input -> color[plane][row + i - 1][col + j - 1]
* filter -> get(i, j) );
}
}
output -> color[plane][row][col] =
output -> color[plane][row][col] / filter -> getDivisor();
if ( output -> color[plane][row][col] < 0 ) {
output -> color[plane][row][col] = 0;
}
if ( output -> color[plane][row][col] > 255 ) {
output -> color[plane][row][col] = 255;
}
}
}
}
cycStop = rdtscll();
double diff = cycStop - cycStart;
double diffPerPixel = diff / (output -> width * output -> height);
fprintf(stderr, "Took %f cycles to process, or %f cycles per pixel\n",
diff, diff / (output -> width * output -> height));
return diffPerPixel;
}
This is where I arrive at, but it doesn't seem to be working. I would appreciate an explanation of what I am doing wrong in the loop unrolling part.
double applyFilter(class Filter *filter, cs1300bmp *input, cs1300bmp *output){
long long cycStart, cycStop;
cycStart = rdtscll();
//start
output -> width = input -> width;
output -> height = input -> height;
//function calls outside loop.
int filterSize = filter -> getSize();
int divisor = filter -> getDivisor();
//intializaions
int inputHlen = input -> height - 1;
int inputWlen = input -> width - 1;
// loop unrolling row + k - 1 , col + k - 1
for(int plane = 0; plane < 3; plane++){
for(int row = 1; row + 3 < inputHlen; row += 4){
for(int col = 1; col +3 < inputWlen; col += 4){
output -> color[plane][row][col] = 0;
output -> color[plane][row + 1][col + 1] = 0;
output -> color[plane][row + 2][col + 2] = 0;
output -> color[plane][row + 3][col + 3] = 0;
int acc1 = output -> color[plane][row][col];
int acc2 = output -> color[plane][row + 1][col + 1];
int acc3 = output -> color[plane][row + 2][col + 2];
int acc4 = output -> color[plane][row + 3][col + 3];
for (int j = 0; j + 3 < filterSize; j += 4) {
for (int i = 0; i + 3 < filterSize; i += 4){
acc1 = acc1 + (input -> color[plane][row + i - 1][col + j - 1] * filter -> get(i, j) );
acc2 = acc2 + (input -> color[plane][row + i][col + j] * filter -> get(i + 1, j + 1) );
acc3 = acc3 + (input -> color[plane][row + i + 1][col + j + 1] * filter -> get(i +2, j + 2) );
acc4 = acc4 + (input -> color[plane][row + i + 2][col + j + 2] * filter -> get(i + 3, j + 3) );
}
}
acc1 = acc1 / divisor;
acc2 = acc2 / divisor;
acc3 = acc3 / divisor;
acc4 = acc4 / divisor;
acc1 = (acc1 < 0) ? 0 : acc1;
acc1 = (acc1 > 255) ? 255 : acc1;
acc2 = (acc1 < 0) ? 0 : acc1;
acc2 = (acc1 > 255) ? 255 : acc1;
acc3 = (acc1 < 0) ? 0 : acc1;
acc3 = (acc1 > 255) ? 255 : acc1;
acc4 = (acc1 < 0) ? 0 : acc1;
acc4 = (acc1 > 255) ? 255 : acc1;
output -> color[plane][row][col] = acc1;
output -> color[plane][row + 1][col + 1] = acc2;
output -> color[plane][row + 2][col + 2] = acc3;
output -> color[plane][row + 3][col + 3] = acc4;
}
}
}
//end
cycStop = rdtscll();
double diff = cycStop - cycStart;
double diffPerPixel = diff / (output -> width * output -> height);
fprintf(stderr, "Took %f cycles to process, or %f cycles per pixel\n",
diff, diff / (output -> width * output -> height));
return diffPerPixel;
}