I am trying to perform fast binning of a 2D image (stored in row-major order in 1D array). The image is 12 bit, Thus, I have used the uint16_t data type.
What binning means is if done 2x2, then 2 pixels in each direction (x and y) becomes one pixel, or essentially the one super pixel contains the mean of those 4 pixels.
I have written the following code to calculate the mean of pixels and store them in superpixel while avoiding integer overflow. However, The divide by "DIVISIONFACTOR" to avoid the overflow slows the function down as the code has to perform more divisions (which are expensive compared to other arithmetic operations). The code runs significantly faster if I add values of all pixels (thus potential overflow can happen if the sum of pixels exceeds 65535) and then divide the sum for a superpixel by "DIVISIONFACTOR" only once.
To be more precise, The code runs at 0.75 ms with the overflow-avoiding code and at 0.13 ms with code that has the potential for overflow (owing to fewer division operations).
Please suggest ways to optimize further. I am using intel C++ compiler, please don't shy away from suggesting any Intel specific technique.
Thank you in advance.
Regards,
Harsh
Image Definitions: WIDTH and HEIGHT are actual image dimensions received from Camera and NX and NY are final dimensions.
#define NX 256
#define NY 256
#define WIDTH 768
#define HEIGHT 768
#define BINNINGFACTORWIDTH (WIDTH / NX)
#define BINNINGFACTORHEIGHT (HEIGHT / NY)
#define DIVISIONFACTOR (BINNINGFACTORWIDTH * BINNINGFACTORHEIGHT)
Code without overflow:
int binning(uint16_t* image, uint16_t* binnedImage){
int i, j, k, l;
for (j=0; j< NY; j++) {
for (i=0;i < NX; i++) {
binnedImage[j * NX + i] = 0;
for (k=i * BINNINGFACTORWIDTH; k<i * BINNINGFACTORWIDTH + BINNINGFACTORWIDTH; k++) {
for (l=j * BINNINGFACTORHEIGHT; l<j * BINNINGFACTORHEIGHT + BINNINGFACTORHEIGHT; l++) {
binnedImage[j * NX + i] += (image[l * HEIGHT + k] / DIVISIONFACTOR);
}
}
}
}
return 0;
}
Code with potential of overflow:
int binning(uint16_t* image, uint16_t* binnedImage){
int i, j, k, l;
for (j=0; j< NY; j++) {
for (i=0;i < NX; i++) {
binnedImage[j * NX + i] = 0;
for (k=i * BINNINGFACTORWIDTH; k<i * BINNINGFACTORWIDTH + BINNINGFACTORWIDTH; k++) {
for (l=j * BINNINGFACTORHEIGHT; l<j * BINNINGFACTORHEIGHT + BINNINGFACTORHEIGHT; l++) {
binnedImage[j * NX + i] += image[l * HEIGHT + k];
}
}
binnedImage[j * NX + i] /= DIVISIONFACTOR;
}
}
return 0;
}
Full executable code:
#include <iostream>
#include <chrono>
#define NX 256
#define NY 256
#define WIDTH 768
#define HEIGHT 768
#define BINNINGFACTORWIDTH (WIDTH / NX)
#define BINNINGFACTORHEIGHT (HEIGHT / NY)
#define DIVISIONFACTOR (BINNINGFACTORWIDTH * BINNINGFACTORHEIGHT)
using namespace std;
int binning(uint16_t* image, uint16_t* binnedImage);
int main() {
chrono::high_resolution_clock::time_point t0, t1;
chrono::duration<double> dt;
double totalTime = 0;
uint64_t count = 0;
uint32_t i;
uint16_t *image = (uint16_t*) malloc(sizeof(uint16_t) * WIDTH * HEIGHT);
for (i=0; i < WIDTH * HEIGHT; i++) {
image[i] = 4095;
}
uint16_t *binnedIimage = (uint16_t*) calloc(sizeof(uint16_t), NX * NY);
while(1) {
t0 = chrono::high_resolution_clock::now();
binning(image, binnedIimage);
t1 = chrono::high_resolution_clock::now();
dt = chrono::duration_cast<chrono::duration<double>>(t1 - t0);
totalTime += dt.count();
count += 1;
if (count % 3000 == 0) {
cout<<"Time (ms): "<< totalTime * 1000 / count<<"\r";
cout.flush();
}
}
return 0;
}
int binning(uint16_t* image, uint16_t* binnedImage){
int i, j, k, l;
for (j=0; j< NY; j++) {
for (i=0;i < NX; i++) {
binnedImage[j * NX + i] = 0;
for (k=i * BINNINGFACTORWIDTH; k<i * BINNINGFACTORWIDTH + BINNINGFACTORWIDTH; k++) {
for (l=j * BINNINGFACTORHEIGHT; l<j * BINNINGFACTORHEIGHT + BINNINGFACTORHEIGHT; l++) {
binnedImage[j * NX + i] += (image[l * HEIGHT + k]/ DIVISIONFACTOR);
}
}
}
}
return 0;
}