First I want to provide you with some context.
I have two kinds of images that I need to merge. The first image is the background image with the format 8BppGrey and a resolution of 320x240. The second image is the foreground image with the format 32BppRGBA and a resolution of 64x48.
Update: The GitHub repo with an MVP is at the bottom of the question.
To do this, I resize the second image with bilinear interpolation to the same size as the first one and then use blending to merge both into one image. Blending only happens when the alpha value of the second image is greater than 0.
I need to do it as fast as possible so my idea was to combine the resize and merge / blend process.
To achieve this I used the resize function from the writeablebitmapex repository and added merging / blending.
Everything works as expected but I want to decrease the execution time.
These are the current debug timings:
// CPU: Intel(R) Core(TM) i7-4810MQ CPU @ 2.80GHz
MediaServer: Execution time in c++ 5 ms
MediaServer: Resizing took 4 ms.
MediaServer: Execution time in c++ 5 ms
MediaServer: Resizing took 5 ms.
MediaServer: Execution time in c++ 4 ms
MediaServer: Resizing took 4 ms.
MediaServer: Execution time in c++ 3 ms
MediaServer: Resizing took 3 ms.
MediaServer: Execution time in c++ 4 ms
MediaServer: Resizing took 4 ms.
MediaServer: Execution time in c++ 5 ms
MediaServer: Resizing took 4 ms.
MediaServer: Execution time in c++ 6 ms
MediaServer: Resizing took 6 ms.
MediaServer: Execution time in c++ 3 ms
MediaServer: Resizing took 3 ms.
Do I have any chance to increase the performance and lower the execution time of the resize / merge / blend process?
Are there some parts I maybe can parallelize?
Do I maybe have a chance to use some processor features?
A huge performance hit is the nested loop but I have no idea how I could write it better.
I would like to reach 1 or 2 ms for the whole process. Is this even possible?
Here's the modified visual c++ function I use.
- pd is the back buffer of the WriteableBitmap I use to display the result in WPF. The format I use is the default 32BppRGBA.
- pixels is the int[] array of the 64x48 32BppRGBA image
- widthSource and heightSource are the dimensions of the pixels image
- width and height are the target dimensions of the output image
- baseImage is the int[] array of the 320x240 8BppGray image
VC++ code:
/*
 * Bilinearly interpolates a single 8-bit channel from its four neighbouring
 * source samples: c1/c2 are the left/right samples on the upper row and c3/c4
 * the left/right samples on the lower row. fracx/fracy are the fractional
 * offsets inside the source cell and ifracx/ifracy their complements.
 */
static float BilinearChannel(float ifracx, float fracx, float ifracy, float fracy,
                             unsigned int c1, unsigned int c2,
                             unsigned int c3, unsigned int c4)
{
    float l0 = ifracx * (float)c1 + fracx * (float)c2;
    float l1 = ifracx * (float)c3 + fracx * (float)c4;
    return ifracy * l0 + fracy * l1;
}

/*
 * Scales the overlay image `pixels` (widthSource x heightSource, one int per
 * pixel packed as 0xAARRGGBB) to width x height with bilinear interpolation
 * and alpha-blends it over the 8-bit gray image `baseImage` (width x height,
 * one byte per pixel). The result is written to `pd` (width x height, one int
 * per pixel packed as 0xAARRGGBB with alpha forced to 255).
 *
 * Where the interpolated overlay alpha is 0 the gray value is copied to all
 * three colour channels unchanged.
 *
 * Nothing is written if any buffer is null or any dimension is not positive.
 *
 * Returns the elapsed time measured with clock(), in clock ticks
 * (divide by CLOCKS_PER_SEC to obtain seconds).
 */
unsigned int Resize(int* pd, int* pixels, int widthSource, int heightSource, int width, int height, byte* baseImage)
{
    unsigned int start = (unsigned int)clock();

    /* Reject degenerate input instead of reading outside the buffers. */
    if (!pd || !pixels || !baseImage ||
        widthSource <= 0 || heightSource <= 0 || width <= 0 || height <= 0)
    {
        return (unsigned int)clock() - start;
    }

    float xs = (float)widthSource / width;
    float ys = (float)heightSource / height;

    int dstIdx = 0;
    for (int y = 0; y < height; y++)
    {
        /* Everything that depends only on the row is computed once per row. */
        float sy = y * ys;
        int y0 = (int)sy;
        float fracy = sy - y0;
        float ifracy = 1.0f - fracy;
        int y1 = y0 + 1;
        if (y1 >= heightSource)
        {
            y1 = y0; /* clamp at the bottom edge */
        }
        const int *row0 = pixels + y0 * widthSource;
        const int *row1 = pixels + y1 * widthSource;

        for (int x = 0; x < width; x++)
        {
            float sx = x * xs;
            int x0 = (int)sx;
            float fracx = sx - x0;
            float ifracx = 1.0f - fracx;
            int x1 = x0 + 1;
            if (x1 >= widthSource)
            {
                x1 = x0; /* clamp at the right edge */
            }

            /* Work on unsigned copies so the shifts below are well defined
             * even when the alpha byte sets the sign bit of the int. */
            unsigned int p1 = (unsigned int)row0[x0];
            unsigned int p2 = (unsigned int)row0[x1];
            unsigned int p3 = (unsigned int)row1[x0];
            unsigned int p4 = (unsigned int)row1[x1];

            /* Read the background sample exactly once, before the index is
             * advanced; reading it in the same expression that increments
             * the index is unsequenced and may fetch the wrong pixel. */
            unsigned int gray = baseImage[dstIdx];
            unsigned int r = gray;
            unsigned int g = gray;
            unsigned int b = gray;

            byte a = (byte)BilinearChannel(ifracx, fracx, ifracy, fracy,
                                           p1 >> 24, p2 >> 24, p3 >> 24, p4 >> 24);
            if (a > 0)
            {
                float rf = BilinearChannel(ifracx, fracx, ifracy, fracy,
                                           (p1 >> 16) & 0xFFu, (p2 >> 16) & 0xFFu,
                                           (p3 >> 16) & 0xFFu, (p4 >> 16) & 0xFFu);
                float gf = BilinearChannel(ifracx, fracx, ifracy, fracy,
                                           (p1 >> 8) & 0xFFu, (p2 >> 8) & 0xFFu,
                                           (p3 >> 8) & 0xFFu, (p4 >> 8) & 0xFFu);
                float bf = BilinearChannel(ifracx, fracx, ifracy, fracy,
                                           p1 & 0xFFu, p2 & 0xFFu,
                                           p3 & 0xFFu, p4 & 0xFFu);

                /* Blend the overlay colour over the gray background. */
                float alpha = a / 255.0f;
                float background = (float)gray * (1.0f - alpha);
                r = (byte)((rf * alpha) + background);
                g = (byte)((gf * alpha) + background);
                b = (byte)((bf * alpha) + background);
            }

            /* Opaque alpha, then red, green, blue; assembled as unsigned to
             * avoid shifting into the sign bit of a signed int. */
            pd[dstIdx++] = (int)(0xFF000000u | (r << 16) | (g << 8) | b);
        }
    }

    return (unsigned int)clock() - start;
}