23

I am using C++ and I want to do alpha blending using the following code.

/*
 * Clamp an int colour value into [0, 255] and store it back branch-free-ish.
 * Trick: if any bit outside the low 8 is set, the value is out of range.
 *   - negative value: -(color) is positive, >> 31 yields 0        -> clamp to 0
 *   - value > 255:    -(color) is negative, arithmetic >> 31 is -1 -> (BYTE)-1 == 255
 * NOTE: relies on 32-bit int and an arithmetic (sign-propagating) right
 * shift of a negative value, which is implementation-defined in C/C++ —
 * true on MSVC/GCC/Clang for x86, but worth confirming elsewhere.
 * Wrapped in do { } while (0) so the macro is a single statement and is
 * safe inside unbraced if/else (the original bare if/else form was not).
 */
#define CLAMPTOBYTE(color) \
    do { \
        if ((color) & (~255)) { \
            (color) = (BYTE)((-(color)) >> 31); \
        } else { \
            (color) = (BYTE)(color); \
        } \
    } while (0)
/*
 * Address of the pixel at (x, y) in a bitmap starting at accessPixel,
 * with `scanline` bytes per row and `bpp` bytes per pixel.
 */
#define GET_BYTE(accessPixel, x, y, scanline, bpp) \
    ((BYTE*)((accessPixel) + (y) * (scanline) + (x) * (bpp)))

    // Scalar per-pixel alpha blend: composites srcBitsTop over srcBits using
    // the 8-bit coverage mask maskSrc, writing the result into resultBits.
    // Per channel: out = (top * alpha + src * (255 - alpha)) / 255.
    // Assumes all three colour buffers share the same stride and bytes/pixel;
    // R, G, B are presumably channel-offset constants — confirm against the
    // surrounding file.
    for (int y = top ; y < bottom; ++y)
    {
        // Start-of-row pointers for this scanline.
        BYTE* resultByte = GET_BYTE(resultBits, left, y, stride, bytepp);
        BYTE* srcByte = GET_BYTE(srcBits, left, y, stride, bytepp);
        BYTE* srcByteTop = GET_BYTE(srcBitsTop, left, y, stride, bytepp);
        BYTE* maskCurrent = GET_GREY(maskSrc, left, y, width);
        int alpha = 0;
        int red = 0;
        int green = 0;
        int blue = 0;
        for (int x = left; x < right; ++x)
        {
            // One mask byte per pixel drives all three channels.
            alpha = *maskCurrent;
            // Exact /255 division — correct but costly; this divide and the
            // three multiplies per channel dominate the runtime.
            red = (srcByteTop[R] * alpha + srcByte[R] * (255 - alpha)) / 255;
            green = (srcByteTop[G] * alpha + srcByte[G] * (255 - alpha)) / 255;
            blue = (srcByteTop[B] * alpha + srcByte[B] * (255 - alpha)) / 255;
            // NOTE(review): with both inputs in [0,255] and alpha in [0,255]
            // the weighted average cannot leave [0,255], so these clamps
            // look redundant — confirm before removing.
            CLAMPTOBYTE(red);
            CLAMPTOBYTE(green);
            CLAMPTOBYTE(blue);
            resultByte[R] = red;
            resultByte[G] = green;
            resultByte[B] = blue;
            // Advance all pixel pointers by one pixel, mask by one byte.
            srcByte += bytepp;
            srcByteTop += bytepp;
            resultByte += bytepp;
            ++maskCurrent;
        }
    }

However, I find it is still slow: it takes about 40–60 ms to composite two 600 × 600 images. Is there any method to improve the speed to less than 16 ms?

Can anybody help me speed up this code? Many thanks!

ideasman42
  • 42,413
  • 44
  • 197
  • 320
user25749
  • 4,825
  • 14
  • 61
  • 83
  • What compiler are you using? What platform are you developing this software for? Are you willing to use off the shelf tools? – Tom Leys Jul 09 '09 at 23:36
  • I am using VS2005, the software is designed for windows platform. I am willing to use any method to accelerate this code. I think maybe it can be accelerated alot – user25749 Jul 10 '09 at 02:13
  • Let me know if you have trouble coding up the rest of the SIMD instructions in my solution – Tom Leys Jul 11 '09 at 21:40

17 Answers17

28

Use SSE - start around page 131.

The basic workflow

  1. Load 4 pixels from src (16 1 byte numbers) RGBA RGBA RGBA RGBA (streaming load)

  2. Load 4 more which you want to blend with srcbytetop RGBx RGBx RGBx RGBx

  3. Do some swizzling so that the A term in 1 fills every slot I.e

    xxxA xxxB xxxC xxxD -> AAAA BBBB CCCC DDDD

    In my solution below I opted instead to re-use your existing "maskCurrent" array, but having alpha integrated into the "A" field of 1 will require fewer loads from memory and thus be faster. Swizzling in this case would probably be: AND with a mask to select A, B, C, D; shift right 8; OR with the original; shift right 16; OR again.

  4. Add the above to a vector that is all -255 in every slot

  5. Multiply 1 * 4 (source with 255-alpha) and 2 * 3 (result with alpha).

    You should be able to use the "multiply and discard bottom 8 bits" SSE2 instruction for this.

  6. add those two (4 and 5) together

  7. Store those somewhere else (if possible) or on top of your destination (if you must)

Here is a starting point for you:

    //Define your image with __declspec(align(16)) i.e char __declspec(align(16)) image[640*480]
    // so the first byte is aligned correctly for SIMD.
    // Stride must be a multiple of 16.
    //
    // Fixed from the original sketch: the SSE type is __m128i (not __mm128i),
    // statements need terminating semicolons, _mm_set_epi8 takes exactly 16
    // arguments (no trailing comma), and there is no _mm_set1_epu8 --
    // _mm_set1_epi8 is the byte-splat intrinsic.  The intrinsics used here
    // are SSE2 and live in <emmintrin.h>.

    for (int y = top ; y < bottom; ++y)
    {
        BYTE* resultByte = GET_BYTE(resultBits, left, y, stride, bytepp);
        BYTE* srcByte = GET_BYTE(srcBits, left, y, stride, bytepp);
        BYTE* srcByteTop = GET_BYTE(srcBitsTop, left, y, stride, bytepp);
        BYTE* maskCurrent = GET_GREY(maskSrc, left, y, width);
        for (int x = left; x < right; x += 4)
        {
            //If you can't align, use _mm_loadu_si128()
            // Step 1 - load 4 background pixels (16 bytes).
            __m128i src = _mm_load_si128(reinterpret_cast<__m128i*>(srcByte));
            // Step 2 - load the 4 foreground pixels to blend on top.
            __m128i srcTop = _mm_load_si128(reinterpret_cast<__m128i*>(srcByteTop));

            // Step 3
            // Broadcast each pixel's mask byte across its 4 channel slots.
            // Could do better with shifts and so on, but this is clear.
            __m128i mask = _mm_set_epi8(maskCurrent[0], maskCurrent[0], maskCurrent[0], maskCurrent[0],
                                        maskCurrent[1], maskCurrent[1], maskCurrent[1], maskCurrent[1],
                                        maskCurrent[2], maskCurrent[2], maskCurrent[2], maskCurrent[2],
                                        maskCurrent[3], maskCurrent[3], maskCurrent[3], maskCurrent[3]);

            // Step 4 - per-channel (255 - alpha), with unsigned saturation.
            __m128i maskInv = _mm_subs_epu8(_mm_set1_epi8((char)255), mask);

            //Todo : Multiply, with saturate - find correct instructions for 4..6
            //note you can use Multiply and add _mm_madd_epi16
            // (widen bytes to 16-bit lanes with _mm_unpacklo/hi_epi8 first,
            //  then multiply, shift right 8, and repack with _mm_packus_epi16.)

            // --- scalar reference code for what steps 5..6 must compute ---
            alpha = *maskCurrent;
            red = (srcByteTop[R] * alpha + srcByte[R] * (255 - alpha)) / 255;
            green = (srcByteTop[G] * alpha + srcByte[G] * (255 - alpha)) / 255;
            blue = (srcByteTop[B] * alpha + srcByte[B] * (255 - alpha)) / 255;
            CLAMPTOBYTE(red);
            CLAMPTOBYTE(green);
            CLAMPTOBYTE(blue);
            resultByte[R] = red;
            resultByte[G] = green;
            resultByte[B] = blue;
            //----

            // Step 7 - store result.
            //Store aligned if output is aligned on 16 byte boundary
            _mm_store_si128(reinterpret_cast<__m128i*>(resultByte), result);
            //Slow version if you can't guarantee alignment
            //_mm_storeu_si128(reinterpret_cast<__m128i*>(resultByte), result);

            //Move pointers forward 4 places
            srcByte += bytepp * 4;
            srcByteTop += bytepp * 4;
            resultByte += bytepp * 4;
            maskCurrent += 4;
        }
    }

To find out which AMD processors will run this code (currently it is using SSE2 instructions) see Wikipedia's List of AMD Turion microprocessors. You could also look at other lists of processors on Wikipedia but my research shows that AMD cpus from around 4 years ago all support at least SSE2.

You should expect a good SSE2 implementation to run around 8-16 times faster than your current code. That is because we eliminate branches in the loop, process 4 pixels (or 12 channels) at once and improve cache performance by using streaming instructions. As an alternative to SSE, you could probably make your existing code run much faster by eliminating the if checks you are using for saturation. Beyond that I would need to run a profiler on your workload.

Of course, the best solution is to use hardware support (i.e code your problem up in DirectX) and have it done on the video card.

Tom Leys
  • 18,473
  • 7
  • 40
  • 62
  • See edits to my origional post to address your question. Short answer - yes if not an ancient CPU. – Tom Leys Jul 09 '09 at 23:23
  • Will it work only on Windows or on other platforms as well ? (If I will define BYTe ofc) Main question is: are SIMD instructions crossplatform ? – Coldsteel48 Aug 18 '17 at 01:28
  • Yes, SIMD instructions require the CPU support them but they don't care about the OS (Windows, etc). The compiler also needs to translate the intrinsics (such as `_mm_set_epi8`) but I believe that GCC can do this. – Tom Leys Aug 21 '17 at 08:03
24

You can always calculate the alpha of red and blue at the same time. You can also use this trick with the SIMD implementation mentioned before.

// Blend colorb over an already-premultiplied colora (0xRRGGBB layout).
// Red and blue ride together in one word so each pair of channels costs a
// single multiply; green is handled on its own.  alpha is in [0, 0x100],
// and ">> 8" approximates division by 255.
unsigned int blendPreMulAlpha(unsigned int colora, unsigned int colorb, unsigned int alpha)
{
    unsigned int mixedRB = (colora & 0xFF00FF) + ((alpha * (colorb & 0xFF00FF)) >> 8);
    unsigned int mixedG  = (colora & 0x00FF00) + ((alpha * (colorb & 0x00FF00)) >> 8);
    // Masking here discards carry bits that leaked between the packed lanes.
    return (mixedRB & 0xFF00FF) + (mixedG & 0x00FF00);
}


// Classic (non-premultiplied) blend of colorb over colora, 0xRRGGBB layout.
// Each input is weighted separately: colora by (0x100 - alpha), colorb by
// alpha, with red+blue multiplexed into one multiply per source.
// alpha is in [0, 0x100]; ">> 8" approximates division by 255.
unsigned int blendAlpha(unsigned int colora, unsigned int colorb, unsigned int alpha)
{
    unsigned int inv = 0x100 - alpha;
    unsigned int rbA = (inv   * (colora & 0xFF00FF)) >> 8;
    unsigned int rbB = (alpha * (colorb & 0xFF00FF)) >> 8;
    unsigned int gA  = (inv   * (colora & 0x00FF00)) >> 8;
    unsigned int gB  = (alpha * (colorb & 0x00FF00)) >> 8;
    // OR combines the two weighted halves; the masks strip inter-lane spill.
    return ((rbA | rbB) & 0xFF00FF) + ((gA | gB) & 0x00FF00);
}

0 <= alpha <= 0x100

ebk
  • 586
  • 5
  • 11
Jasper Bekkers
  • 6,711
  • 32
  • 46
  • Nice trick. You should add handling of saturation in there too (right now it overflows) – Tom Leys Jul 09 '09 at 23:29
  • The overflow is intentional, it's handled in the return statement. – Jasper Bekkers Jul 10 '09 at 00:41
  • It's got a rather rude handling of overflow: wraparound instead of saturation. – MSalters Jul 10 '09 at 08:55
  • @MSalters, could be because of the hangover, but I don't see the overflow; or well, I see an intentional overflow in rb and g, but they're masked out in the return statement. (As long as int is 32 bits). – Jasper Bekkers Jul 10 '09 at 11:57
  • @JasperBekkers, did you actually try your example? With alpha=0xff (opaque), the result is 0xff80 (the red completely disappears, other colours wrong as well). – Omri Barel Aug 17 '12 at 20:10
  • The idea of multiplexing is great but the formula less so: 1) multiply by alpha is missing (as if you had premultiplied) and 2) a division by 255 or approximate >>8 is missing too. – aka.nice Dec 23 '13 at 17:24
  • @aka.nice fixed the code (made separate alpha & premul functions), also added missing shifts. – Jasper Bekkers Jan 08 '14 at 21:20
  • Wow, I've never seen that before. That's really cool. Is it much faster? Also, does it still work on both little/big endian? – Jarno Aug 02 '20 at 23:54
18

For people who want to divide by 255, I found a perfect formula:

pt->r = (r+1 + (r >> 8)) >> 8; // fast way to divide by 255
  • 3
    This can be extended to two 16bits words: ((r+0x10001+((r>>8)&0xFF00FF))>>8) & 0xFF00FF and this allow multiplexing xRxB and AxGx ops in ARGB, similar in RGBA and other variants – aka.nice Dec 23 '13 at 17:14
  • 1
    `(x+1+((x+1)>>8))>>8 // integer div 255 for [0..65790)` -- slightly better – Brent Bradburn Apr 01 '15 at 16:13
  • 3
    `((x+1)*257)>>16 // integer div 255 for [0..65790)` -- alternative formulation which might be faster on some platforms -- interesting notes: [Division via Multiplication](http://research.swtch.com/divmult) – Brent Bradburn Apr 01 '15 at 16:42
  • 2
    @nobar: The standard compiler trick of doing division with a multiplicative inverse is also worth considering: [`n/255` compiles to = asm that does `(n*0x8081) >> 23`](https://godbolt.org/g/GUaezV). That also works for all 16-bit `n`. (I just noticed your upper-bound was higher than 65536). With x86 SSE2, that's one `_mm_mulhi_epu16` and one `_mm_srli_epu16(mul, 23-16)`. `x+1 * 257` is one paddw and one pmulhuw, so that's actually better (since mul and shift may compete for the same port). – Peter Cordes Apr 27 '17 at 04:38
7

Here's some pointers.

Consider using pre-multiplied foreground images as described by Porter and Duff. As well as potentially being faster, you avoid a lot of potential colour-fringing effects.

The compositing equation changes from

r =  kA + (1-k)B

... to ...

r =  A + (1-k)B

Alternatively, you can rework the standard equation to remove one multiply.

r =  kA + (1-k)B
==  kA + B - kB
== k(A-B) + B

I may be wrong, but I think you shouldn't need the clamping either...

Roddy
  • 66,617
  • 42
  • 165
  • 277
7

I can't comment because I don't have enough reputation, but I want to say that Jasper's version will not overflow for valid input. Masking the multiplication result is necessary because otherwise the red+blue multiplication would leave bits in the green channel (this would also be true if you multiplied red and blue separately, you'd still need to mask out bits in the blue channel) and the green multiplication would leave bits in the blue channel. These are bits that are lost to right shift if you separate the components out, as is often the case with alpha blending. So they're not overflow, or underflow. They're just useless bits that need to be masked out to achieve expected results.

That said, Jasper's version is incorrect. It should be 0xFF-alpha (255-alpha), not 0x100-alpha (256-alpha). This would probably not produce a visible error.

I've found an adaptation of Jasper's code to be be faster than my old alpha blending code, which was already decent, and am currently using it in my software renderer project. I work with 32-bit ARGB pixels:

// Composite 32-bit ARGB pixel p2 over p1 using p2's alpha channel.
// Red+blue and alpha+green are multiplexed into paired lanes so each pair
// of channels shares a single multiply.  Uses (255 - a) as the inverse
// weight and ">> 8" as an approximate divide-by-255; ONEALPHA biases the
// output alpha toward opaque.
Pixel AlphaBlendPixels(Pixel p1, Pixel p2)
{
    static const int AMASK = 0xFF000000;
    static const int RBMASK = 0x00FF00FF;
    static const int GMASK = 0x0000FF00;
    static const int AGMASK = AMASK | GMASK;
    static const int ONEALPHA = 0x01000000;
    unsigned int coverage = (p2 & AMASK) >> 24;     // source alpha
    unsigned int leftover = 255 - coverage;         // background weight
    // Red and blue blended in parallel lanes.
    unsigned int rb = ((leftover * (p1 & RBMASK)) + (coverage * (p2 & RBMASK))) >> 8;
    // Alpha and green blended in parallel lanes (pre-shifted down 8).
    unsigned int ag = (leftover * ((p1 & AGMASK) >> 8)) + (coverage * (ONEALPHA | ((p2 & GMASK) >> 8)));
    return ((rb & RBMASK) | (ag & AGMASK));
}
nfries88
  • 374
  • 6
  • 7
  • This is exactly what I was looking for. Are you certain about the precision with `na = 255 - a` rather than 256 or is it something that can't be helped in this case? – Nolo Dec 10 '16 at 11:56
  • Sorry for late response, haven't had anything to contribute on SO in years until tonight and have been going through old notifications. There's inaccuracies either way due to c/256 not always equaling c/255 but 256 - a is more inaccurate than 255-a. Rounding up by adding to the highest of the bits you lose could reduce the inaccuracy but also isn't perfectly accurate. I'm pretty sure the only way to get perfect accuracy is to divide each color channel individually by 255 which is costly. Jasper's code saturates quickly, while mine tends towards black. – nfries88 Dec 03 '21 at 08:42
4

Not exactly answering the question, but...

One thing is to do it fast; the other is to do it right. Alpha compositing is a dangerous beast: it looks straightforward and intuitive, but common errors have been widespread for decades with (almost) nobody noticing!

The most famous and common mistake is about NOT using premultiplied alpha. I highly recommend this: Alpha Blending for Leaves

  • 2
    It's not necessary to use premultiplied alpha, only to make sure the background color is removed from partially transparent pixels. Removing the background color may be part of the process of converting to premultiplied alpha, but it can be done independently as well. – Mark Ransom Nov 09 '11 at 19:29
3

You can use 4 bytes per pixel in both images (for memory alignment), and then use SSE instructions to process all channels together. Search "visual studio sse intrinsics".

Eric Bainville
  • 9,738
  • 1
  • 25
  • 27
3

First of all lets use the proper formula for each color component

You start with this:

  v = ( 1-t ) * v0 + t * v1

where t=interpolation parameter [0..1] v0=source color value v1=transfer color value v=output value

Reshuffling the terms, we can reduce the number of operations:

  v = v0 + t * (v1 - v0)

You would need to perform this calculation once per color channel (3 times for RGB).

For 8-bit unsigned color components, you need to use correct fixed point math:

  i = i0 + t * ( ( i1 - i0 ) + 127 ) / 255

where t = interpolation parameter [0..255] i0= source color value [0..255] i1= transfer color value [0..255] i = output color

If you leave out the +127 then your colors will be biased towards the darker end. Very often, people use /256 or >> 8 for speed. This is not correct! If you divide by 256, you will never be able to reach pure white (255,255,255) because 255/256 is slightly less than one.

I hope this helps.

Vinnie Falco
  • 5,173
  • 28
  • 43
  • Interesting ideas there, but you do pay a steep price for your / 255. You have to calculate an intermediate 16 bit result using t * ( ( v1 - v0 ) + 127 ) that you then divide. Are you sure that your formula is really simpler than ( 1-t ) * v0 + t * v1 ? Remember that 1-t is pre-calculated and that / is often more expensive than * – Tom Leys Aug 05 '09 at 23:34
  • 1
    The formula is a reference for what the numerically correct formula looks like. It is certainly slower, however the results are accurate. It is useful to know what the right answer looks like in order to determine if the error in the optimized result is acceptible or not. – Vinnie Falco Aug 07 '09 at 09:39
  • Yes, i = i0 + t * ( ( i1 - i0 ) + 127 ) / 255 is more efficient than your formula, which for integers would be (I think) : i = ( ( 255 - t ) * i0 + ( t * i1 ) ) / 255 – Vinnie Falco Aug 07 '09 at 09:42
  • Most images on the PC have Gamma burnt in. So if it's pixel value is say 127, that's NOT exactly half way between white and black. It's actual brighness is.. powf( (c) / 255.f, gamma) .. or about 0.19 So all your calculations that assumes pixels brightness is linear are wrong. – Jeff McClintock Mar 31 '10 at 23:15
  • With Guilerme answer this makes for a correct and fast blending. Thanks! – ponce Dec 11 '11 at 19:25
  • If you want accuracy, you have to make sure RGB are linear. Many people alpha blend with gamma-corrected RGB and get bad results. – Adrian McCarthy Aug 17 '12 at 22:37
  • @VinnieFalco: Assuming `i`,`i0`,`i1`,`t` are 8-bit `unsigned char`, how does `i = i0 + t * ( ( v1 - v0 ) + 127 ) / 255` prevent overflow or underflow? Also, the equation uses `v0` and `v1` and doesn't use `i1`. There seems to be some mix-up here. – Adi Shavit May 28 '14 at 12:01
  • @AdiShavit You are correct, there was a mixup. There was a typo in the variables. v, v0, and v1 refer to floating point luminances while i, i0, and i1 refer to integer luminances (range 0..255). I have corrected the original text. The formula requires that t, i0, i1 are in the range [0, 255]. If you want to prevent underflow or overflow you would need to clamp your values. What the formula does is evenly distribute the output values across the entire range of possible brightness values. Without the rounding (+127), there would be more combinations of inputs that result in 0 instead of 255. – Vinnie Falco May 28 '14 at 14:48
2

I've done similar code in unsafe C#. Is there any reason you aren't looping through each pixel directly? Why use all the BYTE* and GET_BYTE() calls? That is probably part of the speed issue.

What does GET_GRAY look like?

More importantly, are you sure your platform doesn't expose alpha blending capabilities? What platform are you targeting? Wiki informs me that the following support it out of the box:

  • Mac OS X
  • Windows 2000, XP, Server 2003, Windows CE, Vista and Windows 7
  • The XRender extension to the X Window System (this includes modern Linux systems)
  • RISC OS Adjust
  • QNX Neutrino
  • Plan 9
  • Inferno
  • AmigaOS 4.1
  • BeOS, Zeta and Haiku
  • Syllable
  • MorphOS
colithium
  • 10,269
  • 5
  • 42
  • 57
  • This alpha blend is used for a certain image enhancement algorithm, not for displaying. So I can not use platform capabilities. Thanks! remove most GET_BYTE() seems useless, maybe the multiply operation and divid 255 operation is the problem. – user25749 Jul 09 '09 at 13:27
  • 2
    Even if you aren't displaying the image you can still definitely use platform capabilities. For example, on Windows you can use GDI+ or the .NET wrappers to do alpha blending without ever displaying it. I'd assume other platforms are similar. – colithium Jul 09 '09 at 21:26
2

I think hardware support will help you. try to move the logic from software to hardware if feasible

Umair Ahmed
  • 11,238
  • 5
  • 33
  • 39
2

The main problem will be the poor loop construct, possibly made worse by a compiler failing to eliminate CSEs. Move the real common bits outside the loops. int red isn't common, though - that should be inside the inner loop.

Furthermore, red, green and blue are independent. If you calculate them in turn, you don't need to keep interim red results in registers when you are calculating green results. This is especially important on CPUs with limited registers like x86.

There will be only a limited number of values allowed for bytepp. Make it a template parameter, and then call the right instantiation from a switch. This will produce multiple copies of your function, but each can be optimized a lot better.

As noted, clamping is not needed. In alphablending, you're creating a linear combination of two images a[x][y] and b[x][y]. Since 0<=alpha<=255, you know that each output is bound by max(255*a[x][y], 255*b[x][y]). And since your output range is the same as both input ranges (0-255), this is OK.

With a small loss of precision, you could calculate (a[x][y]*alpha + b[x][y]*(256-alpha))>>8. Bitshifts are often faster than division.

MSalters
  • 173,980
  • 10
  • 155
  • 350
  • 2
    Modern CPUs prefer interleaved instructions as much as possible. This is because independent work (i.e calculating R while G is processing) suits the pipelined nature of modern CPUs well. See Intel optimisation manual : http://www.intel.com/Assets/PDF/manual/248966.pdf. - The registers might seem limited to you, but the CPU has many more actual registers than you think using "register renaming" – Tom Leys Jul 09 '09 at 23:28
1

Depending on the target architecture, you could try either vectorize or parallellize the function.

Other than that, try to linearize the whole method (i.e. no loop-in-loop) and work with a quadruple of bytes at once, that would lose the overhead of working with single bytes plus make it easier for the compiler to optimize the code.

Christoffer
  • 12,712
  • 7
  • 37
  • 53
1

Move it to the GPU.

Crashworks
  • 40,496
  • 12
  • 101
  • 170
1

I am assuming that you want to do this in a completely portable way, without the help of a GPU or the use of a proprietary Intel SIMD library (which may not work as efficiently on AMD processors).

Put the following in place of your calculation for RGB:

// Fixed: '+' binds tighter than '>>' in C/C++, so the original
// "TopR + (SourceR * alpha) >> 8" parsed as "(TopR + SourceR*alpha) >> 8",
// which shifts the already-summed value and darkens the result.  The
// intended premultiplied-Top formula needs explicit parentheses:
R = TopR + ((SourceR * alpha) >> 8);
G = TopG + ((SourceG * alpha) >> 8);
B = TopB + ((SourceB * alpha) >> 8);

It is a more efficient calculation.

Also use shift left instruction on your get pixel macro instead of multiplying by the BPP.

Adrian Regan
  • 2,240
  • 13
  • 11
1

This one works when the first color, (colora, the destination) has also alpha channel (blending two transparent ARGB colors) The alpha is in the second color's alpha (colorb, the source)

This adds the two alphas (0 = transparent, 255 = fully opaque) It is a modified version of Jasper Bekkers' answer.

I use it to blend transparent pixel art on to a transparent screen.

// Composite ARGB colorb over ARGB colora where BOTH inputs may be partially
// transparent.  colorb's alpha drives the colour blend; the output alpha is
// the saturating sum of the two input alphas (0 = transparent, 255 = opaque).
// Modified version of Jasper Bekkers' answer; ">> 8" approximates "/ 255".
Uint32 alphaBlend(unsigned int colora, unsigned int colorb) {
    unsigned int srcA = (colorb & 0xFF000000) >> 24;
    // Degenerate alphas need no arithmetic at all.
    if (srcA == 0)   return colora;
    if (srcA == 255) return colorb;
    unsigned int dstA = (colora & 0xFF000000) >> 24;
    unsigned int invA = 0x100 - srcA;
    // Red+blue multiplexed into one multiply per source; green separate.
    unsigned int rbDst = (invA * (colora & 0xFF00FF)) >> 8;
    unsigned int rbSrc = (srcA * (colorb & 0xFF00FF)) >> 8;
    unsigned int gDst  = (invA * (colora & 0x00FF00)) >> 8;
    unsigned int gSrc  = (srcA * (colorb & 0x00FF00)) >> 8;
    // Additive alpha, clamped at fully opaque.
    unsigned int outA = dstA + srcA;
    if (outA > 255) {outA = 255;}
    return ((rbDst + rbSrc) & 0xFF00FF) + ((gDst + gSrc) & 0x00FF00) + (outA << 24);
}
0

Here's my adaption of a software alpha blend that works well for 2 unsigned integers.

My code differs a bit as the code above is basically always assuming the destination alpha is 255.

With a decent optimizing compiler most calculations should be in registers as the scope of most variables is very short. I also opted to progressively shift the result << 8 incrementally to avoid << 24, << 16 when putting the ARGB back together. I know it's a long time ago... but I remember on the 286 cycles for a shift was (1 + 1*each bit shifted) so assume there is still some sort of penalty for larger shifts.

Also... instead of "/ 255" I opted for ">> 8" which can be changed as desired.

/*
    alpha blend source and destination, either may have an alpha!!!!

    Src  AAAAAAAA RRRRRRRR GGGGGGGG BBBBBBBB
    Dest AAAAAAAA RRRRRRRR GGGGGGGG BBBBBBBB

    res  AAAAAAAA RRRRRRRR GGGGGGGG BBBBBBBB

    NOTE - α = αsrc + αdest(1.0-αsrc)  where α = 0.0 - 1.0

    ALSO - DWORD is unsigned int so (F8000000 >> 24) = F8 not FFFFFFF8 as it would with int (signed)
    */

    // NOTE(review): each ">> 8" below approximates "/ 255", so pure white
    // inputs land on 254 rather than 255.  Also, (c1*a + c2*(255-a)) can
    // reach 130050, so a channel after ">> 8" may exceed 0xFF; its 9th bit
    // then ORs into the low bit of the previously packed channel in the
    // progressive (res << 8) | ch accumulation — presumably acceptable for
    // this renderer, but worth confirming.
    inline DWORD raw_blend(const DWORD src, const DWORD dest)
    {       
        // setup and calculate α

        DWORD src_a = src >> 24;       
        DWORD src_a_neg = 255 - src_a;
        DWORD dest_a = dest >> 24;

        // output alpha: src_a + dest_a*(1 - src_a), in 8-bit fixed point
        DWORD res = src_a + ((dest_a * src_a_neg) >> 8);

        // setup and calculate R

        DWORD src_r = (src >> 16) & 255;
        DWORD dest_r = (dest >> 16) & 255;

        // shift the accumulated channels up and append red in the low byte
        res = (res << 8) | (((src_r * src_a) + (dest_r * src_a_neg)) >> 8);

        // setup and calculate G

        DWORD src_g = (src >> 8) & 255;
        DWORD dest_g = (dest >> 8) & 255;

        res = (res << 8) | (((src_g * src_a) + (dest_g * src_a_neg)) >> 8);

        // setup and calculate B

        DWORD src_b = src & 255;
        DWORD dest_b = dest & 255;

        // final shift packs A,R,G into their ARGB positions with B low
        return (res << 8) | (((src_b * src_a) + (dest_b * src_a_neg)) >> 8);
    }
TheWhitde
  • 21
  • 2
0
; In\   EAX = background color (ZRBG) 32bit (Z mean zero, always is zero)
; In\   EDX = foreground color (RBGA) 32bit
; Out\  EAX = new color
; free registers (R10, RDI, RSI, RSP, RBP)
;
; NOTE(review): this routine also writes r9 and r12-r15.  r12-r15 are
; callee-saved in both the System V and Windows x64 ABIs, so callers must
; treat them as clobbered here -- confirm against this project's calling
; convention.
abg2:
    xor r9d, r9d                ; fixed: clear the accumulator so the "Z"
                                ; byte of the result is really zero (it was
                                ; previously left as stale register bits)
    mov r15b, dl                ; av (foreground alpha)
    movzx ecx, dl
    not ecx                     ; faster than 255 - dl (only cl is used)
    mov r14b, cl                ; rem = 255 - alpha

    shr edx, 8
    and edx, 0x00FFFFFF
    mov r12d, edx
    mov r13d, eax               ; RBGA ---> ZRGB

    ; s: eax
    ; d: edx

    ;=============================red = ((s >> 16) * rem + (d >> 16) * av) >> 8;
    mov edx, r12d
    shr edx, 0x10
    movzx eax, r14b
    imul edx, eax
    mov ecx, r13d
    shr ecx, 0x10
    movzx eax, r15b
    imul eax, ecx
    lea eax, [eax + edx]                    ; faster than add eax, edx
    shr eax, 0x8
    mov r9b, al
    shl r9d, 8

    ;=============================green = (((s >> 8) & 0x0000ff) * rem + ((d >> 8) & 0x0000ff) * av) >> 8;
    mov eax, r12d
    shr eax, 0x8
    movzx edx, al
    movzx eax, r14b
    imul edx, eax
    mov eax, r13d
    shr eax, 0x8
    movzx ecx, al
    movzx eax, r15b
    imul eax, ecx
    lea eax, [eax + edx]                    ; fixed: original had a stray
                                            ; comma -- "[eax, + edx]" -- which
                                            ; is not valid addressing syntax
    shr eax, 0x8
    mov r9b, al
    shl r9d, 8

    ;=============================blue = ((s & 0x0000ff) * rem + (d & 0x0000ff) * av) >> 8;
    movzx edx, r12b
    movzx eax, r14b
    imul edx, eax
    movzx ecx, r13b
    movzx eax, r15b
    imul eax, ecx
    lea eax, [eax + edx]                ; faster than add eax, edx
    shr eax, 0x8
    mov r9b, al

    ; r9d now holds the progressively shifted ZRGB result
    mov eax, r9d
    ret
Mahdi Mohammadi
  • 239
  • 2
  • 7