
I understand that 32-bit MMX intrinsics and the __m64 type are no longer available when targeting x64, so I was having great trouble upgrading this piece of code to SSE. I was told on another Stack Overflow post to post my code; perhaps this exercise will help others as well.

I commented out `_mm_empty`, thinking that was the right thing to do, and I found equivalent functions in emmintrin.h for all the other __m128i operations, but something is still wrong.
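
For reference, here is the rough MMX → SSE2 mapping I was working from for the intrinsics used below (my own reading of emmintrin.h, so treat it as a sketch rather than an authoritative table):

// MMX (mmintrin.h)        ->  SSE2 (emmintrin.h)
// __m64                   ->  __m128i
// _mm_setzero_si64()      ->  _mm_setzero_si128()
// _m_pmaddwd(a, b)        ->  _mm_madd_epi16(a, b)
// _mm_add_pi32(a, b)      ->  _mm_add_epi32(a, b)
// _mm_srl_si64(a, count)  ->  _mm_srl_epi64(a, count)
// _m_from_int(x)          ->  _mm_cvtsi32_si128(x)
// _m_to_int(a)            ->  _mm_cvtsi128_si32(a)
// _mm_empty()             ->  not needed; XMM registers don't alias the x87/MMX state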

Original 32-bit function code:

DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // get local vars representing parameters from original call
    ushort* arrayIn     = m_taskdata.arrayIn;
    ushort arrayLen0    = m_taskdata.arrayLen0;
    ushort arrayLen1    = m_taskdata.arrayLen1;
    ushort* kernel      = m_taskdata.kernel;
    ushort kernelLen    = m_taskdata.kernelLen;
    uint32_t* norm_r        = m_taskdata.norm_r;
    ushort* outputArray = m_taskdata.outputArray;

    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    ushort tailLength = (ushort)((kernelLen - 1) / 2);

    _ASSERTE(interArray);

    //ushort* pRow = NULL; // the current row
    //ushort* pInterRow = NULL; // the current row in the interarray

    INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmetic
    INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmetic
    INT_PTR rowstride = sizeof(ushort)*arrayLen1;
    INT_PTR lpKernel;

    // adjust for non-zero start
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;

    // want to process only those (edge) pixels that need the inner loop condition
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; // use this to work inside the edges

    int h, i;
    uint sum, points;
    uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
    INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;

    // use this for MMX optimizations
    int fourcount = kernelLen/4;
    int remainingcount = kernelLen%4;
    int mmxcount = 4*fourcount; // this is where the remainder is handled
    int loopcount = 0; // use this for fast loop tests

    _mm_empty();
    __m64 accu, temp;
    __m64 shifter = _m_from_int(32);

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // skip over left edge
        lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

        for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
        {
            sum = 0;
            points = 0;
            lpKernel = (INT_PTR)kernel;

            lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row

            // MMX Optimizations
            accu = _mm_setzero_si64(); // zero the accumulator

            // VECTOR processing
            for (loopcount = fourcount; loopcount != 0; loopcount--) // for each kernel item that can be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));

                // _m_pmaddwd: 4*16-bit multiply-add, resulting in two 32-bit sums = [a0*b0+a1*b1 ; a2*b2+a3*b3]
                // _mm_add_pi32/_m_paddd: 2*32-bit add
                temp = _m_pmaddwd(*(__m64*)lpKernel, *(__m64*)lpInnerPixels);

                accu = _mm_add_pi32(accu, temp); // each double word has a partial sum

                lpKernel += 8; lpInnerPixels += 8;

            } // loop over the kernel

            // copy hi-dword of mm0 to lo-dword of mm1, then sum mm0+mm1
            // and finally store the result into the variable "accu"
            accu = _mm_add_pi32(accu, _mm_srl_si64(accu, shifter)); // combine results from upper and lower double words

            sum = _m_to_int(accu); // move mmx result to sum

            // SCALAR
            for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
                sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
                //points++;
                lpKernel += 2; lpInnerPixels += 2;
            } // loop over the kernel


            //*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));

            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // move to next column sizeof(ushort)
        } // for each column


        lpRow += rowstride; // move to next row ( h * arrayLen1 )
        lpInterRow += rowstride;


    } // for each row

    _mm_empty();

    return 0;

}

64-bit attempt:

DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // get local vars representing parameters from original call
    ushort* arrayIn     = m_taskdata.arrayIn;
    ushort arrayLen0    = m_taskdata.arrayLen0;
    ushort arrayLen1    = m_taskdata.arrayLen1;
    ushort* kernel      = m_taskdata.kernel;
    ushort kernelLen    = m_taskdata.kernelLen;
    uint32_t* norm_r        = m_taskdata.norm_r;
    ushort* outputArray = m_taskdata.outputArray;

    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    ushort tailLength = (ushort)((kernelLen - 1) / 2);

    _ASSERTE(interArray);

    //ushort* pRow = NULL; // the current row
    //ushort* pInterRow = NULL; // the current row in the interarray

    INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmetic
    INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmetic
    INT_PTR rowstride = sizeof(ushort)*arrayLen1;
    INT_PTR lpKernel;

    // adjust for non-zero start
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;


    // want to process only those (edge) pixels that need the inner loop condition
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; // use this to work inside the edges

    int h, i;
    uint sum, points;
    uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
    INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;

    // use this for MMX optimizations
    int fourcount = kernelLen/4;
    int remainingcount = kernelLen%4;
    int mmxcount = 4*fourcount; // this is where the remainder is handled
    int loopcount = 0; // use this for fast loop tests

    //_mm_empty();
    __m128i accu, temp;
    __m128i shifter = _mm_cvtsi32_si128(32);

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // skip over left edge
        lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

        for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
        {
            sum = 0;
            points = 0;
            lpKernel = (INT_PTR)kernel;

            lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row

            // MMX Optimizations
            accu = _mm_setzero_si128(); // zero the accumulator

            // VECTOR processing
            for (loopcount = fourcount; loopcount != 0; loopcount--) // for each kernel item that can be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));

                // _m_pmaddwd: 4*16-bit multiply-add, resulting in two 32-bit sums = [a0*b0+a1*b1 ; a2*b2+a3*b3]
                // _mm_add_pi32/_m_paddd: 2*32-bit add
                //temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
                temp = _mm_madd_epi16(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);

                accu = _mm_add_epi32(accu, temp); // each double word has a partial sum

                lpKernel += 8; lpInnerPixels += 8;

            } // loop over the kernel

            // copy hi-dword of mm0 to lo-dword of mm1, then sum mm0+mm1
            // and finally store the result into the variable "accu"
            accu = _mm_add_epi32(accu, _mm_sll_epi64(accu, shifter)); // combine results from upper and lower double words

            sum = _mm_cvtsi128_si32(accu); // move mmx result to sum

            // SCALAR
            for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
                sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
                //points++;
                lpKernel += 2; lpInnerPixels += 2;
            } // loop over the kernel


            //*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));

            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // move to next column sizeof(ushort)
        } // for each column


        lpRow += rowstride; // move to next row ( h * arrayLen1 )
        lpInterRow += rowstride;


    } // for each row

    //_mm_empty();

    return 0;

}

  • If I were you, my first step would be to make a "test" project where I was simply trying to verify that loading, summing, storing, etc. of my SSE registers with intrinsics was behaving as intended. – RyanP Sep 09 '15 at 12:17
  • But for the record `(__m128i*)lpKernel` won't work. You actually need to use the aligned and/or unaligned load commands to load things from aligned and unaligned memory into SSE registers before you can work on them. – RyanP Sep 09 '15 at 12:19
  • You mean one of these, @RyanP? `extern __m128i _mm_load_si128(__m128i const*_P); extern __m128i _mm_loadu_si128(__m128i const*_P); extern __m128i _mm_loadl_epi64(__m128i const*_P);` (see the sketch after these comments) – Robert Koernke Sep 10 '15 at 13:33
  • My solution no longer fails, but the image is now 'convoluted'... hehe, couldn't resist. Meaning I can see what it's supposed to look like, but there are major problems. So @RyanP was correct: the casting above was causing the catastrophic failure, but something else is still not right. The current revision looks like this: `mlpkernel = _mm_cvtsi64_si128(lpKernel); mlpInnerPixels = _mm_cvtsi64_si128(lpInnerPixels); temp = _mm_madd_epi16(mlpkernel, mlpInnerPixels);` – Robert Koernke Sep 10 '15 at 15:12
  • It's working now. I'll post the full code in the answer below. But here are the fixes (my last comment was wrong): `mlpkernel = _mm_cvtsi64_si128(*(__int64*)lpKernel); mlpInnerPixels = _mm_cvtsi64_si128(*(__int64*)lpInnerPixels);` Then: `accu = _mm_add_epi32(accu, _mm_srl_epi64(accu, shifter));` That's 'srl' instead of 'sll'. – Robert Koernke Sep 15 '15 at 17:04
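
A minimal sketch of the load options mentioned in the comments above, assuming `lpKernel` holds the address of ushort data that is not guaranteed to be 16-byte aligned:

__m128i k8 = _mm_loadu_si128((const __m128i*)lpKernel);   // 8 ushorts, unaligned load
__m128i k4 = _mm_loadl_epi64((const __m128i*)lpKernel);   // 4 ushorts into the low 64 bits, upper half zeroed
//__m128i k8a = _mm_load_si128((const __m128i*)lpKernel); // aligned load: faults if the address isn't 16-byte aligned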

1 Answer

With all of the issues mentioned in the comments above fixed, here is the final working x64 SSE convolution code:

DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // get local vars representing parameters from original call
    ushort* arrayIn     = m_taskdata.arrayIn;
    ushort arrayLen0    = m_taskdata.arrayLen0;
    ushort arrayLen1    = m_taskdata.arrayLen1;
    ushort* kernel      = m_taskdata.kernel;
    ushort kernelLen    = m_taskdata.kernelLen;
    uint32_t* norm_r        = m_taskdata.norm_r;
    ushort* outputArray = m_taskdata.outputArray;

    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    ushort tailLength = (ushort)((kernelLen - 1) / 2);

    _ASSERTE(interArray);

    //ushort* pRow = NULL; // the current row
    //ushort* pInterRow = NULL; // the current row in the interarray

    INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmetic
    INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmetic
    INT_PTR rowstride = sizeof(ushort)*arrayLen1;
    INT_PTR lpKernel;

    // adjust for non-zero start
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;


    // want to process only those (edge) pixels that need the inner loop condition
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; // use this to work inside the edges

    int h, i;
    uint sum, points;
    uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
    INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;

    // use this for SSE optimizations
    int fourcount = kernelLen/4;
    int remainingcount = kernelLen%4;
    int mmxcount = 4*fourcount; // this is where the remainder is handled
    int loopcount = 0; // use this for fast loop tests

    //_mm_empty();
    __m128i accu, temp, mlpkernel, mlpInnerPixels;
    __m128i shifter = _mm_cvtsi32_si128(32);

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // skip over left edge
        lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
        {
            sum = 0;
            points = 0;
            lpKernel = (INT_PTR)kernel;

            lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row

            // SSE optimizations
            accu = _mm_setzero_si128(); // zero the accumulator

            // VECTOR processing
            for (loopcount = fourcount; loopcount != 0; loopcount--) // for each kernel item that can be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));

                // _mm_madd_epi16 (pmaddwd): 4*16-bit multiply-add, resulting in two 32-bit sums = [a0*b0+a1*b1 ; a2*b2+a3*b3]
                // _mm_add_epi32 (paddd): 2*32-bit add
                //temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
                //mlpkernel = _mm_cvtsi32_si128(lpKernel);
                mlpkernel = _mm_cvtsi64_si128(*(__int64*)lpKernel);
                mlpInnerPixels = _mm_cvtsi64_si128(*(__int64*)lpInnerPixels);
                temp = _mm_madd_epi16(mlpkernel, mlpInnerPixels);

                accu = _mm_add_epi32(accu, temp); // each double word has a partial sum

                lpKernel += 8; lpInnerPixels += 8;

            } // loop over the kernel

            // shift the high double word of the low quadword down onto the low double word,
            // then add, leaving the combined result in the low double word of "accu"
            accu = _mm_add_epi32(accu, _mm_srl_epi64(accu, shifter)); // combine results from upper and lower double words

            sum = _mm_cvtsi128_si32(accu); // move the SSE result to sum

            // SCALAR
            for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
                sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
                //points++;
                lpKernel += 2; lpInnerPixels += 2;
            } // loop over the kernel


            //*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));

            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // move to next column sizeof(ushort)
        } // for each column


        lpRow += rowstride; // move to next row ( h * arrayLen1 )
        lpInterRow += rowstride;


    } // for each row

    //_mm_empty();

    return 0;

}
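
For what it's worth, the two `_mm_cvtsi64_si128(*(__int64*)...)` loads above could also be written with `_mm_loadl_epi64`, and the final combine could use a byte shift with an immediate count instead of a shift count held in a register. This is only an equivalent sketch of the same inner step, not a change to the working code:

            // 4 kernel / 4 pixel ushorts into the low 64 bits of each register, upper halves zeroed
            mlpkernel      = _mm_loadl_epi64((const __m128i*)lpKernel);
            mlpInnerPixels = _mm_loadl_epi64((const __m128i*)lpInnerPixels);
            accu = _mm_add_epi32(accu, _mm_madd_epi16(mlpkernel, mlpInnerPixels));
            // ... rest of the vector loop unchanged ...

        // after the vector loop: fold the second partial sum (dword 1) into dword 0
        accu = _mm_add_epi32(accu, _mm_srli_si128(accu, 4)); // byte shift right by 4 = 32 bits
        sum  = _mm_cvtsi128_si32(accu);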