Dot production using sse

Question

#define Size 50000

void main()
{

    unsigned char *arry1 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
    unsigned char *arry2 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
    unsigned int *result = (unsigned int*)malloc(sizeof(unsigned int)* Size);


    for (int i = 0; i < 16; i++)
    {
        arry1[i]  = i;
        arry2[i]  = i;
    }


    __m128i Z = _mm_setzero_si128();
    __m128i vsum = _mm_setzero_si128();
    //__m128i dummy = _mm_setzero_si128();

    for (int j = 0; j < 16; j += 16)
    {
        //printf("%d\n\n", j);

        __m128i test1 = _mm_setzero_si128();
            test1 = _mm_loadu_si128((__m128i*)&arry1[j]);
            __m128i test2 = _mm_setzero_si128();
            test2 = _mm_loadu_si128((__m128i*)&arry2[j]);

        __m128i s16L = _mm_unpacklo_epi8(test1, Z);
        __m128i s16H = _mm_unpackhi_epi8(test1, Z);
        __m128i s32LL = _mm_unpacklo_epi16(s16L, Z);
        __m128i s32LH = _mm_unpackhi_epi16(s16L, Z);
        __m128i s32HL = _mm_unpacklo_epi16(s16H, Z);
        __m128i s32HH = _mm_unpackhi_epi16(s16H, Z);

        __m128i t16L = _mm_unpacklo_epi8(test2, Z);
        __m128i t16H = _mm_unpackhi_epi8(test2, Z);
        __m128i t32LL = _mm_unpacklo_epi16(t16L, Z);
        __m128i t32LH = _mm_unpackhi_epi16(t16L, Z);
        __m128i t32HL = _mm_unpacklo_epi16(t16H, Z);
        __m128i t32HH = _mm_unpackhi_epi16(t16H, Z);

        __m128 s1 = _mm_cvtepi32_ps(s32LL);
        __m128 s2 = _mm_cvtepi32_ps(s32LH);
        __m128 s3 = _mm_cvtepi32_ps(s32HL);
        __m128 s4 = _mm_cvtepi32_ps(s32HH);

        __m128 t1 = _mm_cvtepi32_ps(t32LL);
        __m128 t2 = _mm_cvtepi32_ps(t32LH);
        __m128 t3 = _mm_cvtepi32_ps(t32HL);
        __m128 t4 = _mm_cvtepi32_ps(t32HH);

        s1 = _mm_mul_ps(s1, t1);
        s2 = _mm_mul_ps(s2, t2);
        s3 = _mm_mul_ps(s3, t3);
        s4 = _mm_mul_ps(s4, t4);

        s1 = _mm_hadd_ps(s1, s2);//41,13
        s3 = _mm_hadd_ps(s3, s4); //313,221

        vsum = _mm_cvtps_epi32(s3);

        for (int k = 0; k < 16; k++)
        {
            printf("%u\n", (unsigned char)vsum.m128i_i8[k]);
        }

        s1 = _mm_hadd_ps(s1, s3); //734, 14
        s1 = _mm_hadd_ps(s1, s1); //1100,140
        s1 = _mm_hadd_ps(s1, s1); //1240


    }


}

I progressing dot production using sse. I'm using _mm_mul_ps and _mm_hadd_ps instruction not _mm_dp_ps. If values that after _mm_hadd_ps function is over 255, it is displayed wrong value.

For example, correct value of s3 is {0,0,0,421,0,0,0,313,0,0,0,221,0,0,0,145}. But {0,0,1,165,0,0,1,57,0,0,0,221,0,0,0,145} is printed. Is this the result I declared arry1, arry2 as unsigned char? I know 255 is max value by 8-bit.

`s3` is 4 x 32 bit ints but you are displaying it as 16 x 8 bit unsigned chars. — Paul R, Mar 01 '17 at 22:44

score 3 · Accepted Answer · answered Mar 02 '17 at 05:53

I can see here some problems:

1) If you want to calculate dot product of 50000 uint8_t values, it is OK. But dot product of 70000 values can cause of overflow of uint32_t type. Therefore using of uint64_t is better solution.

2) To calculate dot product of integer vectors it is not necessary to use float point numbers. It is more effective to use for calculation only integers.

There is an example of SSE2 function which calculates dot product for two uint8_t vectors:

#include <algorithm>
#include <emmintrin.h>

const __m128i Z = _mm_setzero_si128();
const size_t A = sizeof(__m128i);
const size_t B = 0x10000;

inline __m128i DotProduct32(const uint8_t * a, const uint8_t * b)
{
    __m128i _a = _mm_loadu_si128((__m128i*)a);
    __m128i _b = _mm_loadu_si128((__m128i*)b);
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(_a, Z), _mm_unpacklo_epi8(_b, Z));
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(_a, Z), _mm_unpackhi_epi8(_b, Z));
    return _mm_add_epi32(lo, hi);
}

inline __m128i HorizontalSum32(__m128i a)
{
    return _mm_add_epi64(_mm_unpacklo_epi32(a, Z), _mm_unpackhi_epi32(a, Z));
}

inline uint64_t ExtractSum64(__m128i a)
{
    uint64_t  _a[2];
    _mm_storeu_si128((__m128i*)_a, a);
    return _a[0] + _a[1];
}

void DotProduct(const uint8_t * a, const uint8_t * b, size_t size, uint64_t * sum)
{
    size_t blockNumber = (size + B - 1)/B;
    size_t alignedSize = size/A*A;
    size_t i = 0;

    __m128i sum64 = Z;
    for (size_t block = 0; block < blockNumber; ++i)
    {
        __m128i sum32 = Z;
        for (size_t blockEnd = std::min(alignedSize, i + B); i < blockEnd; i += A)
            sum32 = _mm_add_epi32(sum32, DotProduct32(a + i, b + i));
        sum64 = _mm_add_epi64(sum64, HorizontalSum32(sum32));
    }

    *sum = ExtractSum64(sum64);
    for (; i < size; ++i)
        *sum += a[i] * b[i];
}

Dot production using sse

1 Answers1