64-bit element-by-element multiply of __m256i (m256i_i64) when the result is bigger than the long long maximum

Question

// Overlays a 256-bit integer vector with its four signed 64-bit lanes so the
// lanes can be read or written as ordinary scalars.
union sseUnion
{
    int64_t position[4];  // the four lanes viewed as scalars
    __m256i mVec256;      // the same 32 bytes viewed as one AVX2 integer vector
};

// Element-wise multiply of four 64-bit lanes, keeping the low 64 bits of each
// product. AVX2 has no single instruction for this, so the product is
// assembled from 32-bit multiplies.
//
// Writing each lane as a = aH:aL and b = bH:bL (32-bit halves), the low
// 64 bits of a*b are
//     aL*bL + ((aL*bH + aH*bL) << 32)
// Results that do not fit in 64 bits wrap around.
__m256i mul64_haswell_mul(__m256i a, __m256i b) {
    // aL*bL for every lane, as full 64-bit unsigned products.
    __m256i low_product = _mm256_mul_epu32(a, b);

    // Swap the 32-bit halves of b so that one 32-bit multiply produces both
    // cross terms (low 32 bits of aL*bH and of aH*bL) side by side.
    __m256i b_swapped = _mm256_shuffle_epi32(b, 0xB1);
    __m256i cross_terms = _mm256_mullo_epi32(a, b_swapped);

    // Add each adjacent pair; per 128-bit half this yields [s0, s1, 0, 0].
    __m256i cross_sums = _mm256_hadd_epi32(cross_terms, _mm256_setzero_si256());

    // Place each sum in the upper dword of its own lane: [0, s0, 0, s1].
    __m256i cross_high = _mm256_shuffle_epi32(cross_sums, 0x73);

    return _mm256_add_epi64(low_product, cross_high);
}

// Demo driver: builds two vectors whose four lanes are all equal, multiplies
// them twice with mul64_haswell_mul, and prints lane 0 of the result.
int main()
{
    sseUnion _sseUnion;
    _sseUnion.mVec256 = _mm256_set_epi64x(1000000, 1000000, 1000000, 1000000);
    sseUnion a2;
    a2.mVec256 = _mm256_setr_epi64x(401000000, 401000000, 401000000, 401000000);
    a2.mVec256 = _mm256_add_epi64(_sseUnion.mVec256, a2.mVec256);   // 402000000 per lane

    a2.mVec256 = mul64_haswell_mul(_sseUnion.mVec256, a2.mVec256);  // 402000000000000 per lane
    // The second product no longer fits in 64 bits, so every lane wraps
    // modulo 2^64.
    a2.mVec256 = mul64_haswell_mul(_sseUnion.mVec256, a2.mVec256);

    // "%d" expects an int; handing it a 64-bit value is undefined behaviour.
    // Print the lane as long long instead, reading it through the union's
    // scalar view rather than the compiler-specific m256i_i64 member.
    printf("%lld", (long long)a2.position[0]);
}

The values in a2.position[0..3] end up bigger than the int64_t maximum, so I get a wrong value: read as unsigned, the result is 14618374452099416064. I just want such results clamped to the int64_t maximum. How can I do that?

I assume you want to do a 64*64 bit multiply into a 128 bit result. _mm256_mullo_epi64 gives a 64*64 bit multiply with only 64 bit result if your computer has AVX512. — A Fog, Jun 06 '22 at 04:50
This post may be helpful: https://stackoverflow.com/questions/60292916/how-do-i-use-mmx-mulh-and-mull-for-two-64-bit-integers-to-get-one-128-bit-intege/60293076#60293076 — A Fog, Jun 06 '22 at 04:50
You want a (signed) `int64 * int64` which saturates in case of overflows? Would an unsigned product also work for you? Can overflows happen at both ends? (I assume results smaller than `int64_t minValue` should saturate to that) — chtz, Jun 06 '22 at 15:25
In the end I used _mm256_mask_i64gather_epi64 to replace the overflowed result. — 张文阳, Jun 09 '22 at 08:22

张文阳 · Answer 1 · 2022-06-09T08:28:32.453

    // Saturating signed multiply of the 64-bit lanes of `a` and `b`:
    // multiply the magnitudes, clamp lanes whose product has bit 63 set to
    // the int64_t maximum, then give the result the sign of the true product.
    int64_t  maxValue = 0x7FFFFFFFFFFFFFFF;
    __m256i zero = _mm256_setzero_si256();
    __m256i max = _mm256_set_epi64x(maxValue, maxValue, maxValue, maxValue);

    // Per-lane sign masks: all ones where the input is negative, else zero.
    // The masks must cover the whole lane: with only bit 63 set,
    // (x ^ m) - m returns x unchanged and no negation ever happens.
    __m256i signa = _mm256_cmpgt_epi64(zero, a);
    __m256i signb = _mm256_cmpgt_epi64(zero, b);

    // (x ^ m) - m negates x where m is all ones and leaves it alone where m is 0.
    __m256i absA = _mm256_sub_epi64(_mm256_xor_si256(a, signa), signa);
    __m256i absB = _mm256_sub_epi64(_mm256_xor_si256(b, signb), signb);
    __m256i prod = mul64_mul(absA, absB);

    // A product of two magnitudes that comes back negative has overflowed
    // into bit 63; replace those lanes with the int64_t maximum.
    // NOTE(review): mul64_mul returns only the low 64 bits, so a product that
    // wraps past 2^64 and lands with bit 63 clear is not detected here.
    __m256i resultSign = _mm256_cmpgt_epi64(zero, prod);
    __m256i result = _mm256_blendv_epi8(prod, max, resultSign);

    // Negate the lanes where exactly one input was negative.
    __m256i resultSign1 = _mm256_xor_si256(signa, signb);
    __m256i result1 = _mm256_sub_epi64(_mm256_xor_si256(result, resultSign1), resultSign1);

// Multiplies the four 64-bit lanes of a and b element by element and returns
// the low 64 bits of each product (results wrap on overflow).
//
// Each lane is split into 32-bit halves, a = aH:aL and b = bH:bL, and the
// product is rebuilt as aL*bL + ((aH*bL + aL*bH) << 32).
__m256i mul64_mul(__m256i a, __m256i b)
{
    __m256i const no_bits = _mm256_setzero_si256();

    // Exchange the halves of a, then multiply dword-wise by b: each lane now
    // holds the low 32 bits of bL*aH and of bH*aL next to each other.
    __m256i a_halves_swapped = _mm256_shuffle_epi32(a, 0xB1);
    __m256i cross = _mm256_mullo_epi32(b, a_halves_swapped);

    // Sum each pair of cross terms ([s0, s1, 0, 0] per 128-bit half), then
    // spread the sums into the upper dwords: [0, s0, 0, s1].
    __m256i pair_sums = _mm256_hadd_epi32(cross, no_bits);
    __m256i upper_part = _mm256_shuffle_epi32(pair_sums, 0x73);

    // Unsigned 64-bit products of the low halves, plus the shifted cross sums.
    __m256i lower_part = _mm256_mul_epu32(a, b);
    return _mm256_add_epi64(lower_part, upper_part);
}

With this approach I get the right result.

As it’s currently written, your answer is unclear. Please [edit] to add additional details that will help others understand how this addresses the question asked. You can find more information on how to write good answers [in the help center](/help/how-to-answer). — Community, Jun 09 '22 at 13:09

64bit multiply element by element,m256i_i64 while bigger than long long maxValue

1 Answers1