1

I have tried my best to get the vectorclass library to generate AVX2 instructions, but I can't get it to do so.

I'm using MSVC2019. Here are the compile options: /permissive- /ifcOutput "x64\Release" /GS /Qpar /GL /W3 /Gy /Zc:wchar_t /I"D:\Tools\vectorclass" /I"D:\Tools\libzmq/include" /I"D:\Tools\boost\boost_1_79_0" /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc142.pdb" /Zc:inline /D "__AVX2__" /D "ZMQ_STATIC" /D "FILE_INPUT" /D "NDEBUG" /D "WIN32" /D "_CRT_SECURE_NO_WARNINGS" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /std:c17 /arch:AVX2 /Gd /Oi /MT /std:c++17 /FC /Fa"x64\Release" /EHsc /nologo /Fo"x64\Release" /Ot /Fp"x64\Release\RtnLink_MSVC.pch" /diagnostics:column

In addition, I've tried to force it with the macro definitions __AVX2__ and INSTRSET, but no luck.

#define INSTRSET (8)
#define __AVX2__
#pragma warning(disable : 4984)  //warning C4984: 'if constexpr' is a C++17 language extension
#include "vectorclass.h"
// Runs an 8-lane 32-bit integer loop twice -- once through the vectorclass
// Vec8ui wrapper and once through raw AVX2 intrinsics -- so the code generated
// for the two variants can be compared side by side.
//
// size:   loop bound; each loop runs with i = 8, 16, 24, ... while i < size.
// return: the horizontal maximum of the vectorclass accumulator (`result`).
//         The intrinsic-path value (`result2`) is computed but not returned.
size_t test(size_t size) {
    // NOTE(review): `incr` is constructed but never read below; the loop adds
    // `accum` to itself. The intrinsic loop adds `incr2`, so the two loops do
    // not compute the same value -- presumably `accum + incr` was intended; confirm.
    Vec8ui incr(8);
    Vec8ui accum(0, 1, 2, 3, 4, 5, 6, 7);

    // vectorclass variant: doubles every lane once per iteration.
    for (size_t i = 8; i < size; i += 8) {
        accum = accum + accum;
    }
    // Reduce the 8 lanes to their maximum and widen it to size_t.
    size_t result = horizontal_max(accum);


    // Intrinsic variant: all eight lanes of incr2 hold 8; accum2 starts as 0..7
    // (setr = arguments are given in lane order, lowest lane first).
    const __m256i incr2 = _mm256_set1_epi32(8);
    __m256i accum2 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    for (size_t i = 8; i < size; i += 8) {
        accum2 = _mm256_add_epi32(accum2, incr2);
    }
    // Spill the vector to a 32-byte-aligned scratch array (the store below is
    // the aligned form) and find the maximum lane with a scalar loop.
    __declspec(align(32)) int32_t values_array[8];
    _mm256_store_si256((__m256i*)values_array, accum2);
    size_t result2 = values_array[0];
    for (int i = 1; i < 8; i++) {
        // The int32_t element is converted to size_t for this comparison, so
        // the compare is unsigned.
        if (values_array[i] > result2) {
            result2 = values_array[i];
        }
    }

    // Only the vectorclass result leaves the function; result2 is unused.
    return result;
}

This compiles to the following 2 loops:

Using vectorclass (no AVX2 instructions):

    Vec8ui incr(8);
00007FF7A9BC2E5A  mov         edx,8  
00007FF7A9BC2E5F  lea         rcx,[incr]  
00007FF7A9BC2E63  call        Vec8ui::Vec8ui (07FF7A9B58BFEh)  
    Vec8ui accum(0, 1, 2, 3, 4, 5, 6, 7);
00007FF7A9BC2E68  mov         dword ptr [rsp+40h],7  
00007FF7A9BC2E70  mov         dword ptr [rsp+38h],6  
00007FF7A9BC2E78  mov         dword ptr [rsp+30h],5  
00007FF7A9BC2E80  mov         dword ptr [rsp+28h],4  
00007FF7A9BC2E88  mov         dword ptr [rsp+20h],3  
00007FF7A9BC2E90  mov         r9d,2  
00007FF7A9BC2E96  mov         r8d,1  
00007FF7A9BC2E9C  xor         edx,edx  
00007FF7A9BC2E9E  lea         rcx,[accum]  
00007FF7A9BC2EA2  call        Vec8ui::Vec8ui (07FF7A9B54B99h)  

    for (size_t i = 8; i < size; i += 8) {
00007FF7A9BC2EA7  mov         qword ptr [rbp+98h],8  
00007FF7A9BC2EB2  jmp         __$EncStackInitStart+0A2h (07FF7A9BC2EC6h)  
00007FF7A9BC2EB4  mov         rax,qword ptr [rbp+98h]  
00007FF7A9BC2EBB  add         rax,8  
00007FF7A9BC2EBF  mov         qword ptr [rbp+98h],rax  
00007FF7A9BC2EC6  mov         rax,qword ptr [size]  
00007FF7A9BC2ECD  cmp         qword ptr [rbp+98h],rax  
00007FF7A9BC2ED4  jae         __$EncStackInitStart+10Fh (07FF7A9BC2F33h)  
        accum = accum + accum;
00007FF7A9BC2ED6  lea         rax,[rbp+4C0h]  
00007FF7A9BC2EDD  lea         rcx,[accum]  
00007FF7A9BC2EE1  mov         rdi,rax  
00007FF7A9BC2EE4  mov         rsi,rcx  
00007FF7A9BC2EE7  mov         ecx,20h  
00007FF7A9BC2EEC  rep movs    byte ptr [rdi],byte ptr [rsi]  
00007FF7A9BC2EEE  lea         rax,[rbp+480h]  
00007FF7A9BC2EF5  lea         rcx,[accum]  
00007FF7A9BC2EF9  mov         rdi,rax  
00007FF7A9BC2EFC  mov         rsi,rcx  
00007FF7A9BC2EFF  mov         ecx,20h  
00007FF7A9BC2F04  rep movs    byte ptr [rdi],byte ptr [rsi]  
00007FF7A9BC2F06  lea         r8,[rbp+4C0h]  
00007FF7A9BC2F0D  lea         rdx,[rbp+480h]  
00007FF7A9BC2F14  lea         rcx,[rbp+380h]  
00007FF7A9BC2F1B  call        operator+ (07FF7A9BC29C0h)  
00007FF7A9BC2F20  lea         rcx,[accum]  
00007FF7A9BC2F24  mov         rdi,rcx  
00007FF7A9BC2F27  mov         rsi,rax  
00007FF7A9BC2F2A  mov         ecx,20h  
00007FF7A9BC2F2F  rep movs    byte ptr [rdi],byte ptr [rsi]  
    }
00007FF7A9BC2F31  jmp         __$EncStackInitStart+90h (07FF7A9BC2EB4h)  
    size_t result = horizontal_max(accum);
00007FF7A9BC2F33  lea         rax,[rbp+500h]  
00007FF7A9BC2F3A  lea         rcx,[accum]  
00007FF7A9BC2F3E  mov         rdi,rax  
00007FF7A9BC2F41  mov         rsi,rcx  
00007FF7A9BC2F44  mov         ecx,20h  
00007FF7A9BC2F49  rep movs    byte ptr [rdi],byte ptr [rsi]  
00007FF7A9BC2F4B  lea         rcx,[rbp+500h]  
00007FF7A9BC2F52  call        horizontal_max<Vec8ui> (07FF7A9B54FB3h)  
00007FF7A9BC2F57  mov         eax,eax  
00007FF7A9BC2F59  mov         qword ptr [result],rax  

Using intrinsics (we get AVX2 instructions):

    const __m256i incr2 = _mm256_set1_epi32(8);
00007FF7A9BC2F60  vmovdqu     ymm0,ymmword ptr [__ymm@0000000800000008000000080000000800000008000000080000000800000008 (07FF7A9E87940h)]  
00007FF7A9BC2F68  vmovdqu     ymmword ptr [rbp+3C0h],ymm0  
00007FF7A9BC2F70  vmovdqu     ymm0,ymmword ptr [rbp+3C0h]  
00007FF7A9BC2F78  vmovdqu     ymmword ptr [incr2],ymm0  
    __m256i accum2 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
00007FF7A9BC2F80  vmovdqu     ymm0,ymmword ptr [__ymm@0000000700000006000000050000000400000003000000020000000100000000 (07FF7A9E87900h)]  
00007FF7A9BC2F88  vmovdqu     ymmword ptr [rbp+400h],ymm0  
00007FF7A9BC2F90  vmovdqu     ymm0,ymmword ptr [rbp+400h]  
00007FF7A9BC2F98  vmovdqu     ymmword ptr [accum2],ymm0  
    for (size_t i = 8; i < size; i += 8) {
00007FF7A9BC2FA0  mov         qword ptr [rbp+158h],8  
00007FF7A9BC2FAB  jmp         __$EncStackInitStart+19Bh (07FF7A9BC2FBFh)  
00007FF7A9BC2FAD  mov         rax,qword ptr [rbp+158h]  
00007FF7A9BC2FB4  add         rax,8  
00007FF7A9BC2FB8  mov         qword ptr [rbp+158h],rax  
00007FF7A9BC2FBF  mov         rax,qword ptr [size]  
00007FF7A9BC2FC6  cmp         qword ptr [rbp+158h],rax  
00007FF7A9BC2FCD  jae         __$EncStackInitStart+1D5h (07FF7A9BC2FF9h)  
        accum2 = _mm256_add_epi32(accum2, incr2);
00007FF7A9BC2FCF  vmovdqu     ymm0,ymmword ptr [accum2]  
00007FF7A9BC2FD7  vpaddd      ymm0,ymm0,ymmword ptr [incr2]  
00007FF7A9BC2FDF  vmovdqu     ymmword ptr [rbp+440h],ymm0  
00007FF7A9BC2FE7  vmovdqu     ymm0,ymmword ptr [rbp+440h]  
00007FF7A9BC2FEF  vmovdqu     ymmword ptr [accum2],ymm0  
    }
00007FF7A9BC2FF7  jmp         __$EncStackInitStart+189h (07FF7A9BC2FADh)  
    __declspec(align(32)) int32_t values_array[8];
    _mm256_store_si256((__m256i*)values_array, accum2);
00007FF7A9BC2FF9  vmovdqu     ymm0,ymmword ptr [accum2]  
00007FF7A9BC3001  vmovdqa     ymmword ptr [values_array],ymm0  
    size_t result2 = values_array[0];
00007FF7A9BC3009  mov         eax,4  
00007FF7A9BC300E  imul        rax,rax,0  
00007FF7A9BC3012  movsxd      rax,dword ptr values_array[rax]  
00007FF7A9BC301A  mov         qword ptr [result2],rax  
    for (int i = 1; i < 8; i++) {
00007FF7A9BC3021  mov         dword ptr [rbp+1D4h],1  
00007FF7A9BC302B  jmp         __$EncStackInitStart+217h (07FF7A9BC303Bh)  
00007FF7A9BC302D  mov         eax,dword ptr [rbp+1D4h]  
00007FF7A9BC3033  inc         eax  
00007FF7A9BC3035  mov         dword ptr [rbp+1D4h],eax  
00007FF7A9BC303B  cmp         dword ptr [rbp+1D4h],8  
00007FF7A9BC3042  jge         __$EncStackInitStart+250h (07FF7A9BC3074h)  
        if (values_array[i] > result2) {
00007FF7A9BC3044  movsxd      rax,dword ptr [rbp+1D4h]  
00007FF7A9BC304B  movsxd      rax,dword ptr values_array[rax*4]  
00007FF7A9BC3053  cmp         rax,qword ptr [result2]  
00007FF7A9BC305A  jbe         __$EncStackInitStart+24Eh (07FF7A9BC3072h)  
            result2 = values_array[i];
00007FF7A9BC305C  movsxd      rax,dword ptr [rbp+1D4h]  
00007FF7A9BC3063  movsxd      rax,dword ptr values_array[rax*4]  
00007FF7A9BC306B  mov         qword ptr [result2],rax  
blmckinley
  • 11
  • 2
  • Works for me on https://godbolt.org/z/Kezj1beG3 with MSVC 19.14 (2017). Godbolt has some libraries available, but only for its native compilers (including WINE MSVC, but not the other MSVC compilers which indirect to Microsoft servers.) – Peter Cordes Jun 15 '22 at 01:50

0 Answers0