I have tried my best to get the vectorclass library to generate AVX2 instructions, but I can't get it to do so.
I'm using MSVC2019. Here are the compile options: /permissive- /ifcOutput "x64\Release" /GS /Qpar /GL /W3 /Gy /Zc:wchar_t /I"D:\Tools\vectorclass" /I"D:\Tools\libzmq/include" /I"D:\Tools\boost\boost_1_79_0" /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc142.pdb" /Zc:inline /D "__AVX2__" /D "ZMQ_STATIC" /D "FILE_INPUT" /D "NDEBUG" /D "WIN32" /D "_CRT_SECURE_NO_WARNINGS" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /std:c17 /arch:AVX2 /Gd /Oi /MT /std:c++17 /FC /Fa"x64\Release" /EHsc /nologo /Fo"x64\Release" /Ot /Fp"x64\Release\RtnLink_MSVC.pch" /diagnostics:column
In addition, I've tried to force it with the macro definitions `__AVX2__` and `INSTRSET`, but with no luck.
// Per the surrounding text, the two macros below are an attempt to force the
// vectorclass headers onto their AVX2 code path.
// NOTE(review): the compile options already pass /D "__AVX2__" and /arch:AVX2,
// so the #define of __AVX2__ here looks like a redefinition — confirm whether
// it is needed at all.
#define INSTRSET (8)
#define __AVX2__
#pragma warning(disable : 4984) //warning C4984: 'if constexpr' is a C++17 language extension
#include "vectorclass.h"
// Side-by-side experiment: runs the same style of loop once with vectorclass
// types (Vec8ui) and once with raw AVX2 intrinsics (__m256i), so the code
// generated for the two variants can be compared.
//
// size: loop bound; both loops start at i = 8 and step by 8 while i < size,
//       so neither loop body executes when size <= 8.
// Returns the vectorclass result only (`result`); `result2` from the
// intrinsics variant is computed but never returned or otherwise used.
size_t test(size_t size) {
// --- Variant 1: vectorclass ---
// NOTE(review): `incr` is constructed but never read below.
Vec8ui incr(8);
Vec8ui accum(0, 1, 2, 3, 4, 5, 6, 7);
for (size_t i = 8; i < size; i += 8) {
// NOTE(review): this adds accum to itself, whereas the intrinsics loop below
// adds incr2 to accum2 — the two variants do not compute the same thing.
// Possibly `accum + incr` was intended; confirm.
accum = accum + accum;
}
// horizontal_max comes from vectorclass; presumably it reduces the eight
// lanes to their maximum — verify against the library documentation.
size_t result = horizontal_max(accum);
// --- Variant 2: raw AVX2 intrinsics ---
const __m256i incr2 = _mm256_set1_epi32(8);
__m256i accum2 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
for (size_t i = 8; i < size; i += 8) {
accum2 = _mm256_add_epi32(accum2, incr2);
}
// 32-byte alignment matches the aligned store (_mm256_store_si256) used next.
__declspec(align(32)) int32_t values_array[8];
_mm256_store_si256((__m256i*)values_array, accum2);
// Scalar scan over the stored lanes, keeping the largest value seen.
size_t result2 = values_array[0];
for (int i = 1; i < 8; i++) {
// NOTE(review): compares int32_t against size_t, so the lane value is
// converted to unsigned before the comparison.
if (values_array[i] > result2) {
result2 = values_array[i];
}
}
return result;
}
This compiles to the following 2 loops:
Using vectorclass (no AVX2 instructions):
Vec8ui incr(8);
00007FF7A9BC2E5A mov edx,8
00007FF7A9BC2E5F lea rcx,[incr]
00007FF7A9BC2E63 call Vec8ui::Vec8ui (07FF7A9B58BFEh)
Vec8ui accum(0, 1, 2, 3, 4, 5, 6, 7);
00007FF7A9BC2E68 mov dword ptr [rsp+40h],7
00007FF7A9BC2E70 mov dword ptr [rsp+38h],6
00007FF7A9BC2E78 mov dword ptr [rsp+30h],5
00007FF7A9BC2E80 mov dword ptr [rsp+28h],4
00007FF7A9BC2E88 mov dword ptr [rsp+20h],3
00007FF7A9BC2E90 mov r9d,2
00007FF7A9BC2E96 mov r8d,1
00007FF7A9BC2E9C xor edx,edx
00007FF7A9BC2E9E lea rcx,[accum]
00007FF7A9BC2EA2 call Vec8ui::Vec8ui (07FF7A9B54B99h)
for (size_t i = 8; i < size; i += 8) {
00007FF7A9BC2EA7 mov qword ptr [rbp+98h],8
00007FF7A9BC2EB2 jmp __$EncStackInitStart+0A2h (07FF7A9BC2EC6h)
00007FF7A9BC2EB4 mov rax,qword ptr [rbp+98h]
00007FF7A9BC2EBB add rax,8
00007FF7A9BC2EBF mov qword ptr [rbp+98h],rax
00007FF7A9BC2EC6 mov rax,qword ptr [size]
00007FF7A9BC2ECD cmp qword ptr [rbp+98h],rax
00007FF7A9BC2ED4 jae __$EncStackInitStart+10Fh (07FF7A9BC2F33h)
accum = accum + accum;
00007FF7A9BC2ED6 lea rax,[rbp+4C0h]
00007FF7A9BC2EDD lea rcx,[accum]
00007FF7A9BC2EE1 mov rdi,rax
00007FF7A9BC2EE4 mov rsi,rcx
00007FF7A9BC2EE7 mov ecx,20h
00007FF7A9BC2EEC rep movs byte ptr [rdi],byte ptr [rsi]
00007FF7A9BC2EEE lea rax,[rbp+480h]
00007FF7A9BC2EF5 lea rcx,[accum]
00007FF7A9BC2EF9 mov rdi,rax
00007FF7A9BC2EFC mov rsi,rcx
00007FF7A9BC2EFF mov ecx,20h
00007FF7A9BC2F04 rep movs byte ptr [rdi],byte ptr [rsi]
00007FF7A9BC2F06 lea r8,[rbp+4C0h]
00007FF7A9BC2F0D lea rdx,[rbp+480h]
00007FF7A9BC2F14 lea rcx,[rbp+380h]
00007FF7A9BC2F1B call operator+ (07FF7A9BC29C0h)
00007FF7A9BC2F20 lea rcx,[accum]
00007FF7A9BC2F24 mov rdi,rcx
00007FF7A9BC2F27 mov rsi,rax
00007FF7A9BC2F2A mov ecx,20h
00007FF7A9BC2F2F rep movs byte ptr [rdi],byte ptr [rsi]
}
00007FF7A9BC2F31 jmp __$EncStackInitStart+90h (07FF7A9BC2EB4h)
size_t result = horizontal_max(accum);
00007FF7A9BC2F33 lea rax,[rbp+500h]
00007FF7A9BC2F3A lea rcx,[accum]
00007FF7A9BC2F3E mov rdi,rax
00007FF7A9BC2F41 mov rsi,rcx
00007FF7A9BC2F44 mov ecx,20h
00007FF7A9BC2F49 rep movs byte ptr [rdi],byte ptr [rsi]
00007FF7A9BC2F4B lea rcx,[rbp+500h]
00007FF7A9BC2F52 call horizontal_max<Vec8ui> (07FF7A9B54FB3h)
00007FF7A9BC2F57 mov eax,eax
00007FF7A9BC2F59 mov qword ptr [result],rax
Using intrinsics (we get AVX2 instructions):
const __m256i incr2 = _mm256_set1_epi32(8);
00007FF7A9BC2F60 vmovdqu ymm0,ymmword ptr [__ymm@0000000800000008000000080000000800000008000000080000000800000008 (07FF7A9E87940h)]
00007FF7A9BC2F68 vmovdqu ymmword ptr [rbp+3C0h],ymm0
00007FF7A9BC2F70 vmovdqu ymm0,ymmword ptr [rbp+3C0h]
00007FF7A9BC2F78 vmovdqu ymmword ptr [incr2],ymm0
__m256i accum2 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
00007FF7A9BC2F80 vmovdqu ymm0,ymmword ptr [__ymm@0000000700000006000000050000000400000003000000020000000100000000 (07FF7A9E87900h)]
00007FF7A9BC2F88 vmovdqu ymmword ptr [rbp+400h],ymm0
00007FF7A9BC2F90 vmovdqu ymm0,ymmword ptr [rbp+400h]
00007FF7A9BC2F98 vmovdqu ymmword ptr [accum2],ymm0
for (size_t i = 8; i < size; i += 8) {
00007FF7A9BC2FA0 mov qword ptr [rbp+158h],8
00007FF7A9BC2FAB jmp __$EncStackInitStart+19Bh (07FF7A9BC2FBFh)
00007FF7A9BC2FAD mov rax,qword ptr [rbp+158h]
00007FF7A9BC2FB4 add rax,8
00007FF7A9BC2FB8 mov qword ptr [rbp+158h],rax
00007FF7A9BC2FBF mov rax,qword ptr [size]
00007FF7A9BC2FC6 cmp qword ptr [rbp+158h],rax
00007FF7A9BC2FCD jae __$EncStackInitStart+1D5h (07FF7A9BC2FF9h)
accum2 = _mm256_add_epi32(accum2, incr2);
00007FF7A9BC2FCF vmovdqu ymm0,ymmword ptr [accum2]
00007FF7A9BC2FD7 vpaddd ymm0,ymm0,ymmword ptr [incr2]
00007FF7A9BC2FDF vmovdqu ymmword ptr [rbp+440h],ymm0
00007FF7A9BC2FE7 vmovdqu ymm0,ymmword ptr [rbp+440h]
00007FF7A9BC2FEF vmovdqu ymmword ptr [accum2],ymm0
}
00007FF7A9BC2FF7 jmp __$EncStackInitStart+189h (07FF7A9BC2FADh)
__declspec(align(32)) int32_t values_array[8];
_mm256_store_si256((__m256i*)values_array, accum2);
00007FF7A9BC2FF9 vmovdqu ymm0,ymmword ptr [accum2]
00007FF7A9BC3001 vmovdqa ymmword ptr [values_array],ymm0
size_t result2 = values_array[0];
00007FF7A9BC3009 mov eax,4
00007FF7A9BC300E imul rax,rax,0
00007FF7A9BC3012 movsxd rax,dword ptr values_array[rax]
00007FF7A9BC301A mov qword ptr [result2],rax
for (int i = 1; i < 8; i++) {
00007FF7A9BC3021 mov dword ptr [rbp+1D4h],1
00007FF7A9BC302B jmp __$EncStackInitStart+217h (07FF7A9BC303Bh)
00007FF7A9BC302D mov eax,dword ptr [rbp+1D4h]
00007FF7A9BC3033 inc eax
00007FF7A9BC3035 mov dword ptr [rbp+1D4h],eax
00007FF7A9BC303B cmp dword ptr [rbp+1D4h],8
00007FF7A9BC3042 jge __$EncStackInitStart+250h (07FF7A9BC3074h)
if (values_array[i] > result2) {
00007FF7A9BC3044 movsxd rax,dword ptr [rbp+1D4h]
00007FF7A9BC304B movsxd rax,dword ptr values_array[rax*4]
00007FF7A9BC3053 cmp rax,qword ptr [result2]
00007FF7A9BC305A jbe __$EncStackInitStart+24Eh (07FF7A9BC3072h)
result2 = values_array[i];
00007FF7A9BC305C movsxd rax,dword ptr [rbp+1D4h]
00007FF7A9BC3063 movsxd rax,dword ptr values_array[rax*4]
00007FF7A9BC306B mov qword ptr [result2],rax