I was reading the AES-NI White Paper and wanted to try it out myself by writing a simple demo program based on the code provided by Intel, but I am getting weird results. It works in Debug/Release x86 and Debug x64 modes, but I get random-looking output in Release x64 mode. I also tried it with GCC and had no such problem. After some digging, it seems that MSVC confuses the source and destination operands of the AESIMC instruction. It generates code like aesimc xmm3,xmmword ptr[rsp+20h]
when actually xmm3
is the source and [rsp+20h]
is the destination. In x86 mode, it will generate correct code like aesimc xmm0,xmm5
movaps xmmword ptr[K4],xmm0
(Two instructions are needed since something like aesimc xmmword ptr[K4],xmm5
is invalid I think).
I'm not sure whether this is indeed a compiler bug or whether there is something wrong with my code.
Release x64 disassembly (see below for the complete code):
K11 = _mm_aesimc_si128(K11);
K12 = _mm_aesimc_si128(K12);
00007FF6C0A717C6 66 0F 38 DB 5C 24 20 aesimc xmm3,xmmword ptr [rsp+20h]
00007FF6C0A717CD 66 0F 6F 1C 24 movdqa xmm3,xmmword ptr [rsp]
auto K14 = AES256_GENKEY_1(K12, K13, 0x40);
00007FF6C0A717D2 66 44 0F EF F9 pxor xmm15,xmm1
K13 = _mm_aesimc_si128(K13);
00007FF6C0A717D7 66 0F 38 DB 54 24 10 aesimc xmm2,xmmword ptr [rsp+10h]
auto blocks = size >> 4;
auto feedback = _mm_loadu_si128(static_cast<const __m128i *>(iVec));
00007FF6C0A717DE F3 0F 6F 12 movdqu xmm2,xmmword ptr [rdx]
00007FF6C0A717E2 66 45 0F 38 DB F6 aesimc xmm14,xmm14
00007FF6C0A717E8 66 45 0F 38 DB ED aesimc xmm13,xmm13
00007FF6C0A717EE 66 45 0F 38 DB E4 aesimc xmm12,xmm12
00007FF6C0A717F4 66 45 0F 38 DB DB aesimc xmm11,xmm11
00007FF6C0A717FA 66 45 0F 38 DB D2 aesimc xmm10,xmm10
00007FF6C0A71800 66 45 0F 38 DB C9 aesimc xmm9,xmm9
00007FF6C0A71806 66 45 0F 38 DB C0 aesimc xmm8,xmm8
00007FF6C0A7180C 66 0F 38 DB FF aesimc xmm7,xmm7
00007FF6C0A71811 66 0F 38 DB F6 aesimc xmm6,xmm6
00007FF6C0A71816 66 0F 38 DB ED aesimc xmm5,xmm5
00007FF6C0A7181B 66 0F 38 DB E4 aesimc xmm4,xmm4
{
auto lastIn = _mm_loadu_si128(static_cast<const __m128i *>(input) + i);
00007FF6C0A71820 F3 41 0F 6F 0C 00 movdqu xmm1,xmmword ptr [r8+rax]
00007FF6C0A71826 48 8D 40 10 lea rax,[rax+10h]
auto m = _mm_xor_si128(lastIn, K14);
00007FF6C0A7182A 66 0F 6F C1 movdqa xmm0,xmm1
00007FF6C0A7182E 66 41 0F EF C7 pxor xmm0,xmm15
m = _mm_aesdec_si128(m, K13);
00007FF6C0A71833 66 0F 38 DE 44 24 10 aesdec xmm0,xmmword ptr [K13]
m = _mm_aesdec_si128(m, K12);
00007FF6C0A7183A 66 0F 38 DE 44 24 20 aesdec xmm0,xmmword ptr [K12]
m = _mm_aesdec_si128(m, K11);
Complete code: (should work with both MSVC and GCC)
#include <cstdio>
#include <cstring>
#include <cstdint>
#include <cstddef>
#include <wmmintrin.h>
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <cpuid.h>
#else
#error compiler not supported
#endif
/// Reports whether the CPU advertises the AES instruction set extension.
///
/// Queries CPUID leaf 1 and tests bit 25 of ECX (mask 0x2000000).
///
/// @return 0x2000000 (non-zero) when the bit is set, 0 when it is clear or
///         when CPUID leaf 1 could not be queried.
static int check_aes_support()
{
    // Mask selecting bit 25 of the ECX value returned by CPUID leaf 1.
    const unsigned int aesFeatureBit = 0x2000000u;
#if defined(_MSC_VER)
    // __cpuid fills the array in EAX, EBX, ECX, EDX order; index 2 is ECX.
    int info[4] = {0, 0, 0, 0};
    __cpuid(info, 0x01);
    return static_cast<int>(static_cast<unsigned int>(info[2]) & aesFeatureBit);
#else
    unsigned int eax = 0;
    unsigned int ebx = 0;
    unsigned int ecx = 0;
    unsigned int edx = 0;
    // __get_cpuid returns 0 without writing the outputs when the requested
    // leaf is unavailable; treat that as "no AES" instead of reading garbage.
    if (!__get_cpuid(0x01, &eax, &ebx, &ecx, &edx))
        return 0;
    return static_cast<int>(ecx & aesFeatureBit);
#endif
}
/// Combines a previous round key with a key-generation assist value.
///
/// @param key1 Round key to fold; after three shift-by-4-bytes/XOR steps each
///             32-bit lane holds the XOR of itself and every lower lane.
/// @param key2 Assist value; only its highest 32-bit lane is used, broadcast
///             to all four lanes.
/// @return The folded key1 XORed with the broadcast lane of key2.
static inline __m128i aes256_key_assist_1(__m128i key1, __m128i key2)
{
    const __m128i broadcast = _mm_shuffle_epi32(key2, _MM_SHUFFLE(3, 3, 3, 3));

    __m128i folded = key1;
    for (int step = 0; step < 3; ++step)
    {
        // Shift left by one 32-bit word and accumulate.
        folded = _mm_xor_si128(folded, _mm_slli_si128(folded, 4));
    }

    return _mm_xor_si128(folded, broadcast);
}
/// Derives the next round key from key2 using key1 as the assist source.
///
/// @param key1 Round key fed to AESKEYGENASSIST with a round constant of 0;
///             32-bit lane 2 of that result is broadcast to all lanes.
/// @param key2 Round key to fold; after three shift-by-4-bytes/XOR steps each
///             32-bit lane holds the XOR of itself and every lower lane.
/// @return The folded key2 XORed with the broadcast assist lane.
static inline __m128i aes256_key_assist_2(__m128i key1, __m128i key2)
{
    const __m128i assist = _mm_aeskeygenassist_si128(key1, 0x00);
    const __m128i broadcast = _mm_shuffle_epi32(assist, _MM_SHUFFLE(2, 2, 2, 2));

    __m128i folded = key2;
    for (int step = 0; step < 3; ++step)
    {
        // Shift left by one 32-bit word and accumulate.
        folded = _mm_xor_si128(folded, _mm_slli_si128(folded, 4));
    }

    return _mm_xor_si128(folded, broadcast);
}
// Key-schedule step taking a round constant: runs AESKEYGENASSIST on SRC with
// RCON and folds the result into PREV via aes256_key_assist_1. Kept as a macro
// so RCON reaches the intrinsic as a literal.
#define AES256_GENKEY_1(PREV, SRC, RCON) \
    aes256_key_assist_1(PREV, _mm_aeskeygenassist_si128(SRC, RCON))
// Key-schedule step without a round constant: forwards to aes256_key_assist_2.
#define AES256_GENKEY_2(SRC, PREV) aes256_key_assist_2(SRC, PREV)
/// Encrypts a buffer with AES-256 in CBC mode using AES-NI intrinsics.
///
/// @param key    Points to 32 bytes of key material (read as two unaligned
///               16-byte loads).
/// @param iVec   Points to the 16-byte initialization vector.
/// @param input  Plaintext, `size` bytes.
/// @param size   Byte count; must be non-zero and a multiple of 16.
/// @param output Receives `size` bytes of ciphertext.
/// @return 0 on success, 1 if `size` is zero or not a multiple of 16.
static int aes256_cbc_encrypt(const void *key, const void *iVec,
                              const void *input, std::size_t size, void *output)
{
    if (size == 0 || (size & 0xF) != 0)
        return 1;

    const __m128i *const keyHalves = static_cast<const __m128i *>(key);
    const __m128i *const src = static_cast<const __m128i *>(input);
    __m128i *const dst = static_cast<__m128i *>(output);

    // Expand the 256-bit key into 15 round keys. The round constants must be
    // literals, so the schedule is written out rather than looped.
    __m128i rk[15];
    rk[0] = _mm_loadu_si128(keyHalves);
    rk[1] = _mm_loadu_si128(keyHalves + 1);
    rk[2] = AES256_GENKEY_1(rk[0], rk[1], 0x01);
    rk[3] = AES256_GENKEY_2(rk[2], rk[1]);
    rk[4] = AES256_GENKEY_1(rk[2], rk[3], 0x02);
    rk[5] = AES256_GENKEY_2(rk[4], rk[3]);
    rk[6] = AES256_GENKEY_1(rk[4], rk[5], 0x04);
    rk[7] = AES256_GENKEY_2(rk[6], rk[5]);
    rk[8] = AES256_GENKEY_1(rk[6], rk[7], 0x08);
    rk[9] = AES256_GENKEY_2(rk[8], rk[7]);
    rk[10] = AES256_GENKEY_1(rk[8], rk[9], 0x10);
    rk[11] = AES256_GENKEY_2(rk[10], rk[9]);
    rk[12] = AES256_GENKEY_1(rk[10], rk[11], 0x20);
    rk[13] = AES256_GENKEY_2(rk[12], rk[11]);
    rk[14] = AES256_GENKEY_1(rk[12], rk[13], 0x40);

    const std::size_t blockCount = size / 16;
    // CBC chaining value: starts as the IV, then the previous ciphertext block.
    __m128i chain = _mm_loadu_si128(static_cast<const __m128i *>(iVec));
    for (std::size_t b = 0; b != blockCount; ++b)
    {
        __m128i state = _mm_xor_si128(chain, _mm_loadu_si128(src + b));
        state = _mm_xor_si128(state, rk[0]);
        for (int round = 1; round <= 13; ++round)
            state = _mm_aesenc_si128(state, rk[round]);
        chain = _mm_aesenclast_si128(state, rk[14]);
        _mm_storeu_si128(dst + b, chain);
    }
    return 0;
}
/// Decrypts a buffer with AES-256 in CBC mode using AES-NI intrinsics.
///
/// @param key    Points to 32 bytes of key material (read as two unaligned
///               16-byte loads).
/// @param iVec   Points to the 16-byte initialization vector.
/// @param input  Ciphertext, `size` bytes.
/// @param size   Byte count; must be non-zero and a multiple of 16.
/// @param output Receives `size` bytes of plaintext.
/// @return 0 on success, 1 if `size` is zero or not a multiple of 16.
static int aes256_cbc_decrypt(const void *key, const void *iVec,
                              const void *input, std::size_t size, void *output)
{
    if (size == 0 || (size & 0xF) != 0)
        return 1;

    const __m128i *const keyHalves = static_cast<const __m128i *>(key);
    const __m128i *const src = static_cast<const __m128i *>(input);
    __m128i *const dst = static_cast<__m128i *>(output);

    // Expand the 256-bit key into 15 round keys. The round constants must be
    // literals, so the schedule is written out rather than looped.
    __m128i rk[15];
    rk[0] = _mm_loadu_si128(keyHalves);
    rk[1] = _mm_loadu_si128(keyHalves + 1);
    rk[2] = AES256_GENKEY_1(rk[0], rk[1], 0x01);
    rk[3] = AES256_GENKEY_2(rk[2], rk[1]);
    rk[4] = AES256_GENKEY_1(rk[2], rk[3], 0x02);
    rk[5] = AES256_GENKEY_2(rk[4], rk[3]);
    rk[6] = AES256_GENKEY_1(rk[4], rk[5], 0x04);
    rk[7] = AES256_GENKEY_2(rk[6], rk[5]);
    rk[8] = AES256_GENKEY_1(rk[6], rk[7], 0x08);
    rk[9] = AES256_GENKEY_2(rk[8], rk[7]);
    rk[10] = AES256_GENKEY_1(rk[8], rk[9], 0x10);
    rk[11] = AES256_GENKEY_2(rk[10], rk[9]);
    rk[12] = AES256_GENKEY_1(rk[10], rk[11], 0x20);
    rk[13] = AES256_GENKEY_2(rk[12], rk[11]);
    rk[14] = AES256_GENKEY_1(rk[12], rk[13], 0x40);

    // Round keys 1..13 are passed through AESIMC for use with AESDEC; the
    // first and last round keys are used as-is.
    for (int round = 1; round <= 13; ++round)
        rk[round] = _mm_aesimc_si128(rk[round]);

    const std::size_t blockCount = size / 16;
    // CBC chaining value: starts as the IV, then the previous ciphertext block.
    __m128i chain = _mm_loadu_si128(static_cast<const __m128i *>(iVec));
    for (std::size_t b = 0; b != blockCount; ++b)
    {
        // Read the ciphertext block before the store below, and keep it as
        // the chaining value for the next block.
        const __m128i cipherBlock = _mm_loadu_si128(src + b);
        __m128i state = _mm_xor_si128(cipherBlock, rk[14]);
        for (int round = 13; round >= 1; --round)
            state = _mm_aesdec_si128(state, rk[round]);
        state = _mm_aesdeclast_si128(state, rk[0]);
        state = _mm_xor_si128(state, chain);
        _mm_storeu_si128(dst + b, state);
        chain = cipherBlock;
    }
    return 0;
}
/// Demo driver: checks for AES support, encrypts a fixed 32-byte buffer with
/// AES-256-CBC, prints the ciphertext, decrypts it again and prints the result.
///
/// @return -1 when AES is not reported by the CPU or a cipher call fails,
///         0 otherwise.
int main()
{
    const int aesSupport = check_aes_support();
    std::printf("AES: %s\n", aesSupport ? "yes" : "no");
    if (!aesSupport)
        return -1;

    std::uint64_t data[] = {0x1122334455667788, 0xAABBCCDDEEFFBBAA, 0xAAAAAAAAAAAAAAAA, 0x4444333333333333};
    std::uint64_t key[] = {0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x1111111111111111, 0x1111111111111111};
    std::uint64_t iVec[] = {0x123456789ABCDEF0, 0x0FEDCBA987654321};
    std::uint64_t cipher[4] = {0};

    if (aes256_cbc_encrypt(key, iVec, data, sizeof(data), cipher) != 0)
        return -1;
    // std::uint64_t is not necessarily unsigned long long, so cast explicitly
    // to match the %llX conversion.
    std::printf("0x%016llX 0x%016llX 0x%016llX 0x%016llX\n",
                static_cast<unsigned long long>(cipher[0]),
                static_cast<unsigned long long>(cipher[1]),
                static_cast<unsigned long long>(cipher[2]),
                static_cast<unsigned long long>(cipher[3]));

    // Wipe the plaintext so the second printout proves decryption restored it.
    std::memset(data, 0, sizeof(data));
    if (aes256_cbc_decrypt(key, iVec, cipher, sizeof(data), data) != 0)
        return -1;
    std::printf("0x%016llX 0x%016llX 0x%016llX 0x%016llX\n",
                static_cast<unsigned long long>(data[0]),
                static_cast<unsigned long long>(data[1]),
                static_cast<unsigned long long>(data[2]),
                static_cast<unsigned long long>(data[3]));
    return 0;
}
It should output:
0xCF8A4156843F0A3E 0x04D4BB63524324E6 0xAAB88C080DB40B2F 0xCC346B02BA6B16E8
0x1122334455667788 0xAABBCCDDEEFFBBAA 0xAAAAAAAAAAAAAAAA 0x4444333333333333
But in Release x64 mode the first (encrypted) line is correct while the second (decrypted) line is garbage:
0xCF8A4156843F0A3E 0x04D4BB63524324E6 0xAAB88C080DB40B2F 0xCC346B02BA6B16E8
0xEE64C4650D902107 0x0D03C7FA41AA930B 0x257F65FF49A99474 0xFACB372EDED13BAA