Solution found; see Edit 3 at the end of this post.
This is a simple, reproducible example of a problem I have been unable to figure out.
I don't really understand this behavior yet, but I bet it is well documented.
Still, I am baffled that I managed to mess up something so simple.
#include <stdint.h>
/*
 * Minimal reproduction: attempts to zero-extend 32 bytes into 32 16-bit
 * words with two VPSHUFB shuffles on 256-bit registers. Execution stops at
 * two int3 breakpoints so the registers can be inspected in a debugger,
 * once after the loads and once after the shuffles.
 *
 * Why it does not give the expected result: per Intel's instruction
 * reference, the 256-bit VPSHUFB shuffles each 128-bit lane independently
 * and uses only the low 4 bits of every control byte as the source index.
 * The indices 0x10..0x1F in ushortb therefore behave like 0x00..0x0F, and
 * the upper lane of each destination can only pick bytes 16..31 of the
 * source. Both shuffles thus produce the same value: words 0..7 from the
 * low lane followed by words 0x18..0x1F from the high lane.
 *
 * NOTE(review): the code relies on ymm9/ymm14/ymm15 keeping their values
 * between separate asm statements; nothing in the operand lists tells the
 * compiler about that dependency -- confirm before reusing this pattern.
 */
int main(int argc, char** argv) {
// Goal: Convert 8bit data to 16bit
// Shuffle control for input bytes 0x00..0x0F: each pair is
// {source byte index, 0xFF}. A control byte with its top bit set (0xFF)
// makes VPSHUFB write a zero byte, giving a little-endian 16-bit word.
static const uint8_t ushorta[32] = {
0x00, 0xFF, 0x01, 0xFF,
0x02, 0xFF, 0x03, 0xFF,
0x04, 0xFF, 0x05, 0xFF,
0x06, 0xFF, 0x07, 0xFF,
0x08, 0xFF, 0x09, 0xFF,
0x0A, 0xFF, 0x0B, 0xFF,
0x0C, 0xFF, 0x0D, 0xFF,
0x0E, 0xFF, 0x0F, 0xFF
};
// Shuffle control intended for input bytes 0x10..0x1F, same layout as
// ushorta. Only the low 4 bits of each index are used by VPSHUFB, so these
// indices select the same positions as ushorta.
static const uint8_t ushortb[32] = {
0x10, 0xFF, 0x11, 0xFF,
0x12, 0xFF, 0x13, 0xFF,
0x14, 0xFF, 0x15, 0xFF,
0x16, 0xFF, 0x17, 0xFF,
0x18, 0xFF, 0x19, 0xFF,
0x1A, 0xFF, 0x1B, 0xFF,
0x1C, 0xFF, 0x1D, 0xFF,
0x1E, 0xFF, 0x1F, 0xFF
};
// Input data: the byte at position i holds the value i.
static const uint8_t in[32] = {
0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F
};
// Load the two control tables and the input with unaligned 256-bit moves.
// The trailing "%ymmN" in each statement is the clobber list: it tells the
// compiler that the register is overwritten by the asm.
__asm__("VMOVDQU (%0), %%ymm14"::"r" (ushorta) :"%ymm14");
__asm__("VMOVDQU (%0), %%ymm15"::"r" (ushortb) :"%ymm15");
__asm__("VMOVDQU (%0), %%ymm9"::"r" (in) :"%ymm9");
// Breakpoint 1: inspect ymm9, ymm14 and ymm15 right after the loads.
__asm__("int3");
// Intel:
// dst, src, mask
// AT&T:
// mask, src, dst
// ymm12 = shuffle(ymm9, control ymm15); must run before the next line,
// which overwrites ymm9 in place.
__asm__("vpshufb %%ymm15, %%ymm9, %%ymm12":::"%ymm12");
__asm__("vpshufb %%ymm14, %%ymm9, %%ymm9":::"%ymm9"); // Still broken when commented out
// Breakpoint 2: inspect the shuffled results in ymm9 and ymm12.
__asm__("int3");
// Expected result: First half in ymm9, 2nd half in ymm12, 8bit data converted to 16bit data
// Actual Result: Mirrored Malformed transform in ymm9 & ymm12;
}
I will move to intrinsics (immintrin.h) next time.
For context, the data loaded into ymm14, ymm15 and ymm9 is aligned and loads properly for me.
Edit:
ushorta and ushortb are now laid out for little-endian 16-bit values.
But it still doesn't work, as is evident from the GDB/memory dumps.
The third "ymm9" in the asm statement is the inline-assembly clobber list, which is required to mark a register that gets trashed (overwritten).
Edit 2:
GDB register dumps:
Program received signal SIGTRAP, Trace/breakpoint trap.
0x0000555555555146 in main ()
(gdb) p/x $ymm9
$1 = v32_int8 = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10,
0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}
(gdb) p/x $ymm14
$2 = v32_int8 = {0x0, 0xff, 0x1, 0xff,
0x2, 0xff, 0x3, 0xff, 0x4, 0xff, 0x5, 0xff, 0x6, 0xff, 0x7, 0xff, 0x8, 0xff, 0x9, 0xff, 0xa,
0xff, 0xb, 0xff, 0xc, 0xff, 0xd, 0xff, 0xe, 0xff, 0xf, 0xff}
(gdb) p/x $ymm15
$3 = v32_int8 = {0x10, 0xff, 0x11, 0xff,
0x12, 0xff, 0x13, 0xff, 0x14, 0xff, 0x15, 0xff, 0x16, 0xff, 0x17, 0xff, 0x18, 0xff, 0x19, 0xff,
0x1a, 0xff, 0x1b, 0xff, 0x1c, 0xff, 0x1d, 0xff, 0x1e, 0xff, 0x1f, 0xff}
(gdb) c
Continuing.
Program received signal SIGTRAP, Trace/breakpoint trap.
0x0000555555555151 in main ()
(gdb) p/x $ymm9
$4 = v16_int16 = {
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}
(gdb) p/x $ymm12
$5 = v16_int16 = {
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}
Edit 3: Solution found; the following works as intended:
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Zero-extends 32 bytes to 32 unsigned 16-bit words with AVX2 and checks
 * the result.
 *
 * The input holds the values 0..31. Each 16-byte half is widened with
 * _mm256_cvtepu8_epi16 (VPMOVZXBW), which converts 16 packed bytes into
 * 16 packed 16-bit words. Prints "lol" once for every word that does not
 * match its expected value; prints nothing when the conversion is correct.
 *
 * Returns 0 in all cases.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    const uint8_t in[32] = {
        0x00, 0x01, 0x02, 0x03,
        0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0A, 0x0B,
        0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13,
        0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1A, 0x1B,
        0x1C, 0x1D, 0x1E, 0x1F
    };

    /*
     * in[] is a plain byte array with no 16-byte alignment guarantee, so it
     * is read with explicit unaligned loads instead of dereferencing a
     * casted __m128i pointer (which assumes alignment and drops const).
     */
    const __m128i low_bytes = _mm_loadu_si128((const __m128i *)&in[0]);
    const __m128i high_bytes = _mm_loadu_si128((const __m128i *)&in[16]);

    const __m256i first = _mm256_cvtepu8_epi16(low_bytes);
    const __m256i second = _mm256_cvtepu8_epi16(high_bytes);

    /*
     * Copy the vectors out with unaligned stores rather than reading the
     * __m256i objects through a casted short pointer.
     */
    uint16_t values[16];
    uint16_t values2[16];
    _mm256_storeu_si256((__m256i *)values, first);
    _mm256_storeu_si256((__m256i *)values2, second);

    for (int i = 0; i != 16; i++) {
        if (values[i] != i) {
            printf("lol");
        }
        if (values2[i] != i + 16) {
            printf("lol");
        }
    }
    return 0;
}