1

I'm trying to write a strcmp version that takes advantage of SSE4.2 new instructions leveraging GCC intrinsics.

This is the code I have so far:

#include <stdio.h>
#include <smmintrin.h>

int main(int argc, char const *argv[])
{
    int n;
    const char str1[16] = "foo bar";
    const char str2[16] = "foo quxx";

    /* Safety check for SSE4.2 support */
    __builtin_cpu_init();
    if(__builtin_cpu_supports("sse4.2"))
        puts("Ok SSE4.2");
    else
    {
        puts("Nok SSE4.2");
        return -__LINE__;
    }

    /* Load strings into registers */
    __v16qi xmm1 = __builtin_ia32_loaddqu(str1);
    __v16qi xmm2 = __builtin_ia32_loaddqu(str2);

    /* Print to check registers were loaded correctly */
    printf("xmm1: %s\nxmm2: %s\n", (const char *) &xmm1, (const char *) &xmm2);

    /*  Perform compare */
    n = __builtin_ia32_pcmpistri128(xmm1, xmm2, (_SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT));

    /* Print result */
    printf("n: %d\n", n);

    return 0;
}

It should print the index of the first different byte, but instead it always prints 0.

I've tried to debug it for hours until I saw this in the generated assembly:

call    printf
movdqa  -64(%rbp), %xmm1
movdqa  -80(%rbp), %xmm0
pcmpistri   $8, %xmm1, %xmm0
movl    %ecx, %eax
pcmpistrm   $8, %xmm1, %xmm0
movl    %eax, -84(%rbp)
movl    -84(%rbp), %eax

According to Wikibooks in case of instructions that output the index (just like pcmpistri I'm trying to use) the result is saved in ECX register, but, if I remeber correctly, the instruction immediately following pcmpistri overrides that register with EAX!

I think that might be the bug that is driving me crazy, but I have no experience in assembly and I am probably wrong.

Anyone is experiencing this issue? Does anyone know how to solve this?

I've tried with GCC 5.4 and 6.2 under Ubuntu 16.04 (actually, bash on Windows) with either -O0, -O1 and -O2 (and obviously -msse4.2).

What makes me think it's a GCC bug is that a similar code compiled undex MSVC from Visual Studio 2017 works correctly:

#include <stdio.h>
#include <nmmintrin.h>


int main()
{
    __m128i a, b;

    const int mode = _SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT;

    a.m128i_u16[7] = 0xFFFF;
    a.m128i_u16[6] = 0xFFFF;
    a.m128i_u16[5] = 0xFFFF;
    a.m128i_u16[4] = 0xFFFF;
    a.m128i_u16[3] = 0xFFFF;
    a.m128i_u16[2] = 0xFFFF;
    a.m128i_u16[1] = 0x0001;
    a.m128i_u16[0] = 0xFFFF;

    b.m128i_u16[7] = 0x0001;
    b.m128i_u16[6] = 0x0001;
    b.m128i_u16[5] = 0x0001;
    b.m128i_u16[4] = 0x0001;
    b.m128i_u16[3] = 0x0001;
    b.m128i_u16[2] = 0x0001;
    b.m128i_u16[1] = 0x0001;
    b.m128i_u16[0] = 0x0001;

    int returnValue = _mm_cmpistri(a, b, mode);
    printf_s("%i\n", returnValue);

    return 0;
}
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
Samuele Pilleri
  • 734
  • 1
  • 7
  • 17
  • 1
    Note: `return -__LINE__` is a very bad idea! POSIX only guarantees a range of `signed char` for the result of `main`. – too honest for this site May 16 '17 at 18:09
  • 2
    Why are you using the `__builtin_xxx` intrinsics with gcc ? Why are you not using the standard intrinsics, i.e. the same ones that you use in the MSVC version of the code ? – Paul R May 16 '17 at 18:47
  • @Olaf NT subsystems are an interface between the kernel and the userspace. I understand it's not a Linux kernel, but it's designed to be as much more similar as possible and it works well for the purposes of this "project" (that does not include any major kernel space interaction). @PaulR I actually found those on the [GCC official page](https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/X86-Built-in-Functions.html), you mean `_mm_cmpistri`? I actyally couldn't find a "standard" version for `__builtin_ia32_loaddqu` and different instrinsics don't play well with each other. – Samuele Pilleri May 16 '17 at 19:44
  • 2
    @SamuelePilleri: see [`_mm_lddqu_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128&expand=3043,3043). In general the intrinsics given in the [Intel intrinsics guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/) work for any compiler supporting SSE/AVX/etc (gcc, clang, ICC, MSVC, etc). – Paul R May 16 '17 at 21:31

1 Answers1

0

Yo may be surprized to discover that actually the disassembly code presents the argument list of each instruction in the reverse order ie left to rigth. So "movl %ecx, %eax" is actually "MOV eax, ecx" ! Just run your code in debug mode step by step in Instruction Level and trace the register changes.