1

I'm borrowing some code from VLC for my video player, which is built with MSVC++ 2010, and I cannot find intrinsic equivalents for its inline assembly, which copies a decoded video frame from GPU memory to conventional memory. In particular, I don't know how to translate this instruction:

movq   %%xmm1,   8(%[dst1])

which can be seen in the function SplitUV in the file vlc/modules/codec/avcodec/copy.c.

According to MSDN, intrinsics for movq are _mm_move_epi64, _mm_loadl_epi64 and _mm_storel_epi64. However, they require __m128i arguments, and if I add 1 to the pointer to __m128i, I'll get the offset of 16 bytes, while I need 8 bytes.

The whole assembler code is the following:

for (x = 0; x < (width & ~31); x += 32) {
  asm volatile (
    "movdqu (%[shuffle]), %%xmm7\n"
    "movdqa  0(%[src]), %%xmm0\n"
    "movdqa 16(%[src]), %%xmm1\n"
    "movdqa 32(%[src]), %%xmm2\n"
    "movdqa 48(%[src]), %%xmm3\n"
    "pshufb  %%xmm7, %%xmm0\n"
    "pshufb  %%xmm7, %%xmm1\n"
    "pshufb  %%xmm7, %%xmm2\n"
    "pshufb  %%xmm7, %%xmm3\n"
    "movq   %%xmm0,   0(%[dst1])\n"
    "movq   %%xmm1,   8(%[dst1])\n"
    "movhpd %%xmm0,   0(%[dst2])\n"
    "movhpd %%xmm1,   8(%[dst2])\n"
    "movq   %%xmm2,  16(%[dst1])\n"
    "movq   %%xmm3,  24(%[dst1])\n"
    "movhpd %%xmm2,  16(%[dst2])\n"
    "movhpd %%xmm3,  24(%[dst2])\n"
    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), 
        [shuffle]"r"(shuffle) 
    : "memory"
 );
 ...
}

I've started translating, line by line, and by now have the following code (incomplete):

__m128i x0, x1, x2, x3, x7;
__m128i *pshuffle128 = (__m128i *)shuffle;
__m128i *pSrc = (__m128i *)src;

for (x = 0; x < (width & ~31); x += 32) {
    __m128i *dst1 = (__m128i *)dstu + x;
    __m128i *dst2 = (__m128i *)dstv + x; 
    x7 = _mm_loadu_si128(pshuffle128);  //    "movdqu (%[shuffle]), %%xmm7\n"
    x0 = _mm_load_si128(pSrc + 0);      //    "movdqa  0(%[src]),   %%xmm0\n"
    x1 = _mm_load_si128(pSrc + 1);      //    "movdqa 16(%[src]),   %%xmm1\n"
    x2 = _mm_load_si128(pSrc + 2);      //    "movdqa 32(%[src]),   %%xmm2\n"
    x3 = _mm_load_si128(pSrc + 3);      //    "movdqa 48(%[src]),   %%xmm3\n"
    x0 = _mm_shuffle_epi8(x0, x7);      //    "pshufb  %%xmm7, %%xmm0\n"
    x1 = _mm_shuffle_epi8(x1, x7);      //    "pshufb  %%xmm7, %%xmm1\n"
    x2 = _mm_shuffle_epi8(x2, x7);      //    "pshufb  %%xmm7, %%xmm2\n"
    x3 = _mm_shuffle_epi8(x3, x7);      //    "pshufb  %%xmm7, %%xmm3\n"
    _mm_storel_epi64(dst1 + 0, x0);     //    "movq   %%xmm0,   0(%[dst1])\n"

The next instruction would be:

movq   %%xmm1,   8(%[dst1])

and I don't know how to specify offset of 8 bytes. Also, I have some doubts that I've correctly translated PSHUFB.

I would be very grateful for any comments and suggestions.

Thanks.

wl2776
  • 4,099
  • 4
  • 35
  • 77
  • I'm not an expert on this stuff, but I believe there are intrinsics that are shared by both compilers, you should aim to use those. – Matt Joiner Jan 11 '12 at 15:19
  • No, there's no such thing. MSVC for x86 can also use inline asm, but MSVC for x64 - can't. – wl2776 Jan 11 '12 at 15:21
  • Meanwhile, I've found 2 errors in my code: (1) I switched dstu and dstv; (2) pSrc must be assigned inside the loop body, to the value (__m128i *)(src+2*x) – wl2776 Jan 12 '12 at 14:41

1 Answer

2

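Simply use a char* pointer for dst, which you can increment by 8 bytes, and cast it to __m128i* in the call to _mm_storel_epi64, similar to how it's done here (search for "_mm_storel_epi64" on that page). For example, with char *dst pointing at the destination row: _mm_storel_epi64((__m128i *)(dst + 8), x1); stores the low 64 bits of x1 at byte offset 8, just like movq %%xmm1, 8(%[dst1]).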

sschuberth
  • 28,386
  • 6
  • 101
  • 146