Why can't I remove _mm_empty()?

Question

I have a c++ function with some SSE2 instructions. The problem is i am getting the following linker error when compiling this code using microsoft visual c++:

unresolved external symbol _m_empty referenced in function "void * __cdecl process(void *)"

And when i comment _m_empty, i'll get a runtime error! But it should use for MMX instructions, shouldn't it?

#include "mex.h"
#include <pthread.h>
#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>

#define malloc_aligned(a,b) _aligned_malloc(a,b)
#define IS_ALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

#define NUM_FEATURES 32
#define __attribute__(A) /* do nothing */

/*
 * This code is used for computing filter responses.  It computes the
 * response of a set of filters with a feature map.  
 *
 * Multithreaded version.
 */

struct thread_data {
  float *A;
  float *B;
  double *C;
  mxArray *mxC;
  const mwSize *A_dims;
  const mwSize *B_dims;
  mwSize C_dims[2];
};

// convolve A and B
void *process(void *thread_arg) {
  thread_data *args = (thread_data *)thread_arg;
  float *A = args->A;
  float *B = args->B;
  double *C = args->C;
  const mwSize *A_dims = args->A_dims;
  const mwSize *B_dims = args->B_dims;
  const mwSize *C_dims = args->C_dims;

  __m128 a,b,c;
  double *dst = C;
  for (int x = 0; x < C_dims[1]; x++) {
    for (int y = 0; y < C_dims[0]; y++) {
      __m128 v = _mm_setzero_ps();
      const float *A_src = A + y*NUM_FEATURES + x*A_dims[0]*NUM_FEATURES;
      const float *B_src = B;
      for (int xp = 0; xp < B_dims[1]; xp++) {
        const float *A_off = A_src;
        const float *B_off = B_src;
        for (int yp = 0; yp < B_dims[0]; yp++) {
          a = _mm_load_ps(A_off+0);
          b = _mm_load_ps(B_off+0);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+4);
          b = _mm_load_ps(B_off+4);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+8);
          b = _mm_load_ps(B_off+8);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+12);
          b = _mm_load_ps(B_off+12);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+16);
          b = _mm_load_ps(B_off+16);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+20);
          b = _mm_load_ps(B_off+20);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+24);
          b = _mm_load_ps(B_off+24);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+28);
          b = _mm_load_ps(B_off+28);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          // N.B. Unroll me more/less if you change NUM_FEATURES

          A_off += NUM_FEATURES;
          B_off += NUM_FEATURES;
        }

        A_src += A_dims[0]*NUM_FEATURES;
        B_src += B_dims[0]*NUM_FEATURES;
      }
      // buf[] must be 16-byte aligned
      float buf[4] __attribute__ ((aligned (16)));
      _mm_store_ps(buf, v);
      _mm_empty();
      *(dst++) = buf[0]+buf[1]+buf[2]+buf[3];
    }
  }
  pthread_exit(NULL);
  return 0;
}

float *prepare(float *in, const int *dims) {
  float *F = (float *)malloc_aligned(16, dims[0]*dims[1]*NUM_FEATURES*sizeof(float));
  // Sanity check that memory is aligned
  if (!IS_ALIGNED(F))
    mexErrMsgTxt("Memory not aligned");

  float *p = F;
  for (int x = 0; x < dims[1]; x++) {
    for (int y = 0; y < dims[0]; y++) {
      for (int f = 0; f < dims[2]; f++)
        *(p++) = in[y + f*dims[0]*dims[1] + x*dims[0]];
      for (int f = dims[2]; f < NUM_FEATURES; f++)
        *(p++) = 0;
    }
  }
  return F;
}

// matlab entry point
// C = fconv(A, cell of B, start, end);
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { 
  if (nrhs != 4)
    mexErrMsgTxt("Wrong number of inputs"); 
  if (nlhs != 1)
    mexErrMsgTxt("Wrong number of outputs");

  // get A
  const mxArray *mxA = prhs[0];
  if (mxGetNumberOfDimensions(mxA) != 3 || 
      mxGetClassID(mxA) != mxSINGLE_CLASS)
    mexErrMsgTxt("Invalid input: A");

  // get B and start/end
  const mxArray *cellB = prhs[1];
  mwSize num_bs = mxGetNumberOfElements(cellB);  
  int start = (int)mxGetScalar(prhs[2]) - 1;
  int end = (int)mxGetScalar(prhs[3]) - 1;
  if (start < 0 || end >= num_bs || start > end)
    mexErrMsgTxt("Invalid input: start/end");
  int len = end-start+1;

  // start threads
  thread_data *td = (thread_data *)mxCalloc(len, sizeof(thread_data));
  pthread_t *ts = (pthread_t *)mxCalloc(len, sizeof(pthread_t));
  const mwSize *A_dims = mxGetDimensions(mxA);
  float *A = prepare((float *)mxGetPr(mxA), A_dims);
  for (int i = 0; i < len; i++) {
    const mxArray *mxB = mxGetCell(cellB, i+start);
    td[i].A_dims = A_dims;
    td[i].A = A;
    td[i].B_dims = mxGetDimensions(mxB);
    td[i].B = prepare((float *)mxGetPr(mxB), td[i].B_dims);
    if (mxGetNumberOfDimensions(mxB) != 3 ||
        mxGetClassID(mxB) != mxSINGLE_CLASS ||
        td[i].A_dims[2] != td[i].B_dims[2])
      mexErrMsgTxt("Invalid input: B");

    // compute size of output
    int height = td[i].A_dims[0] - td[i].B_dims[0] + 1;
    int width = td[i].A_dims[1] - td[i].B_dims[1] + 1;
    if (height < 1 || width < 1)
      mexErrMsgTxt("Invalid input: B should be smaller than A");
    td[i].C_dims[0] = height;
    td[i].C_dims[1] = width;
    td[i].mxC = mxCreateNumericArray(2, td[i].C_dims, mxDOUBLE_CLASS, mxREAL);
    td[i].C = (double *)mxGetPr(td[i].mxC);

    if (pthread_create(&ts[i], NULL, process, (void *)&td[i]))
      mexErrMsgTxt("Error creating thread");  
  }

  // wait for the treads to finish and set return values
  void *status;
  plhs[0] = mxCreateCellMatrix(1, len);
  for (int i = 0; i < len; i++) {
    pthread_join(ts[i], &status);
    mxSetCell(plhs[0], i, td[i].mxC);
    free(td[i].B);
  }
  mxFree(td);
  mxFree(ts);
  free(A);
}

The unrolling there is bad, you've kept the dependency chain on `v` so it's roughly equivalent to not unrolling. — harold, Dec 16 '15 at 12:13
Yes. It doesn't really cost anything to have a variable, as long as they don't have to be spilled to the stack. The point is to get more overlap between the iteration of the "not unrolled"-loop, they overlap now such that it still takes 3 cycles per iteration (same as when not unrolled at all), because of the loop carried dependency on `v`. It could run 3 times as quickly with at least 3 different independent `v`'s — harold, Dec 16 '15 at 14:12
I got your point. But the problem now is the _mm_empty() that i want to remove or replace by something enable to compile by x64 compiler. — Mbt925, Dec 16 '15 at 14:50
Completely remove _mm_empty, don't replace it by anything. It's a vestigial leftover from the MMX era, nearly 20 years ago now. If something is going wrong when you leave it out, it will be completely unrelated to _mm_empty. It's a red herring. For example, are those load addresses really aligned? — harold, Dec 16 '15 at 14:53
I think @harold nailed it - you probably have an alignment issue which only shows up when you comment out `_mm_empty` - get rid of `_mm_empty`, fix any alignment issues, and you should be fine. — Paul R, Dec 16 '15 at 15:30
Is `_m_empty` a typo in your question? I thought the issue was going to be that you had that typo in your code, and it was treated as an implicitly-declared external function, creating an error at link time. **What runtime error do you get** without a `_mm_empty()` intrinsic? Does some of your other code use MMX, and this is just cleaning up from it? BTW, using `EMMS` inside the loop to allow for some scalar math looks horrible for performance. On Haswell, it's 31 uops and has a throughput of one per 13 cycles. You *don't* want to run it if you don't have to. — Peter Cordes, Dec 17 '15 at 00:52
@PeterCordes No, it's not a typo. I updated my question with the full code. There are no MMS instructions. — Mbt925, Dec 17 '15 at 12:54
@PaulR I checked the code but didn't find any problem. Can you check it too please? — Mbt925, Dec 17 '15 at 12:55
@Mbt925: set a breakpoint in your debugger and check the alignment when you get to e.g. `_mm_load_ps`. Or just change all the aligned loads/stores to unaligned and see if that fixes things. — Paul R, Dec 17 '15 at 14:52
Turns out [`_m_empty` is a synonym for the standard `_mm_empty`](http://stackoverflow.com/a/32414070/224132). I checked, and gcc on Ubuntu provides it, too. It's the intrinsic for the `EMMS` instruction: Empty MMX State. There's no `MMS` instruction. Did you mean to say "there are no MMX instructions?" Because `emms` doesn't have any mmx register arguments, so you won't find it the normal way. — Peter Cordes, Dec 17 '15 at 19:32
Yes, it's a synonym for _mm_empty and yes i meant MMX, that was a typo! — Mbt925, Dec 17 '15 at 19:39

score 2 · Answer 1 · answered Dec 16 '15 at 09:34

2

According to this link, MMX isn't implemented for x64. Use full-blown SSE2 n x64.

answered Dec 16 '15 at 09:34

Martin Bonner supports Monica

28,528
3
51
88

I think this function uses SSE2 and not MMX. Am i right? – Mbt925 Dec 16 '15 at 09:49
But _mm_empty is an MMX function. – Martin Bonner supports Monica Dec 16 '15 at 09:53
1

I know, But I said "And when i comment _m_empty, i'll get a runtime error!". Why? – Mbt925 Dec 16 '15 at 09:58
Because you can't randomly comment out lines of your program and expect it still to work? – Lightness Races in Orbit Dec 16 '15 at 10:02
The x86-64 ISA can still run MMX instructions in 64bit mode. Are you claiming that MS's compiler doesn't recognize the intrinsics for 64bit targets? Or that Windows won't run 64bit code that uses the MMX registers? The link is dead. – Peter Cordes Dec 17 '15 at 00:47
Link worked for me just now. The answer there is " Well, these instructions are done in the form of instrinsic functions. So when the compiler sees _m_empty it would generate the code to call the instruction automatically. But AFAIK these instructions are not supported by the x64 compiler." – Martin Bonner supports Monica Dec 17 '15 at 06:22
@MartinBonner: yup, link works again now. It does sound like some people are guessing that MSVC doesn't support MMX intrinsics for x64 builds. – Peter Cordes Dec 17 '15 at 19:50

score 1 · Answer 2 · edited May 23 '17 at 10:27

I think the _mm_empty in your code turns into a reference to _m_empty, because they're synonyms, and your build environment still has some header with #define _mm_empty _m_empty or something.

But strangely, your build environment doesn't actually provide a definition for the intrinsic. Is there a compiler warning about it being implicitly declared? It's strange because I'd expect that a total lack of MMX support would mean the _mm_empty / _m_empty equivalency wouldn't be there either.

The runtime error is probably unrelated to this. Like Paul R pointed out in a comment, you're assuming that if you could get the unmodified source to compile, it would work. That's probably not the case, because that _mm_empty looks unnecessary to the x86 asm experts (including me) that have commented on this question.

I think Paul R's guess that you might have an unaligned pointer somewhere sounds reasonable. Is the alignment of any of those arrays dependent on the sizeof a pointer? If struct thread_data is something like:

struct thread_data {
    some_type *ptr1;
    some_type *ptr2;
    int a;
    int b;
    float A[1024];
    float B[1024];
    ...
};

Then 32bit builds would have 16B-aligned arrays, but 64bit builds wouldn't.

So debug your runtime error, and find out what it is. We can't help you if all you tell us is "runtime error". If you'd told us what it was in the first place, we could tell you one way or the other whether it might be related to removing a _mm_empty.

score 0 · Accepted Answer · answered Dec 18 '15 at 17:49

Thanks to harold, Paul and Peter, i found the problem! You were right, the runtime error was unrelated to _mm_empty! The problem was _aligned_malloc input parameters order. The runtime error disappeared when i swapped the inputs.

Another mistake was the free() function. _aligned_free() have to be used for freeing up aligned memory.

As harold suggested, i changed the main loop to use three independent v's. If i did wrong, correct me please. Now the program (not the function!) runs 300ms faster (2.4s -> 2.1s).

void *process(void *thread_arg) {
  thread_data *args = (thread_data *)thread_arg;
  float *A = args->A;
  float *B = args->B;
  double *C = args->C;
  const mwSize *A_dims = args->A_dims;
  const mwSize *B_dims = args->B_dims;
  const mwSize *C_dims = args->C_dims;

  __m128 a,b,c;
  double *dst = C;
  for (int x = 0; x < C_dims[1]; x++) {
    for (int y = 0; y < C_dims[0]; y++) {
      __m128 v = _mm_setzero_ps(), v1 = _mm_setzero_ps(), v2 = _mm_setzero_ps(), v3 = _mm_setzero_ps();     
      const float *A_src = A + y*NUM_FEATURES + x*A_dims[0]*NUM_FEATURES;
      const float *B_src = B;

      for (int xp = 0; xp < B_dims[1]; xp++) {
        const float *A_off = A_src;
        const float *B_off = B_src;
        for (int yp = 0; yp < B_dims[0]; yp++) {
          a = _mm_load_ps(A_off+0);
          b = _mm_load_ps(B_off+0);
          c = _mm_mul_ps(a, b);
          v1 = _mm_add_ps(v1, c);

          a = _mm_load_ps(A_off+4);
          b = _mm_load_ps(B_off+4);
          c = _mm_mul_ps(a, b);
          v2 = _mm_add_ps(v2, c);

          a = _mm_load_ps(A_off+8);
          b = _mm_load_ps(B_off+8);
          c = _mm_mul_ps(a, b);
          v3 = _mm_add_ps(v3, c);

          a = _mm_load_ps(A_off+12);
          b = _mm_load_ps(B_off+12);
          c = _mm_mul_ps(a, b);
          v1 = _mm_add_ps(v1, c);

          a = _mm_load_ps(A_off+16);
          b = _mm_load_ps(B_off+16);
          c = _mm_mul_ps(a, b);
          v2 = _mm_add_ps(v2, c);

          a = _mm_load_ps(A_off+20);
          b = _mm_load_ps(B_off+20);
          c = _mm_mul_ps(a, b);
          v3 = _mm_add_ps(v3, c);

          a = _mm_load_ps(A_off+24);
          b = _mm_load_ps(B_off+24);
          c = _mm_mul_ps(a, b);
          v1 = _mm_add_ps(v1, c);

          a = _mm_load_ps(A_off+28);
          b = _mm_load_ps(B_off+28);
          c = _mm_mul_ps(a, b);
          v2 = _mm_add_ps(v2, c);

          // N.B. Unroll me more/less if you change NUM_FEATURES

          A_off += NUM_FEATURES;
          B_off += NUM_FEATURES;
        }
        A_src += A_dims[0]*NUM_FEATURES;
        B_src += B_dims[0]*NUM_FEATURES;
      }
        v = _mm_add_ps(v, v1);
        v = _mm_add_ps(v, v2);
        v = _mm_add_ps(v, v3);

      // buf[] must be 16-byte aligned
      __declspec(align(16)) float buf[4];
      _mm_store_ps(buf, v);

      *(dst++) = buf[0]+buf[1]+buf[2]+buf[3];
    }
  }
  pthread_exit(NULL);
  return 0;
}

That didn't ping me because I wasn't in the comments to this answer, but yes, looks better - maybe try it with 4 v's too. — harold, Dec 19 '15 at 16:42

Why can't I remove _mm_empty()?

3 Answers3