2

I have a c++ function with some SSE2 instructions. The problem is i am getting the following linker error when compiling this code using microsoft visual c++:

unresolved external symbol _m_empty referenced in function "void * __cdecl process(void *)"

And when i comment _m_empty, i'll get a runtime error! But it should use for MMX instructions, shouldn't it?

#include "mex.h"
#include <pthread.h>
#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>

#define malloc_aligned(a,b) _aligned_malloc(a,b)
#define IS_ALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

#define NUM_FEATURES 32
#define __attribute__(A) /* do nothing */

/*
 * This code is used for computing filter responses.  It computes the
 * response of a set of filters with a feature map.  
 *
 * Multithreaded version.
 */

struct thread_data {
  float *A;
  float *B;
  double *C;
  mxArray *mxC;
  const mwSize *A_dims;
  const mwSize *B_dims;
  mwSize C_dims[2];
};

// convolve A and B
void *process(void *thread_arg) {
  thread_data *args = (thread_data *)thread_arg;
  float *A = args->A;
  float *B = args->B;
  double *C = args->C;
  const mwSize *A_dims = args->A_dims;
  const mwSize *B_dims = args->B_dims;
  const mwSize *C_dims = args->C_dims;

  __m128 a,b,c;
  double *dst = C;
  for (int x = 0; x < C_dims[1]; x++) {
    for (int y = 0; y < C_dims[0]; y++) {
      __m128 v = _mm_setzero_ps();
      const float *A_src = A + y*NUM_FEATURES + x*A_dims[0]*NUM_FEATURES;
      const float *B_src = B;
      for (int xp = 0; xp < B_dims[1]; xp++) {
        const float *A_off = A_src;
        const float *B_off = B_src;
        for (int yp = 0; yp < B_dims[0]; yp++) {
          a = _mm_load_ps(A_off+0);
          b = _mm_load_ps(B_off+0);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+4);
          b = _mm_load_ps(B_off+4);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+8);
          b = _mm_load_ps(B_off+8);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+12);
          b = _mm_load_ps(B_off+12);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+16);
          b = _mm_load_ps(B_off+16);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+20);
          b = _mm_load_ps(B_off+20);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+24);
          b = _mm_load_ps(B_off+24);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          a = _mm_load_ps(A_off+28);
          b = _mm_load_ps(B_off+28);
          c = _mm_mul_ps(a, b);
          v = _mm_add_ps(v, c);

          // N.B. Unroll me more/less if you change NUM_FEATURES

          A_off += NUM_FEATURES;
          B_off += NUM_FEATURES;
        }

        A_src += A_dims[0]*NUM_FEATURES;
        B_src += B_dims[0]*NUM_FEATURES;
      }
      // buf[] must be 16-byte aligned
      float buf[4] __attribute__ ((aligned (16)));
      _mm_store_ps(buf, v);
      _mm_empty();
      *(dst++) = buf[0]+buf[1]+buf[2]+buf[3];
    }
  }
  pthread_exit(NULL);
  return 0;
}

float *prepare(float *in, const int *dims) {
  float *F = (float *)malloc_aligned(16, dims[0]*dims[1]*NUM_FEATURES*sizeof(float));
  // Sanity check that memory is aligned
  if (!IS_ALIGNED(F))
    mexErrMsgTxt("Memory not aligned");

  float *p = F;
  for (int x = 0; x < dims[1]; x++) {
    for (int y = 0; y < dims[0]; y++) {
      for (int f = 0; f < dims[2]; f++)
        *(p++) = in[y + f*dims[0]*dims[1] + x*dims[0]];
      for (int f = dims[2]; f < NUM_FEATURES; f++)
        *(p++) = 0;
    }
  }
  return F;
}

// matlab entry point
// C = fconv(A, cell of B, start, end);
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { 
  if (nrhs != 4)
    mexErrMsgTxt("Wrong number of inputs"); 
  if (nlhs != 1)
    mexErrMsgTxt("Wrong number of outputs");

  // get A
  const mxArray *mxA = prhs[0];
  if (mxGetNumberOfDimensions(mxA) != 3 || 
      mxGetClassID(mxA) != mxSINGLE_CLASS)
    mexErrMsgTxt("Invalid input: A");

  // get B and start/end
  const mxArray *cellB = prhs[1];
  mwSize num_bs = mxGetNumberOfElements(cellB);  
  int start = (int)mxGetScalar(prhs[2]) - 1;
  int end = (int)mxGetScalar(prhs[3]) - 1;
  if (start < 0 || end >= num_bs || start > end)
    mexErrMsgTxt("Invalid input: start/end");
  int len = end-start+1;

  // start threads
  thread_data *td = (thread_data *)mxCalloc(len, sizeof(thread_data));
  pthread_t *ts = (pthread_t *)mxCalloc(len, sizeof(pthread_t));
  const mwSize *A_dims = mxGetDimensions(mxA);
  float *A = prepare((float *)mxGetPr(mxA), A_dims);
  for (int i = 0; i < len; i++) {
    const mxArray *mxB = mxGetCell(cellB, i+start);
    td[i].A_dims = A_dims;
    td[i].A = A;
    td[i].B_dims = mxGetDimensions(mxB);
    td[i].B = prepare((float *)mxGetPr(mxB), td[i].B_dims);
    if (mxGetNumberOfDimensions(mxB) != 3 ||
        mxGetClassID(mxB) != mxSINGLE_CLASS ||
        td[i].A_dims[2] != td[i].B_dims[2])
      mexErrMsgTxt("Invalid input: B");

    // compute size of output
    int height = td[i].A_dims[0] - td[i].B_dims[0] + 1;
    int width = td[i].A_dims[1] - td[i].B_dims[1] + 1;
    if (height < 1 || width < 1)
      mexErrMsgTxt("Invalid input: B should be smaller than A");
    td[i].C_dims[0] = height;
    td[i].C_dims[1] = width;
    td[i].mxC = mxCreateNumericArray(2, td[i].C_dims, mxDOUBLE_CLASS, mxREAL);
    td[i].C = (double *)mxGetPr(td[i].mxC);

    if (pthread_create(&ts[i], NULL, process, (void *)&td[i]))
      mexErrMsgTxt("Error creating thread");  
  }

  // wait for the treads to finish and set return values
  void *status;
  plhs[0] = mxCreateCellMatrix(1, len);
  for (int i = 0; i < len; i++) {
    pthread_join(ts[i], &status);
    mxSetCell(plhs[0], i, td[i].mxC);
    free(td[i].B);
  }
  mxFree(td);
  mxFree(ts);
  free(A);
}
Mbt925
  • 1,317
  • 1
  • 16
  • 31
  • 1
    The unrolling there is bad, you've kept the dependency chain on `v` so it's roughly equivalent to not unrolling. – harold Dec 16 '15 at 12:13
  • What's you suggestion? – Mbt925 Dec 16 '15 at 12:14
  • Use several different `v`'s and add them in the end. – harold Dec 16 '15 at 12:43
  • @harold But there'll be many v's!!! Is it efficient? – Mbt925 Dec 16 '15 at 13:59
  • 2
    Yes. It doesn't really cost anything to have a variable, as long as they don't have to be spilled to the stack. The point is to get more overlap between the iteration of the "not unrolled"-loop, they overlap now such that it still takes 3 cycles per iteration (same as when not unrolled at all), because of the loop carried dependency on `v`. It could run 3 times as quickly with at least 3 different independent `v`'s – harold Dec 16 '15 at 14:12
  • I got your point. But the problem now is the _mm_empty() that i want to remove or replace by something enable to compile by x64 compiler. – Mbt925 Dec 16 '15 at 14:50
  • 3
    Completely remove _mm_empty, don't replace it by anything. It's a vestigial leftover from the MMX era, nearly 20 years ago now. If something is going wrong when you leave it out, it will be completely unrelated to _mm_empty. It's a red herring. For example, are those load addresses really aligned? – harold Dec 16 '15 at 14:53
  • 1
    I think @harold nailed it - you probably have an alignment issue which only shows up when you comment out `_mm_empty` - get rid of `_mm_empty`, fix any alignment issues, and you should be fine. – Paul R Dec 16 '15 at 15:30
  • Thanks my friends. I will check the code. – Mbt925 Dec 16 '15 at 16:35
  • 2
    Is `_m_empty` a typo in your question? I thought the issue was going to be that you had that typo in your code, and it was treated as an implicitly-declared external function, creating an error at link time. **What runtime error do you get** without a `_mm_empty()` intrinsic? Does some of your other code use MMX, and this is just cleaning up from it? BTW, using `EMMS` inside the loop to allow for some scalar math looks horrible for performance. On Haswell, it's 31 uops and has a throughput of one per 13 cycles. You *don't* want to run it if you don't have to. – Peter Cordes Dec 17 '15 at 00:52
  • @PeterCordes No, it's not a typo. I updated my question with the full code. There are no MMS instructions. – Mbt925 Dec 17 '15 at 12:54
  • @PaulR I checked the code but didn't find any problem. Can you check it too please? – Mbt925 Dec 17 '15 at 12:55
  • 1
    @Mbt925: set a breakpoint in your debugger and check the alignment when you get to e.g. `_mm_load_ps`. Or just change all the aligned loads/stores to unaligned and see if that fixes things. – Paul R Dec 17 '15 at 14:52
  • Turns out [`_m_empty` is a synonym for the standard `_mm_empty`](http://stackoverflow.com/a/32414070/224132). I checked, and gcc on Ubuntu provides it, too. It's the intrinsic for the `EMMS` instruction: Empty MMX State. There's no `MMS` instruction. Did you mean to say "there are no MMX instructions?" Because `emms` doesn't have any mmx register arguments, so you won't find it the normal way. – Peter Cordes Dec 17 '15 at 19:32
  • Yes, it's a synonym for _mm_empty and yes i meant MMX, that was a typo! – Mbt925 Dec 17 '15 at 19:39

3 Answers3

2

According to this link, MMX isn't implemented for x64. Use full-blown SSE2 n x64.

1

I think the _mm_empty in your code turns into a reference to _m_empty, because they're synonyms, and your build environment still has some header with #define _mm_empty _m_empty or something.

But strangely, your build environment doesn't actually provide a definition for the intrinsic. Is there a compiler warning about it being implicitly declared? It's strange because I'd expect that a total lack of MMX support would mean the _mm_empty / _m_empty equivalency wouldn't be there either.


The runtime error is probably unrelated to this. Like Paul R pointed out in a comment, you're assuming that if you could get the unmodified source to compile, it would work. That's probably not the case, because that _mm_empty looks unnecessary to the x86 asm experts (including me) that have commented on this question.

I think Paul R's guess that you might have an unaligned pointer somewhere sounds reasonable. Is the alignment of any of those arrays dependent on the sizeof a pointer? If struct thread_data is something like:

struct thread_data {
    some_type *ptr1;
    some_type *ptr2;
    int a;
    int b;
    float A[1024];
    float B[1024];
    ...
};

Then 32bit builds would have 16B-aligned arrays, but 64bit builds wouldn't.

So debug your runtime error, and find out what it is. We can't help you if all you tell us is "runtime error". If you'd told us what it was in the first place, we could tell you one way or the other whether it might be related to removing a _mm_empty.

Community
  • 1
  • 1
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
0

Thanks to harold, Paul and Peter, i found the problem! You were right, the runtime error was unrelated to _mm_empty! The problem was _aligned_malloc input parameters order. The runtime error disappeared when i swapped the inputs.

Another mistake was the free() function. _aligned_free() have to be used for freeing up aligned memory.

As harold suggested, i changed the main loop to use three independent v's. If i did wrong, correct me please. Now the program (not the function!) runs 300ms faster (2.4s -> 2.1s).

void *process(void *thread_arg) {
  thread_data *args = (thread_data *)thread_arg;
  float *A = args->A;
  float *B = args->B;
  double *C = args->C;
  const mwSize *A_dims = args->A_dims;
  const mwSize *B_dims = args->B_dims;
  const mwSize *C_dims = args->C_dims;

  __m128 a,b,c;
  double *dst = C;
  for (int x = 0; x < C_dims[1]; x++) {
    for (int y = 0; y < C_dims[0]; y++) {
      __m128 v = _mm_setzero_ps(), v1 = _mm_setzero_ps(), v2 = _mm_setzero_ps(), v3 = _mm_setzero_ps();     
      const float *A_src = A + y*NUM_FEATURES + x*A_dims[0]*NUM_FEATURES;
      const float *B_src = B;

      for (int xp = 0; xp < B_dims[1]; xp++) {
        const float *A_off = A_src;
        const float *B_off = B_src;
        for (int yp = 0; yp < B_dims[0]; yp++) {
          a = _mm_load_ps(A_off+0);
          b = _mm_load_ps(B_off+0);
          c = _mm_mul_ps(a, b);
          v1 = _mm_add_ps(v1, c);

          a = _mm_load_ps(A_off+4);
          b = _mm_load_ps(B_off+4);
          c = _mm_mul_ps(a, b);
          v2 = _mm_add_ps(v2, c);

          a = _mm_load_ps(A_off+8);
          b = _mm_load_ps(B_off+8);
          c = _mm_mul_ps(a, b);
          v3 = _mm_add_ps(v3, c);

          a = _mm_load_ps(A_off+12);
          b = _mm_load_ps(B_off+12);
          c = _mm_mul_ps(a, b);
          v1 = _mm_add_ps(v1, c);

          a = _mm_load_ps(A_off+16);
          b = _mm_load_ps(B_off+16);
          c = _mm_mul_ps(a, b);
          v2 = _mm_add_ps(v2, c);

          a = _mm_load_ps(A_off+20);
          b = _mm_load_ps(B_off+20);
          c = _mm_mul_ps(a, b);
          v3 = _mm_add_ps(v3, c);

          a = _mm_load_ps(A_off+24);
          b = _mm_load_ps(B_off+24);
          c = _mm_mul_ps(a, b);
          v1 = _mm_add_ps(v1, c);

          a = _mm_load_ps(A_off+28);
          b = _mm_load_ps(B_off+28);
          c = _mm_mul_ps(a, b);
          v2 = _mm_add_ps(v2, c);

          // N.B. Unroll me more/less if you change NUM_FEATURES

          A_off += NUM_FEATURES;
          B_off += NUM_FEATURES;
        }
        A_src += A_dims[0]*NUM_FEATURES;
        B_src += B_dims[0]*NUM_FEATURES;
      }
        v = _mm_add_ps(v, v1);
        v = _mm_add_ps(v, v2);
        v = _mm_add_ps(v, v3);

      // buf[] must be 16-byte aligned
      __declspec(align(16)) float buf[4];
      _mm_store_ps(buf, v);

      *(dst++) = buf[0]+buf[1]+buf[2]+buf[3];
    }
  }
  pthread_exit(NULL);
  return 0;
}
Mbt925
  • 1,317
  • 1
  • 16
  • 31