MSVC compiler with /arch:AVX2 /fp:fast breaks C++ matrix inversion algorithm

Question

I have matrix inversion algorithm and a 19x19 matrix for testing. Running it in Windows Subsystem for Linux with g++ produces the correct inverted matrix.

In Visual Studio 2017 on Windows it used to work as well. However after I upgraded to Visual Studio 2019, I get a matrix containing all zeros as result. Is MSVC broken or what else is wrong here?

So far I have tested replacing ==0.0/!=0.0 (that in some cases can be unsafe due to round-off) with greater/smaller than a tolerance value, but this does not work either.

The expected result (that I get with g++) is {5.26315784E-2,-1.25313282E-2, ... -1.25000000E-1}. I compile with Visual Studio 2019 v142, Windows SDK 10.0.19041.0, Release x64, and I get {0,0,...0}. With Debug x64 I get a different (also wrong) result: all matrix entries are -1.99839720E18. I use C++17 language standard, /O2 /Oi /Ot /Qpar /arch:AVX2 /fp:fast /fp:except-. I have an i7-8700K that supports AVX2.

I also marked the place where it wrongly returns in Windows in the code. Any help is greatly appreciated.

EDIT: I just found out that the cause of the wrong behavior is /arch:AVX2 in combination with /fp:fast. But I don't understand why. /arch:SSE, /arch:SSE2 and /arch:AVX with /fp:fast work correctly, but not /arch:AVX2. How can different round-off or operation order here trigger entirely different behavior?

#include <iostream>
#include <string>
#include <vector>
using namespace std;
typedef unsigned int uint;
void println(const string& s="") {
    cout << s << endl;
}

struct floatNxN {
    uint N{}; // matrix size is NxN
    vector<float> M; // matrix data
    floatNxN(const uint N, const float* M) : N {N}, M(N*N) {
        for(uint i=0; i<N*N; i++) this->M[i] = M[i];
    }
    floatNxN(const uint N, const float x=0.0f) : N {N}, M(N*N, x) { // create matrix filled with zeros
    }
    floatNxN() = default;
    ~floatNxN() = default;
    floatNxN invert() const { // returns inverse matrix
        vector<double> A(2*N*N); // calculating intermediate values as double is strictly necessary
        for(uint i=0; i<N; i++) {
            for(uint j=0; j<  N; j++) A[2*N*i+j] = (double)M[N*i+j];
            for(uint j=N; j<2*N; j++) A[2*N*i+j] = (double)(i+N==j);
        }
        for(uint k=0; k<N-1; k++) { // at iteration k==2, the content of A is already different in MSVC and g++
            if(A[2*N*k+k]==0.0) {
                for(uint i=k+1; i<N; i++) {
                    if(A[2*N*i+k]!=0.0) {
                        for(uint j=0; j<2*N; j++) {
                            const double t = A[2*N*k+j];
                            A[2*N*k+j] = A[2*N*i+j];
                            A[2*N*i+j] = t;
                        }
                        break;
                    } else if(i+1==N) {
                        return floatNxN(N);
                    }
                }
            }
            for(uint i=k+1; i<N; i++) {
                const double t = A[2*N*i+k]/A[2*N*k+k];
                for(uint j=k; j<2*N; j++) A[2*N*i+j] -= A[2*N*k+j]*t;
            }
        }
        double det = 1.0;
        for(uint k=0; k<N; k++) det *= A[2*N*k+k];
        if(det==0.0) {
            return floatNxN(N);
        }
        for(int k=N-1; k>0; k--) {
            for(int i=k-1; i>=0; i--) {
                const double t = A[2*N*i+k]/A[2*N*k+k];
                for(uint j=k; j<2*N; j++) A[2*N*i+j] -= A[2*N*k+j]*t;
            }
        }
        floatNxN r = floatNxN(N);
        for(uint i=0; i<N; i++) {
            const double t = A[2*N*i+i];
            for(uint j=0; j<N; j++) r.M[N*i+j] = (float)(A[2*N*i+N+j]/t);
        }
        return r;
    }
    string stringify() const { // converts matrix into string without spaces or newlines
        string s = "{"+to_string(M[0]);
        for(uint i=1; i<N*N; i++) s += ","+to_string(M[i]);
        return s+"}";
    }
};

int main() {
    const float Md[19*19] = {
          1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        -30,-11,-11,-11,-11,-11,-11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
         12, -4, -4, -4, -4, -4, -4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          0,  1, -1,  0,  0,  0,  0, 1,-1, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0,
          0, -4,  4,  0,  0,  0,  0, 1,-1, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0,
          0,  0,  0,  1, -1,  0,  0, 1,-1, 0, 0, 1,-1,-1, 1, 0, 0, 1,-1,
          0,  0,  0, -4,  4,  0,  0, 1,-1, 0, 0, 1,-1,-1, 1, 0, 0, 1,-1,
          0,  0,  0,  0,  0,  1, -1, 0, 0, 1,-1, 1,-1, 0, 0,-1, 1,-1, 1,
          0,  0,  0,  0,  0, -4,  4, 0, 0, 1,-1, 1,-1, 0, 0,-1, 1,-1, 1,
          0,  2,  2, -1, -1, -1, -1, 1, 1, 1, 1,-2,-2, 1, 1, 1, 1,-2,-2,
          0, -4, -4,  2,  2,  2,  2, 1, 1, 1, 1,-2,-2, 1, 1, 1, 1,-2,-2,
          0,  0,  0,  1,  1, -1, -1, 1, 1,-1,-1, 0, 0, 1, 1,-1,-1, 0, 0,
          0,  0,  0, -2, -2,  2,  2, 1, 1,-1,-1, 0, 0, 1, 1,-1,-1, 0, 0,
          0,  0,  0,  0,  0,  0,  0, 1, 1, 0, 0, 0, 0,-1,-1, 0, 0, 0, 0,
          0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,-1,-1,
          0,  0,  0,  0,  0,  0,  0, 0, 0, 1, 1, 0, 0, 0, 0,-1,-1, 0, 0,
          0,  0,  0,  0,  0,  0,  0, 1,-1,-1, 1, 0, 0, 1,-1,-1, 1, 0, 0,
          0,  0,  0,  0,  0,  0,  0,-1, 1, 0, 0, 1,-1, 1,-1, 0, 0, 1,-1,
          0,  0,  0,  0,  0,  0,  0, 0, 0, 1,-1,-1, 1, 0, 0,-1, 1, 1,-1
    };
    floatNxN M(19, Md);
    floatNxN Mm1 = M.invert();
    println(Mm1.stringify());
}

@ProjectPhysX My only guess is that your CPU doesn't have the Intel Advanced Vector Extensions 2 instructions. — Ted Lyngmo, Oct 11 '21 at 21:35
I still get the correct result with AVX2 enabled on i5-10400 (I don't know what this option is) — Barmak Shemirani, Oct 11 '21 at 21:42
With `/fp:fast /arch:AVX2` I get the same as you. Given the problems shown and "_The compiler may omit rounding at assignment statements, typecasts, or function calls. It may reorder operations or make algebraic transforms, for example, by use of associative and distributive laws. It may reorder code even if such transformations result in observably different rounding behavior._" - I would leave `/fp:fast` out of this. — Ted Lyngmo, Oct 11 '21 at 21:52
@Ted Lyngmo if rounding goes wrong and the order of operations is changed, it partly explains the oddity. Thank you, this is already a great help since I can get it working again now without /fp:fast. But I still don't get it exactly. Isn't a compiler supposed to generate the same outcome regardless of optimization, except for round-off errors? Or does different round-off here trigger entirely different behavior? — ProjectPhysX, Oct 11 '21 at 22:21
Your code needs `double` precision, I couldn't get it to work with `float` at all. Fast float goes against precision and seems to be the problem. Probably other compilers ignore the fastfloat flag for this code, but VS2019 uses it when combined with other options. You would have to modify your code to work with fastfloat, this would slow things down, or just turn this feature off. — Barmak Shemirani, Oct 12 '21 at 03:53

Sami Sallinen · Accepted Answer · 2021-10-12T04:08:00.283

It seems that it is the comparison against literal 0 that breaks the algorithm with the optimizations on. Particularly the 1st one kills it, because the result would need to be exactly 0 for the correct action.

Generally instead of comparing a floating point value against literal zero, it is better to compare the absolute value against a very small constant.

This seems to work:

#include <iostream>
#include <string>
#include <vector>
using namespace std;
typedef unsigned int uint;
void println(const string& s = "") {
  cout << s << endl;
}

struct floatNxN {
  const double epsilon = 1E-12;
  uint N{}; // matrix size is NxN
  vector<float> M; // matrix data
  floatNxN(const uint N, const float* M) : N{ N }, M(N* N) {
    for (uint i = 0; i < N * N; i++) this->M[i] = M[i];
  }
  floatNxN(const uint N, const float x = 0.0f) : N{ N }, M(N* N, x) { // create matrix filled with zeros
  }
  floatNxN() = default;
  ~floatNxN() = default;
  floatNxN invert() const { // returns inverse matrix
    vector<double> A(2 * N * N); // calculating intermediate values as double is strictly necessary
    for (uint i = 0; i < N; i++) {
      for (uint j = 0; j < N; j++) A[2 * N * i + j] = (double)M[N * i + j];
      for (uint j = N; j < 2 * N; j++) A[2 * N * i + j] = (double)(i + N == j);
    }
    for (uint k = 0; k < N - 1; k++) { // at iteration k==2, the content of A is already different in MSVC and g++
      if (fabs(A[2 * N * k + k]) < epsilon) { // comparing with 0 was the killer here
        for (uint i = k + 1; i < N; i++) {
          if (fabs(A[2 * N * i + k]) > epsilon) {
            for (uint j = 0; j < 2 * N; j++) {
              const double t = A[2 * N * k + j];
              A[2 * N * k + j] = A[2 * N * i + j];
              A[2 * N * i + j] = t;
            }
            break;
          } else if (i + 1 == N) {
            return floatNxN(N);
          }
        }
      }
      for (uint i = k + 1; i < N; i++) {
        const double t = A[2 * N * i + k] / A[2 * N * k + k];
        for (uint j = k; j < 2 * N; j++) A[2 * N * i + j] -= A[2 * N * k + j] * t;
      }
    }
    double det = 1.0;
    for (uint k = 0; k < N; k++) det *= A[2 * N * k + k];
    if (fabs(det) < epsilon) {
      return floatNxN(N);
    }
    for (int k = N - 1; k > 0; k--) {
      for (int i = k - 1; i >= 0; i--) {
        const double t = A[2 * N * i + k] / A[2 * N * k + k];
        for (uint j = k; j < 2 * N; j++) A[2 * N * i + j] -= A[2 * N * k + j] * t;
      }
    }
    floatNxN r = floatNxN(N);
    for (uint i = 0; i < N; i++) {
      const double t = A[2 * N * i + i];
      for (uint j = 0; j < N; j++) r.M[N * i + j] = (float)(A[2 * N * i + N + j] / t);
    }
    return r;
  }
  string stringify() const { // converts matrix into string without spaces or newlines
    string s = "{" + to_string(M[0]);
    for (uint i = 1; i < N * N; i++) s += "," + to_string(M[i]);
    return s + "}";
  }
};

int main() {
  const float Md[19 * 19] = {
        1,  1,  1,  1,  1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      -30,-11,-11,-11,-11,-11,-11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       12, -4, -4, -4, -4, -4, -4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0,  1, -1,  0,  0,  0,  0, 1,-1, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0,
        0, -4,  4,  0,  0,  0,  0, 1,-1, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0,
        0,  0,  0,  1, -1,  0,  0, 1,-1, 0, 0, 1,-1,-1, 1, 0, 0, 1,-1,
        0,  0,  0, -4,  4,  0,  0, 1,-1, 0, 0, 1,-1,-1, 1, 0, 0, 1,-1,
        0,  0,  0,  0,  0,  1, -1, 0, 0, 1,-1, 1,-1, 0, 0,-1, 1,-1, 1,
        0,  0,  0,  0,  0, -4,  4, 0, 0, 1,-1, 1,-1, 0, 0,-1, 1,-1, 1,
        0,  2,  2, -1, -1, -1, -1, 1, 1, 1, 1,-2,-2, 1, 1, 1, 1,-2,-2,
        0, -4, -4,  2,  2,  2,  2, 1, 1, 1, 1,-2,-2, 1, 1, 1, 1,-2,-2,
        0,  0,  0,  1,  1, -1, -1, 1, 1,-1,-1, 0, 0, 1, 1,-1,-1, 0, 0,
        0,  0,  0, -2, -2,  2,  2, 1, 1,-1,-1, 0, 0, 1, 1,-1,-1, 0, 0,
        0,  0,  0,  0,  0,  0,  0, 1, 1, 0, 0, 0, 0,-1,-1, 0, 0, 0, 0,
        0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,-1,-1,
        0,  0,  0,  0,  0,  0,  0, 0, 0, 1, 1, 0, 0, 0, 0,-1,-1, 0, 0,
        0,  0,  0,  0,  0,  0,  0, 1,-1,-1, 1, 0, 0, 1,-1,-1, 1, 0, 0,
        0,  0,  0,  0,  0,  0,  0,-1, 1, 0, 0, 1,-1, 1,-1, 0, 0, 1,-1,
        0,  0,  0,  0,  0,  0,  0, 0, 0, 1,-1,-1, 1, 0, 0,-1, 1, 1,-1
  };
  floatNxN M(19, Md);
  floatNxN Mm1 = M.invert();
  println(Mm1.stringify());
}

With this change, it worked to compile with `/fp:fast /arch:AVX2`. — Ted Lyngmo, Oct 12 '21 at 05:16
Yes this works, thank you. I have tested it with epsilon before without success. My mistake was that I tried epsilon only at 2.2E-16 and 4.4E-16, which was too small. — ProjectPhysX, Oct 12 '21 at 07:10

MSVC compiler with /arch:AVX2 /fp:fast breaks C++ matrix inversion algorithm

1 Answers1