0

I'm trying to convert a function that finds the nth root in C for a double value from the following link http://rosettacode.org/wiki/Nth_root#C to find the nth root for 8 floats at once using AVX.

Part of that code uses DBL_EPSILON * 10. However, when I convert this to use float with AVX I have to use FLT_EPSILON*1000 or the code hangs and does not converge. When I print out FLT_EPSILON I see it is order 1E-7. But this link, http://www.cplusplus.com/reference/cfloat/ , says it should be 1E-5. When I print out DBL_EPSILON it's 1E-16 but the link says it should only be 1E-9. What's going on?

Here is the code so far (not fully optimized).

#include <stdio.h>
#include <float.h>
#include <immintrin.h>                 // AVX

inline double abs_(double x) { 
    return x >= 0 ? x : -x; 
}

double pow_(double x, int e)
{
    double ret = 1;
    for (ret = 1; e; x *= x, e >>= 1) {
        if ((e & 1)) ret *= x;
    }
    return ret;
}

double root(double a, int n)
{
    double d, x = 1;
    x = a/n;
    if (!a) return 0;
    //if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */

    int cnt = 0;
    do {
        cnt++;  
        d = (a / pow_(x, n - 1) - x) / n;
        x+= d;
    } while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
    printf("%d\n", cnt);    

    return x;
}


__m256 pow_avx(__m256 x, int e) {
    __m256 ret = _mm256_set1_ps(1.0f);
    for (; e; x = _mm256_mul_ps(x,x), e >>= 1) {
        if ((e & 1)) ret = _mm256_mul_ps(x,ret);
    }
    return ret;
}

inline __m256 abs_avx (__m256 x) { 
    return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), x), x);
    //return x >= 0 ? x : -x; 
}

int get_mask(const __m256 d, const __m256 x) {
    __m256 ad = abs_avx(d);
    __m256 ax = abs_avx(x);
    __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(ad, ax, _CMP_GT_OQ));
    return _mm_movemask_epi8(_mm256_castsi256_si128(mask)) + _mm_movemask_epi8(_mm256_extractf128_si256(mask,1));
}

__m256 root_avx(__m256 a, int n) {
    printf("%e\n", FLT_EPSILON);
    printf("%e\n", DBL_EPSILON);
    printf("%e\n", FLT_EPSILON*1000.0f);
    __m256 d;
    __m256 x = _mm256_set1_ps(1.0f);
    //if (!a) return 0;
    //if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */
    __m256 in = _mm256_set1_ps(1.0f/n);
    __m256 xtmp;
    do {
        d = _mm256_rcp_ps(pow_avx(x, n - 1));
        d = _mm256_sub_ps(_mm256_mul_ps(a,d),x);
        d = _mm256_mul_ps(d,in);
        //d = (a / pow_avx(x, n - 1) - x) / n;
        x = _mm256_add_ps(x, d); //x+= d;
        xtmp =_mm256_mul_ps(x, _mm256_set1_ps(FLT_EPSILON*100.0f));
    //} while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
    } while (get_mask(d, xtmp)); 
    return x;
}

int main()
{
    __m256 d = _mm256_set1_ps(16.0f);
    __m256 out = root_avx(d, 4);
    float result[8];
    int i;
    _mm256_storeu_ps(result, out);

    for(i=0; i<8; i++) {
        printf("%f\n", result[i]);
    } printf("\n");

    //double x = 16;
    //printf("root(%g, 15) = %g\n", x, root(x, 4));

    //double x = pow_(-3.14159, 15);
    //printf("root(%g, 15) = %g\n", x, root(x, 15));
    return 0;
}
  • For single precision you definitely want `FLT_EPSILON` - the value should be 2^-23 = 1.1920929e-7 for IEEE-754 floats. – Paul R Jun 14 '13 at 14:02
  • But the scalar code for the link converges for 10*DBL_EPSILON and for float with SSE I need 1000*FLT_EPSILON for convergence, that's why I'm confused. –  Jun 14 '13 at 14:05
  • In normal scalar code floating point expressions are promoted to double precision - with SSE though there is no such promotion. – Paul R Jun 14 '13 at 14:07
  • The scalar code is using double anyway. I thought it might have something to do with 80bit precision. –  Jun 14 '13 at 14:08
  • The best accuracy I can get for the 4th root of 16 is 1.99910. Maybe there is a bug in my code. Shouldn't I be able to get it to within E-6 accuracy? –  Jun 14 '13 at 14:10
  • Try `FLT_EPSILON*10` ? – Paul R Jun 14 '13 at 14:11
  • I did, it does not converge. I have to use FLT_EPSION*1000. –  Jun 14 '13 at 14:11
  • I guess I would want to simulate the algo first to see what's going on in terms of no of iterations and convergence etc. – Paul R Jun 14 '13 at 14:13
  • I added the full code. You should be able to compile that and run if you want. I'll be back online later today or tomorrow. –  Jun 14 '13 at 14:14
  • You could compute root faster as `root(a, n) = exp(log(a)/n)` – Marat Dukhan Jun 14 '13 at 14:39
  • Okay, I'll look into that. But then I have to implement an exp and a log function in AVX. I don't have those implemented yet. –  Jun 14 '13 at 17:49

2 Answers2

3

_mm256_rcp_ps, which maps to the rcpps instruction, performs only an approximate reciprocal. The Intel 64 and IA-32 Architectures Software Developer’s Manual says its relative error may be up to 1.5•2-12. This is insufficient to cause the root finder to converge with accuracy 100*FLT_EPSILON.

You could use an exact division, such as:

d = pow_avx(x, n-1);
d = _mm256_sub_ps(_mm256_div_ps(a, d), x);

or add some refinement steps for the reciprocal estimate.

Incidentally, if your compiler supports using regular C operators with SIMD objects, consider using the regular C operators instead:

d = pow_avx(x, n-1);
d = a/d - x;
Eric Postpischil
  • 195,579
  • 13
  • 168
  • 312
  • That fixed it!. I started using _mm256_rcp_ps yesterday. I was not aware of its inaccuracy. –  Jun 14 '13 at 17:47
  • Some compilers support regular C operators with SIMD objects? I was not aware of that. Which ones do this? I use GCC, ICC, and MSVC. –  Jun 14 '13 at 19:42
  • @raxman: Apple’s version of GCC does. I do not know whether the changes to support that were ported back to the main GNU GCC, but I would think it more likely than not. Try it. – Eric Postpischil Jun 14 '13 at 20:55
  • I will give it a try. Thanks for the information. Actually, that explains why one of the operations I was using worked in GCC but required using a intrinsic which I tried it in MSVC. –  Jun 14 '13 at 21:09
1

1e-5 is simply the maximum value the C standard allows an implementation to use for FLT_EPSILON. In practice, you'll be using IEEE-754 single-precision, which has an epsilon of 2-23, which is approximately 1e-7.

Oliver Charlesworth
  • 267,707
  • 33
  • 569
  • 680
  • 1
    I don't know who downvoted you but there must be a reason that many links report 1e-5 for float.h or cfloat.h while the value is actually close to 1e-7 and you're the only person that has given an answer to explain that. –  Jun 14 '13 at 18:58