0

I need to represent floating-point numbers of very small orders (for example, 0.6745 × 2-3000) in C. It is necessary that such support was platform independent (works on both the CPU and GPU-CUDA). The big length of the significand is not required.

I can not use high-precision libraries (GMP, MPFR, etc.) because they do not work on the GPU. On the other hand CUDA does not support the long double type. Is there any solution? Is it possible to somehow implement a custom floating-point type?

phuclv
  • 37,963
  • 15
  • 156
  • 475
Konstantin Isupov
  • 199
  • 1
  • 2
  • 12

3 Answers3

1

You could work in log-space, that is represent each number as ex where x is your standard floating point type:

  • addition/subtraction (and summation more generally) can be performed using the log-sum-exp trick, i.e.

    • ex+ey = ex (1+ey-x) = ex + log(1+exp(y-x))
  • multiplication/division become addition/subtraction

    • ex × ex = ex+y
  • raising to a power is fairly straightforward

    • (ex)^(ex) = ex exp(y)
Simon Byrne
  • 7,694
  • 1
  • 26
  • 50
0

I wrote simple solution (using this work):

#include <math.h>
#include <stdint.h>

#define DOUBLE_PRECISION 53

/*  DOUBLE PRECISION FLOATING-POINT TYPE WITH EXTENDED EXPONENT     */

typedef struct Real {
    double sig;     //significand
    long exp;       //binary exponent
} real;

/*  UNION FOR DIVISION DOUBLE BY 2^POW                              */

union DubleIntUnion
{
    double      dvalue;
    uint64_t    ivalue;
};


/*  PLACE SIGNIFICAND OF REAL NUMBER IN RANGE [1, 2)            */

inline real adjust(real x){
    real y;
    y.exp = x.exp;
    y.sig = x.sig;
    if(y.sig == 0){
        y.exp = 0;
    } else if (fabs(y.sig) >= 2.0){
        y.exp = y.exp + 1;
        y.sig = y.sig / 2;
    } else if(fabs(y.sig) < 1){
        y.exp = y.exp - 1;
        y.sig = y.sig * 2;
    }
    return y;
}

/*  PLACE SIGNIFICAND OF REAL NUMBER IN RANGE [1, 2) FOR TINY NUMBER    */
/*  FOR EXAMPLE, AFTER SUBTRATION OR WHEN SET REAL FROM DOUBLE          */

inline real adjusttiny(real x){
    real y;
    y.exp = x.exp;
    y.sig = x.sig;
    while(1){
        x.exp = y.exp;
        x.sig = y.sig;
        y = adjust(x);
        if(x.exp == y.exp && x.sig == y.sig)
            break;
    }
    return y;
}

real set(double x){
    real y;
    real z;
    y.sig = x;
    y.exp = 0;
    return adjusttiny(y);
};

real set(real x){
    real y;
    y.exp = x.exp;
    y.sig = x.sig;
    return y;
};

/*  ARITHMETIC OPERATIONS   */

//divide x by 2^pow. Assert that x.exp - pow > e_min
inline double div2pow(const double x, const int pow)
{
    DubleIntUnion diu;
    diu.dvalue = x;
    diu.ivalue -= (uint64_t)pow << 52;      // subtract pow from exponent
    return diu.dvalue;
}

//summation
inline real sum(real x, real y){            
    real sum;
    int dexp = abs(x.exp - y.exp);

    if (x.exp > y.exp){
        sum.exp = x.exp;
        if(dexp <= DOUBLE_PRECISION){           
            sum.sig = div2pow(y.sig, dexp);     // divide y by 2^(x.exp - y.exp)
            sum.sig = sum.sig + x.sig;
        } else sum.sig = x.sig;
    } else if (y.exp > x.exp){
        sum.exp = y.exp;
        if(dexp <= DOUBLE_PRECISION){           
            sum.sig = div2pow(x.sig, dexp);     // divide x by 2^(y.exp - x.exp)
            sum.sig = sum.sig + y.sig;
        } else
            sum.sig = y.sig;
    } else {
        sum.exp = x.exp;
        sum.sig = x.sig + y.sig;
    }
    return adjust(sum);
}

//subtraction
inline real sub(real x, real y){            
    real sub;
    int dexp = abs(x.exp - y.exp);

    if (x.exp > y.exp){
        sub.exp = x.exp;
        if(dexp <= DOUBLE_PRECISION){           
            sub.sig = div2pow(y.sig, dexp); // divide y by 2^(x.exp - y.exp)
            sub.sig = x.sig - sub.sig;
        } else sub.sig = x.sig;
    } else if (y.exp > x.exp){
        sub.exp = y.exp;
        if(dexp <= DOUBLE_PRECISION){           
            sub.sig = div2pow(x.sig, dexp); // divide x by 2^(y.exp - x.exp)
            sub.sig = sub.sig - y.sig; 
        } else sub.sig = -y.sig;
    } else {
        sub.exp = x.exp;
        sub.sig = x.sig - y.sig;
    }
    return adjusttiny(sub);
}

//multiplication
inline real mul(real x, real y){            
    real product;
    product.exp = x.exp + y.exp;
    product.sig = x.sig * y.sig;
    return adjust(product);
}

//division
inline real div(real x, real y){            
    real quotient;
    quotient.exp = x.exp - y.exp;
    quotient.sig = x.sig / y.sig;
    return adjust(quotient);
}

At first glance is working correctly. Maybe I missed something or that implementation can be accelerated?

How I can implement functions floor and ceil on such numbers?

Konstantin Isupov
  • 199
  • 1
  • 2
  • 12
  • Because this code is C++, `abs()` is overloaded and therefore this works, as C++. But the post is tagged `C` and `abs()` works for integers and so this code fails the math portion. Suggest posting answers using the programming language of the post. – chux - Reinstate Monica Mar 19 '15 at 14:55
  • On the math side, 1) IMO, do not see the value of using the range `(1/2, 2)` rather than a single binary range `[1, 2)`. Using a fixed range (when `exp > LONG_MIN`) would simplify various operations. 2) There are lots of problems with code like `1 << x.exp - y.exp` with is UB should the difference exceed `long` bit width and 3) code like `sum.sig = sum.sig + y.sig;` with does not detect overflow. – chux - Reinstate Monica Mar 19 '15 at 15:07
  • @chux thank you. I'll try to take into account your comments and corrected implementation – Konstantin Isupov Mar 20 '15 at 08:01
  • if you're storing a separate exponent, then why don't just store the significand as an integer? double has its own exponent and adjusting the external exponent one more time seem to be inefficient – phuclv Mar 20 '15 at 08:45
  • @LưuVĩnhPhúc if store significand as an integer, I'm afraid there will be problems with rounding – Konstantin Isupov Mar 20 '15 at 08:58
  • Then round it yourself. There will be less problem handling 2 different exponents at a time – phuclv Mar 20 '15 at 09:02
0

If you need very big exponents then symmetric level-index arithmetic may fit your needs. However the accuracy is harder to predict, therefore you may need more precision for the LI (level-index) value to compensate. One common way to improve precision is double-double arithmetic which is also used much on CUDA.

There are also numerous multiprecision libraries on CUDA like CUMP

Some more information:

phuclv
  • 37,963
  • 15
  • 156
  • 475