7

After reading quite a bit about fixed-point arithmetic I think I can say I've understood the basics, unfortunately I don't know yet how to convert routines that use sin/cos/sqrt or any other fp function.

Consider this simple mcve:

#include <math.h>
#include <stdio.h>
#include <ctime>
#include <fstream>
#include <iostream>

typedef char S8;
typedef short S16;
typedef int S32;
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef float F32;
typedef double F64;

// -------- Fixed point helpers QM.N(32bits) --------
typedef S32 FP32;

#define LUT_SIZE_BITS 9  // 0xffffffff>>(32-9)=511; 32-23=9; 2^9=512
#define LUT_SIZE 512

#define FRACT_BITS 28        // Number fractional bits
#define M (1 << FRACT_BITS)  // Scaling factor

inline F32 Q2F(FP32 X) { return ((F32)X / (F32)(M)); }
inline FP32 F2Q(F32 X) { return (FP32)(X * (M)); }

const F32 PI = 3.141592653589793f;
const F32 pi = 3.141592653589793f;
const U32 WIDTH = 256;
const U32 HEIGHT = 256;

FP32 cos_table[LUT_SIZE];
FP32 sin_table[LUT_SIZE];

void init_luts() {
    const F32 deg_to_rad = PI / 180.f;
    const F32 sample_to_deg = 1 / LUT_SIZE * 360.f;
    for (S32 i = 0; i < LUT_SIZE; i++) {
        F32 rad = ((F32)i * sample_to_deg) * deg_to_rad;
        F32 c = cos(rad);
        F32 s = sin(rad);
        cos_table[i] = F2Q(c);
        sin_table[i] = F2Q(s);
    }
}

// -------- Image processing --------
U8 clamp(F32 valor) { return valor > 255 ? 255 : (valor < 0 ? 0 : (U8)valor); }

struct Pbits {
    U32 width;
    U32 height;
    U8 *data;

    Pbits(U32 width, U32 height, U8 *data) {
        this->width = width;
        this->height = height;
        this->data = data;
    }

    Pbits(Pbits *src) {
        this->width = src->width;
        this->height = src->height;
        this->data = new U8[src->width * src->height * 3];
        memcpy(this->data, src->data, width * height * 3);
    }

    ~Pbits() { delete this->data; }

    void to_bgr() {
        U8 r, g, b;
        for (S32 y = 0; y < height; y++) {
            for (S32 x = 0; x < width; x++) {
                get_pixel(y, x, r, g, b);
                set_pixel(y, x, b, g, r);
            }
        }
    }

    void get_pixel(U32 y, U32 x, U8 &r, U8 &g, U8 &b) {
        U32 offset = (y * height * 3) + (x * 3);
        r = this->data[offset + 0];
        g = this->data[offset + 1];
        b = this->data[offset + 2];
    }

    void set_pixel(U32 y, U32 x, U8 c1, U8 c2, U8 c3) {
        U32 offset = (y * height * 3) + (x * 3);

        data[offset] = c1;
        data[offset + 1] = c2;
        data[offset + 2] = c3;
    }
};

void fx1_plasma(Pbits *dst, F32 t, F32 k1, F32 k2, F32 k3, F32 k4, F32 k5, F32 k6) {
    U32 height = dst->height;
    U32 width = dst->width;

    for (U32 y = 0; y < height; y++) {
        F32 uv_y = (F32)y / height;
        for (U32 x = 0; x < width; x++) {
            F32 uv_x = (F32)x / width;

            F32 v1 = sin(uv_x * k1 + t);
            F32 v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);
            F32 cx = uv_x + sin(t / k1) * k1;
            F32 cy = uv_y + sin(t / k2) * k1;
            F32 v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            F32 vf = v1 + v2 + v3;

            U8 r = (U8)clamp(255 * cos(vf * pi));
            U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));
            U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));

            dst->set_pixel(y, x, r, g, b);
        }
    }
}

// -------- Image helpers --------
inline void _write_s32(U8 *dst, S32 offset, S32 v) {
    dst[offset] = (U8)(v);
    dst[offset + 1] = (U8)(v >> 8);
    dst[offset + 2] = (U8)(v >> 16);
    dst[offset + 3] = (U8)(v >> 24);
}

void write_bmp(Pbits *src, S8 *filename) {
    Pbits *dst = new Pbits(src);
    dst->to_bgr();

    S32 w = dst->width;
    S32 h = dst->height;
    U8 *img = dst->data;

    S32 filesize = 54 + 3 * w * h;

    U8 bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
    U8 bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
    U8 bmppad[3] = {0, 0, 0};

    _write_s32(bmpfileheader, 2, filesize);
    _write_s32(bmpinfoheader, 4, w);
    _write_s32(bmpinfoheader, 8, h);

    FILE *f = fopen(filename, "wb");
    fwrite(bmpfileheader, 1, 14, f);
    fwrite(bmpinfoheader, 1, 40, f);
    for (S32 i = 0; i < h; i++) {
        fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
        fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
    }

    delete dst;
}

void write_ppm(Pbits *dst, S8 *filename) {
    std::ofstream file(filename, std::ofstream::trunc);
    if (!file.is_open()) {
        std::cout << "yep! file is not open" << std::endl;
    }
    file << "P3\n" << dst->width << " " << dst->height << "\n255\n";

    U8 r, g, b, a;
    for (U32 y = 0; y < dst->height; y++) {
        for (U32 x = 0; x < dst->width; x++) {
            dst->get_pixel(y, x, r, g, b);
            file << (S32)r << " " << (S32)g << " " << (S32)b << "\n";
        }
    }
}

S32 main() {
    Pbits *dst = new Pbits(WIDTH, HEIGHT, new U8[WIDTH * HEIGHT * 3]);

    init_luts();

    clock_t begin = clock();
    fx1_plasma(dst, 0, 8, 36, 54, 51, 48, 4);
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

    std::cout << "Generated plasma in " << elapsed_secs << "s" << std::endl;

    write_ppm(dst, "plasma.ppm");
    write_bmp(dst, "plasma.bmp");

    delete dst;
}

This code will generate this image:

enter image description here

QUESTION: How would you convert this floating point algorithm into a fast fixed-point one? Right now the basics of the floating point arithmetic is +/- clear to me, as in:

fa,fb=floating point values; a,b=fixed_point ones; M=scaling factor

fa = a*M
fb = b*M
fa+fb = (a+b)*M
fa-fb = (a-b)*M
fa*fb = (a*b)*M^2
fa/fb = (a/b)

but how to use sin/cos/sqrt et al in fixed-point is still eluding me. I've found this related thread but I still don't understand how to use the trigonometric luts with random fp values.

Tas
  • 7,023
  • 3
  • 36
  • 51
BPL
  • 9,632
  • 9
  • 59
  • 117
  • Which language are you using? Your _simple mcve_ is C++, but your table generation code is C. The possible answers vary greatly between the two languages. – 1201ProgramAlarm Aug 03 '19 at 04:40
  • As you can see I've tagged the thread with both c/c++, a good answer on any of these languages would be fine. – BPL Aug 03 '19 at 09:02
  • The code you show has `FRACT_BITS == 28`, but the `cos_table` values look to have been generated with `FRACT_BITS == 16`, so there's definitely something wrong. cos(0.0) == 1.0, so cos_table[0] should be 268435456. – Chris Dodd Aug 05 '19 at 05:45
  • @ChrisDodd Mmm, I'll check it out... btw, as you can see the lookup table generator functions are not using the macro specifying the QM.N format, I'll try to fix that and edit the thread, the idea is you'd be able to change those macros and everything working fine. Actually, I think I'll generate the lookup table at runtime till I've got the desirable output – BPL Aug 05 '19 at 07:35
  • I've refactored a bit the code, now it's become a little standalone single file, added a simple bpm dumper so people will find it more attractive to play with and now the LUT tables are macro-dependant and generated at runtime – BPL Aug 05 '19 at 08:45
  • It's strange that you ask for fixed point advice, but then settle for the floating point answer. It seems that your question should have been: "How do I speed this up and would fixed point help?" – Michael Dorgan Aug 06 '19 at 23:47
  • Michael Dorgan: Mmm, that's actually a good point :/. Reason why I've initially asked about fp to fixed-point conversion is because I've been told that'd be one "standard" way to speed up naive fp image processing algorithms so I was curious about learning more about this old way. At first I'd considered asking for general speed optimization but that'd derive on possible answers going down to simmd/gpu/asm etc... That said, I'm still curious if a fixed-point version will beat the nice Ismail's answer. In any case, this thread is producing already nice content, love it! – BPL Aug 07 '19 at 00:37

4 Answers4

3

The basic idea for a lookup table is simple -- you use the fixed point value as an index into an array to look up the value. The problem is if your fixed point values are large, your tables become huge. For a full table with a 32-bit FP type you need 4*232 bytes (16GB) which is impractically large. So what you generally do is use a smaller table (smaller by a factor of N) and the linearly interpolate between two values in the table to do the lookup.

In your case, you appear to want to use a 223 reduction so you need a table with just 513 elements. To do the lookup, you then use the upper 9 bits as an index into the table and use the lower 23 bits to interpolate. eg:

FP32 cos_table[513] = { 268435456, ...
FP32 cosFP32(FP32 x) {
    int i = x >> 23;  // upper 9 bits to index the table
    int fract = x & 0x7fffff;  // lower 23 bits to interpolate
    return ((int64_t)cos_table[i] * ((1 << 23) - fract) + (int64_t)cos_table[i+1] * fract + (1 << 22)) >> 23;
}

Note that we have to do the multiplies in 64 bits to avoid overflows, same as any other multiplies of FP32 values.

Since cos is symmetric, you could use that symmetry to reduce the table size by another factor of 4, and use the same table for sin, but that is more work.


If your're using C++, you can define a class with overloading to encapsulate your fixed point type:

class fixed4_28 {
    int32_t  val;
    static const int64_t fract_val = 1 << 28;
 public:
    fixed4_28 operator+(fixed4_28 a) const { a.val = val + a.val; return a; }
    fixed4_28 operator-(fixed4_28 a) const { a.val = val - a.val; return a; }
    fixed4_28 operator*(fixed4_28 a) const { a.val = ((int64_t)val * a.val) >> 28; return a; }
    fixed4_28 operator/(fixed4_28 a) const { a.val = ((int64_t)val << 28) / a.val; return a; }

    fixed4_28(double v) : val(v * fract_val + 0.5) {}
    operator double() { return (double)val / fract_val; }

    friend fixed4_28 cos(fixed_4_28);
};

inline fixed4_28 cos(fixed4_28 x) {
    int i = x.val >> 23;  // upper 9 bits to index the table
    int fract = x.val & 0x7fffff;  // lower 23 bits to interpolate
    x.val = ((int64_t)cos_table[i] * ((1 << 23) - fract) + (int64_t)cos_table[i+1] * fract + (1 << 22)) >> 23;
    return x;
}

and then your code can use this type directly and you can write equations just as if you were using float or double

Chris Dodd
  • 119,907
  • 13
  • 134
  • 226
  • Thanks, the provided code is a really simple way to encapsulate fixed point datatypes without using unnecesary templates (ie: [https://github.com/MikeLankamp/fpm/blob/master/include/fpm/fixed.h](https://github.com/MikeLankamp/fpm/blob/master/include/fpm/fixed.h) ), so I like that. Right now @ismail has been able to optimize the original slow routine by a 6x factor (still in fp space though), so unless somebody is able to come up with a faster routine than him I'll consider his answer the best one. – BPL Aug 06 '19 at 11:02
  • @BPL, what's wrong with a template solution? Also, `fpm` provides a non-templated convenience type that's 8.24. It's easy enough to define a 4.28 version in your code (Disclaimer: I'm the author of https://github.com/MikeLankamp/fpm ) – Mike.nl Oct 20 '19 at 15:27
  • @Mike.nl Hey, don't get me wrong, there isn't anything wrong about using templates at all, period. When I asked this question a while ago one of the main targets was using it eventually with some cpu texture generator that could be used in some [64k intro](https://en.wikipedia.org/wiki/64K_intro) and certainly using templates would add some extra unnecesary bytes to the final executable/library. That said, for general purpose code nothing wrong with a templatized approach like your library :) – BPL Oct 20 '19 at 16:06
  • @BPL: fair enough. It'd be cool if my library would be used, so I'm interested in reasons it isn't. And we can have a discussion on templates and size of generated code, but this is not the place ;) Anyway, do keep it in mind if you ever change your mind :) – Mike.nl Oct 20 '19 at 17:10
3

For sin() and cos() the first step is range reduction, which looks like "angle = angle % degrees_in_a_circle". Sadly, these functions typically use radians, and radians are nasty because that range reduction becomes "angle = angle % (2 * PI)", which means that precision depends on the modulo of an irrational number (which is guaranteed to be "not fun").

With this in mind; you want to throw radians in the trash and invent a new "binary degrees" such that a circle is split into "powers of 2" pieces. This means that the range reduction becomes "angle = angle & MASK;" with no precision loss (and no expensive modulo). The rest of sin() and cos() (if you're using a table driven approach) is adequately described by existing answers so I won't repeat it in this answer.

The next step is to realize that "globally fixed point" is awful. Far better is what I'll call "moving point". To understand this, consider multiplication. For "globally fixed point" you might do "result_16_16 = (x_16_16 * y_16_16) >> 16" and throw away 16 bits of precision and have to worry about overflows. For "moving point" you might do "result_32_32 = x_16_16 * y_16_16" (where the decimal point is moved) and know that there is no precision loss, know that there can't be overflow, and make it faster by avoiding a shift.

For "moving point", you'd start with the actual requirements of inputs (e.g. for a number from 0.0 to 100.0 you might start with "7.4 fixed point" with 5 bits of a uint16_t unused) and explicitly manage precision and range throughput a calculation to arrive at a result that is guaranteed to be unaffected by overflow and has the best possible compromise between "number of bits" and precision at every step.

For example:

 uint16_t inputValue_7_4 = 50 << 4;                   // inputValue is actually 50.0
 uint16_t multiplier_1_1 = 3;                         // multiplier is actually 1.5
 uint16_t k_0_5 = 28;                                 // k is actually 0.875
 uint16_t divisor_2_5 = 123;                          // divisor is actually 3.84375

 uint16_t x_8_5 = inputValue_7_4 * multiplier_1_1;    // Guaranteed no overflow and no precision loss
 uint16_t y_9_5 = x_8_5 + k+0_5;                      // Guaranteed no overflow and no precision loss
 uint32_t result_9_23 = (y_9_5 << 23) / divisor_2_5;  // Guaranteed no overflow, max. possible precision kept

I'd like to do it as "mechanically" as possible

There is no reason why "moving point" can't be done purely mechanically, if you specify the characteristics of the inputs and provide a few other annotations (the desired precision of divisions, plus either any intentional precision losses or the total bits of results); given that the rules that determine the size of the result of any operation and where the point will be in that result are easily determined. However; I don't know of an existing tool that will do this mechanical conversion, so you'd have to invent your own language for "annotated expressions" and write your own tool that converts it into another language (e.g. C). It's likely to cost less developer time to just do the conversion by hand instead.

Brendan
  • 35,656
  • 2
  • 39
  • 66
3
/*
very very fast
float sqrt2(float);

(-1) ^ s* (1 + n * 2 ^ -23)* (2 ^ (x - 127)) float
sxxxxxxxxnnnnnnnnnnnnnnnnnnnnnnn  float f
000000000000sxxxxxxxxnnnnnnnnnnn  int indis  20 bit
*/

#define LUT_SIZE2 0x000fffff   //1Mb  20 bit
float sqrt_tab[LUT_SIZE2];
#define sqrt2(f)     sqrt_tab[*(int*)&f>>12]  //float to int


int main()
{
    //init_luts();
    for (int i = 0; i < LUT_SIZE2; i++)
    {
        int ii = i << 12;        //i to float 
        sqrt_tab[i] = sqrt(*(float*)& ii);
    }

    float f=1234.5678;
    printf("test\n");
    printf(" sqrt(1234.5678)=%12.6f\n", sqrt(f));
    printf("sqrt2(1234.5678)=%12.6f\n", sqrt2(f));


    printf("\n\ntest mili second\n");
    int begin;
    int free;

    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        ;
    free = clock() - begin;
    printf("free        %4d\n", free);

    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        sqrt(f);
    printf("sqrt()      %4d\n", clock() - begin - free);


    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        sqrt2(f);
    printf("sqrt2()     %4d\n", clock() - begin - free);


    return 0;

}

/*
 sgrt(1234.5678)   35.136416
sgrt2(1234.5678)  35.135452

test mili second
free       73
sqrt()    146
sqrt2()    7
*/
  • Good idea! this is cool stuff, I'll test it out tomorrow with the optimized version, tyvm! What about the visual differences between the slow version and fast one, did you find out what was producing those? Btw, one hint, it's not necessary to create new answers for related content, as this could have been an "appendix" to your original answer ;) – BPL Aug 08 '19 at 00:31
2
#ifdef _MSC_VER
#pragma comment(lib,"opengl32.lib")
#pragma comment(lib,"glu32.lib")
#pragma warning(disable : 4996)
#pragma warning(disable : 26495)  //varsayılan değer atamıyorum
#pragma warning(disable :6031)//dönüş değeri yok sayıldı
#endif

#include <Windows.h>
#include <gl/gl.h>      //-lopengl32 
//#include <gl/glu.h>   //-lglu32

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

typedef unsigned char   U8;
typedef unsigned int    U32;

#define LUT_SIZE 1024           //1Kb  10 bit sin cos
#define LUT_SIZE2 0x000fffff    //1Mb  20 bit sqrt

float cos_tab[LUT_SIZE];
float sin_tab[LUT_SIZE];
U8    clamp_cos_tab[LUT_SIZE];
U8    clamp_sin_tab[LUT_SIZE];
float sqrt_tab[LUT_SIZE2];


const float pi = 3.141592;
const float pi_k = LUT_SIZE / (2 * pi);

const U32 WIDTH = 640;  //256
const U32 HEIGHT = 480; //256



struct Pbits;
Pbits* pdst;


U8 clamp(float f) { return f > 255 ? 255 : (f < 0 ? 0 : (U8)f); }

#define sin2(f)      sin_tab        [ (int)(pi_k * (f)) & 0x000003ff]//LUT_SIZE-1
#define cos2(f)      cos_tab        [ (int)(pi_k * (f)) & 0x000003ff]
#define clamp_sin(f) clamp_sin_tab  [ (int)(pi_k * (f)) & 0x000003ff]
#define clamp_cos(f) clamp_cos_tab  [ (int)(pi_k * (f)) & 0x000003ff]
#define sqrt2(f)     sqrt_tab       [*(int*)&(f)>>12]   //32-20 bit

void init_luts()
{
    for (int i = 0; i < LUT_SIZE; i++)
    {
        cos_tab[i] = cos(i / pi_k);
        sin_tab[i] = sin(i / pi_k);

        clamp_cos_tab[i] = clamp(255 * cos(i / pi_k));
        clamp_sin_tab[i] = clamp(255 * sin(i / pi_k));
    }

    for (int i = 0; i < LUT_SIZE2; i++)//init_luts
    {
        int ii=i<<12;       //32-20 bit
        float f = *(float *)&ii;    //i to float
        sqrt_tab[i] = sqrt(f);
    }

}



float sqrt3(float x)
{
    //https ://www.codeproject.com/Articles/69941/Best-Square-Root-Method-Algorithm-Function-Precisi
    unsigned int i = *(unsigned int*)& x;

    i += (127 << 23);
    i >>= 1;
    return *(float*)&i;
}
float sqrt4(float x)
{
    //https: //stackoverflow.com/questions/1349542/john-carmacks-unusual-fast-inverse-square-root-quake-iii
    float xhalf = 0.5f * x;
    int i = *(int*)&x;              // get bits for floating value
    i = 0x5f375a86 - (i >> 1);      // gives initial guess y0
    x = *(float*)&i;                // convert bits back to float
    x = x * (1.5f - xhalf * x * x); // Newton step, repeating increases accuracy
    return x;
}



struct Pbits
{
    int width;
    int height;
    U8* data;

    Pbits(int _width, int _height, U8* _data = 0)
    {
        width = _width;
        height = _height;
        if (!_data)
            _data = (U8*)calloc(width * height * 3, 1);
        data = _data;
    }

    ~Pbits() { free(data); }
    void set_pixel(int y, int x, U8 c1, U8 c2, U8 c3)
    {
        int offset = (y * width * 3) + (x * 3);

        data[offset] = c1;
        data[offset + 1] = c2;
        data[offset + 2] = c3;
    }
    void save(const char* filename)
    {

        U8 pp[54] = { 'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0 ,
                         40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0 };
        *(int*)(pp + 2) = 54 + 3 * width * height;
        *(int*)(pp + 18) = width;
        *(int*)(pp + 22) = height;

        int size = height * width * 3;
        U8* p = data;
        for (int i = 0; i < size; i += 3)//to_bgr()
        {
            U8 tmp = p[i];
            p[i] = p[i + 2];
            p[i + 2] = tmp;
        }

        FILE* f = fopen(filename, "wb");
        fwrite(pp, 1, 54, f);
        fwrite(data, size, 1, f);
        fclose(f);

        for (int i = 0; i < size; i += 3)//to_rgb()
        {
            U8 tmp = p[i];
            p[i] = p[i + 2];
            p[i + 2] = tmp;
        }

    }

};

void fn_plasma_slow(Pbits& dst, float t,
    float k1, float k2, float k3, float k4, float k5, float k6)
{
    int height = dst.height;
    int width = dst.width;

    for (int y = 0; y < height; y++)
    {
        float uv_y = (float)y / height;
        for (int x = 0; x < width; x++)
        {
            float uv_x = (float)x / width;

            float v1 = sin(uv_x * k1 + t);
            float v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);
            float cx = uv_x + sin(t / k1) * k1;
            float cy = uv_y + sin(t / k2) * k1;
            float v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            float vf = v1 + v2 + v3;

            U8 r = (U8)clamp(255 * cos(vf * pi));
            U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));
            U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));

            dst.set_pixel(y, x, r, g, b);
        }
    }
}


void fn_plasma_fast(Pbits& dst, float t,
    float k1, float k2, float k3,
    float k4, float k5, float k6)
{

    U8* p = dst.data;

    float
        height = dst.height,
        width = dst.width,

        _k42 = pi * k4 / k2,
        _k52 = pi * k5 / k2,
        _cx = sin2(t / k1) * k1,
        _cy = sin2(t / k2) * k1,
        _x = sin2(t),
        _y = cos2(t / k2);

    for (float j = 0; j < height; j++)
        for (float i = 0; i < width; i++)
        {
            float
                x = i / width,
                y = j / height,

                v1 = sin2(k1 * x + t),
                v2 = sin2(k1 * (x * _x + y * _y) + t),

                cx = x + _cx,
                cy = y + _cy,
                aa1 = k3 * (cx * cx + cy * cy),

                v3 = sin2(sqrt2(aa1) + t),
                vf = pi * (v1 + v2 + v3);

            *p++ = clamp_cos(vf);           //red
            *p++ = clamp_sin(vf + _k42);    //green
            *p++ = clamp_cos(vf + _k52);    //blue

        }
}



void fn_plasma_fast2(Pbits& dst, float t,
    float k1, float k2, float k3,
    float k4, float k5, float k6)
{

    U8* p = dst.data;

    static float data_v1[1024];
    static float data_cx[1024];
    static float data_cy[1024];
    static float data_xx3[1024];
    static float data_yy3[1024];

    float
        height = dst.height,
        width = dst.width,

        _k42 = pi * k4 / k2,
        _k52 = pi * k5 / k2,
        _cx = sin2(t / k1) * k1,
        _cy = sin2(t / k2) * k1,
        _x = sin2(t)/width*k1 ,
        _y = cos2(t/k2)/height*k1;


    for (int x = 0; x < width; x++)
    {
        data_v1[x] = sin2(k1 * x /width+ t);

        float f = x / width + _cx;
        data_cx[x] =k3* f*f;
        data_xx3[x] = x * _x;
    }
    for (int y = 0; y < height; y++)
    {
        float f = y / height + _cy;
        data_cy[y] = k3*f * f;
        data_yy3[y] = y*_y ;
    };


    for (int y = 0; y < height; y++)
    for (int x = 0; x < width; x++)
    {

        //float v1 = data_v1[x];
        //float v2 = sin2(data_xx3[x] + data_yy3[y]);

        float aa1 = data_cx[x] + data_cy[y];

        //float     v3 = sin2(sqrt2(aa1) + t);
        //float     vf = pi * (v1 + v2 + v3);
        float vf = pi * (data_v1[x]+ sin2(data_xx3[x] + data_yy3[y])+ sin2(sqrt2(aa1) + t));

        *p++ = clamp_cos(vf);           //red
        *p++ = clamp_sin(vf + _k42);    //green
        *p++ = clamp_cos(vf + _k52);    //blue

    }
}




struct window
{
    int  x, y, width, height;       //iç x y +en boy

    HINSTANCE   hist;       //  program kaydı
    HWND        hwnd;       //  window
    HDC         hdc;        //  device context 
    HGLRC       hrc;        //  opengl context 
    //WNDPROC       fn_pencere; //  pencere fonksiyonu
    WNDCLASS    wc;         //  pencere sınıfı
    PIXELFORMATDESCRIPTOR pfd;

    window(int _width = 256, int _height = 256)
    {
        memset(this, 0, sizeof(*this));
        x = 100;
        y = 100;
        width = _width;
        height = _height;

        //HINSTANCE
        hist = GetModuleHandle(NULL);

        //WNDCLASS
        wc.lpfnWndProc = (WNDPROC)fn_window;
        wc.hInstance = hist;
        wc.hIcon = LoadIcon(0, IDI_WINLOGO);
        wc.hCursor = LoadCursor(0, IDC_ARROW);
        wc.lpszClassName = "opengl";
        RegisterClass(&wc);

        //HWND
        hwnd = CreateWindow("opengl", "test",
            WS_OVERLAPPEDWINDOW,
            x, y, width + 16, height + 39,
            NULL, NULL, hist, NULL);
        //HDC
        hdc = GetDC(hwnd);


        //PFD
        pfd.nSize = sizeof(pfd);
        pfd.nVersion = 1;
        pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
        pfd.iPixelType = PFD_TYPE_RGBA;
        pfd.cColorBits = 32;

        int  pf = ChoosePixelFormat(hdc, &pfd);
        SetPixelFormat(hdc, pf, &pfd);
        DescribePixelFormat(hdc, pf, sizeof(PIXELFORMATDESCRIPTOR), &pfd);

        //HRC
        hrc = wglCreateContext(hdc);
        wglMakeCurrent(hdc, hrc);

        ShowWindow(hwnd, SW_SHOW);
        SetFocus(hwnd);


    }
    ~window()
    {
        if (hrc)
            wglMakeCurrent(NULL, NULL),
            wglDeleteContext(hrc);
        if (hdc)    ReleaseDC(hwnd, hdc);
        if (hwnd)   DestroyWindow(hwnd);
        if (hist)   UnregisterClass("opengl", hist);
    }

    void run()
    {
        MSG         msg;
        BOOL dongu = true;

        while (dongu)
        {
            if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE))
            {
                if (msg.message == WM_QUIT) dongu = 0;
                else
                {
                    TranslateMessage(&msg);
                    DispatchMessage(&msg);
                }
            }
            else
            {
                render();
                SwapBuffers(hdc);
            }
        }
    }

    static int __stdcall fn_window(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
    {
        switch (msg)
        {
            //case WM_CREATE:   {}  break;
            //case WM_COMMAND:  {}  break;
            //case WM_PAINT:    {}  break;
        case WM_CLOSE: {    DestroyWindow(hwnd); }break;
        case WM_DESTROY: {PostQuitMessage(0); }break;
        }
        return DefWindowProc(hwnd, msg, wParam, lParam);
    }
    static void render()
    {
        //OPENGL 1.0
        //glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);   
        //glMatrixMode(GL_PROJECTION);  glLoadIdentity();
        //glMatrixMode(GL_MODELVIEW);   glLoadIdentity();

        static float t; t += 0.02;
        fn_plasma_fast2(*pdst, t, 8, 36, 54, 51, 48, 4);//FAST

        glRasterPos3f(-1, -1, 0);
        glDrawPixels(WIDTH, HEIGHT, GL_RGB, GL_UNSIGNED_BYTE, pdst->data);

    }
};



int main()
{


    Pbits dst(WIDTH, HEIGHT);
    pdst = &dst;
    init_luts();


    int begin;

    begin = clock();
    fn_plasma_slow(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_slow:  %4d\n", clock() - begin );
    dst.save("plasma_slow.bmp");

    begin = clock();
    fn_plasma_fast(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_fast: %4d\n", clock() - begin);
    dst.save("plasma_fast.bmp");


    begin = clock();
    fn_plasma_fast2(dst, 0, 8, 36, 54, 51, 48, 4);
    printf("fn_plasma_fast2: %4d\n", clock() - begin );
    dst.save("plasma_fast2.bmp");



    window win(WIDTH, HEIGHT);
    win.run();


    return 0;
}
  • This is the type of answer I love, tyvm! Pros: 1) It's self-contained & including some nice refactorings 2) Your optimization is 6x with respect to the fp version 3) Technique used is really straightforward and easy to apply on other scenarios even if it's not a fixed-point technique. Cons: 1) [Result](https://dl.dropboxusercontent.com/s/kshyremzn4xdqkd/sublime_text_2019-08-06_12-36-09.png) won't be as smooth as the original version because your discretization 2) You're still calling sqrt in the inner loop 3) That inner loop can be optimized further – BPL Aug 06 '19 at 10:42
  • So, can you think of any solution for cons 1) & 2) . For bulletpoint1 I think it's just a matter of increasing the resolution of the LUTs but for bulletpoint2 I'm not so sure how I'd apply the same technique used in both sin&cos operations. That said, this is the best answer provided in this thread, I'll give you +1 and if no **better answer appears** in the next coming days I'll validate it & give the bounties to this one. Honestly, I love these type of answers, they're created for people who're really interested in the questions/subjects and not just giving cheap answers to earn the "prize". – BPL Aug 06 '19 at 10:47
  • Btw, I've also noticed for the same set of parameters the output of the slow & fast routines won't be visually the same, take a look [here](https://dl.dropboxusercontent.com/s/k7s541wdllnwach/sublime_text_2019-08-06_13-21-10.png) :( – BPL Aug 06 '19 at 11:22
  • 2
    A giant code dump does not make a good answer. Add some explanation of how this code answers the question, and what you've done to improve the performance. – 1201ProgramAlarm Aug 06 '19 at 14:13
  • my english is simple I set LUT_SIZE to 1024 function calls slow down time #define faster to use (because the code is long) (watch the movie in the opengl window) link ---> opengl32.lib – ismail akın çelik Aug 06 '19 at 23:43
  • @ismailakınçelik No worries about your English at all, mine is also simple... your code speaks by itself, tomorrow I'll test it out. Really nice answer!!! Keep it going ;) – BPL Aug 07 '19 at 00:39
  • Nice, your last version is a big improvement over the previous one, now we're getting 9.63x boost up with respect to the slow version. Also, interesting link the one about sqrt. I've measured all 14 different sqrt versions and averaged the results of fn_plasma_fast with each one of them, here's the [results](https://dl.dropboxusercontent.com/s/v2w8a3wbnejgbtu/2019-08-07_12-07-28.txt). Thos values are the average of computing 200 frames using fn_plasma_fast, as you can see there are almost no big gains even when using sqrt13-14. In that article there are some typos in sqrt8-sqrt12 names – BPL Aug 07 '19 at 10:10
  • That said, I think your version has become really nice right now... although there is one thing is still bothering me... while the visual quality of the algorithm now it's become smooth... why the outcome with the same set of parameters is different than the slow counterpart? Is this because floating point errors or have the math formulas changed drastically in the optimization? – BPL Aug 07 '19 at 10:12
  • new version 15 or 20 times faster sgrt2 function added 20 bit array and code optimization fn_plasma_fast2 – ismail akın çelik Aug 08 '19 at 04:39