
I just ported my "big float" implementation over to OpenCL, but it turns out that the GPU is slower than the CPU. That's quite surprising, as I thought Mandelbrot was an embarrassingly parallel problem.

GPU
Preview render completed. Time taken: 86.031792ms. Iterations: 32
Preview render completed. Time taken: 268.726288ms. Iterations: 96
Preview render completed. Time taken: 461.045258ms. Iterations: 160
Full render completed. Time taken: 5908.507812ms. Iterations: 160

CPU
Preview render completed. Time taken: 101.530037ms. Iterations: 32
Preview render completed. Time taken: 199.125870ms. Iterations: 96
Preview render completed. Time taken: 281.195496ms. Iterations: 160
Full render completed. Time taken: 3264.943115ms. Iterations: 160

Does anyone have experience with big floats on the GPU? Is it just too complicated to make efficient use of the GPU, and therefore better done on the CPU?

There doesn't seem to be a way to profile OpenCL on an M1 Mac, so I can't really answer that for myself :(

Details:

  • GPU uses 32-bit ints, while the CPU uses 64-bit ints. This means the GPU needed 8 limbs to make a 256-bit number, while the CPU only needed 4. I did this because OpenCL offers neither inline ASM for ADC nor 128-bit ints, while the CPU version can use both (see the sketch after this list).

  • It seems to be much easier to profile and optimise for the CPU at the moment. I wonder if that would be the better option.

  • I've confirmed that the kernel is the bottleneck, not the CPU-GPU transfer (1ms) or blitting the image using SDL (1ms).
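
For reference, here is roughly what the 64-bit-limb add looks like on the CPU side. This is a minimal sketch rather than my exact code; it assumes a compiler with unsigned __int128 support (GCC/Clang), which lowers the carry chain to ADC:

#include <stdint.h>

// Sketch only: 256-bit unsigned add with four 64-bit limbs. The compiler
// turns the __int128 carry chain into ADC, which OpenCL cannot express.
static void uadd256_cpu(uint64_t c[4], const uint64_t a[4], const uint64_t b[4])
{
    unsigned char carry = 0;
    for (int i = 3; i >= 0; i--)
    {
        unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
        carry = (unsigned char)(t >> 64); // 0 or 1: the sum fits in 65 bits.
        c[i] = (uint64_t)t;
    }
}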

Code for the GPU version:

// Assumed aliases for the OpenCL built-in unsigned types (the original
// prelude is not shown in this listing).
typedef uint CL_UINT;   // 32-bit
typedef ulong CL_ULONG; // 64-bit

enum sign
{
    SIGN_NEG,
    SIGN_ZERO,
    SIGN_POS
};

struct __attribute__ ((packed)) fp256
{
    enum sign sign;
    CL_UINT man[8];
};

struct __attribute__ ((packed)) fp512
{
    enum sign sign;
    CL_UINT man[16];
};
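
// Layout note: man[0] is the most significant limb and man[7] (man[15] for
// fp512) the least, since the carry chains below run from high index to low.
// Values are sign-magnitude fixed point; judging from fp_smul256, the binary
// point sits after the first limb (1 integer limb, the rest fraction).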

struct fp256 fp_uadd256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

struct fp256 fp_usub256(struct fp256 a, struct fp256 b)
{
    struct fp256 c;
    char carry = 0;
    for (int i = 7; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] - (CL_ULONG)b.man[i] - (CL_ULONG)carry;
        carry = (char)(temp >> 63); // Check if wrapped around.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

struct fp512 fp_uadd512(struct fp512 a, struct fp512 b)
{
    struct fp512 c;
    char carry = 0;
    for (int i = 15; i >= 0; i--)
    {
        CL_ULONG temp = (CL_ULONG)a.man[i] + (CL_ULONG)b.man[i] + (CL_ULONG)carry;
        carry = (char)(temp >> 32); // Note that the highest 31 bits of temp are all 0.
        c.man[i] = (CL_UINT)temp;
    }
    return c;
}

enum cmp
{
    CMP_SAME,
    CMP_A_BIG,
    CMP_B_BIG
};

enum cmp fp_ucmp256(struct fp256 a, struct fp256 b)
{
    struct fp256 c = fp_usub256(a, b);
    // man[0] is the most significant limb, so its top bit is the sign of the
    // 256-bit difference interpreted as two's complement.
    bool is_negative = (c.man[0] >> 31) == 1;
    if (is_negative)
        return CMP_B_BIG;
    else
    {
        // Check whether the difference is 0. If so, a and b are equal;
        // otherwise the difference must be positive, and thus a > b.
        for (int i = 0; i < 8; i++)
            if (c.man[i] != 0)
                return CMP_A_BIG;
        return CMP_SAME;
    }
}

// Signed add over the sign-magnitude representation: equal signs add the
// magnitudes; opposite signs subtract the smaller magnitude from the larger
// and keep the sign of the operand with the bigger magnitude.
struct fp256 fp_sadd256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO && b.sign == SIGN_ZERO)
        return a;
    if (b.sign == SIGN_ZERO)
        return a;
    if (a.sign == SIGN_ZERO)
        return b;
    if ((a.sign == SIGN_POS && b.sign == SIGN_POS) ||
        (a.sign == SIGN_NEG && b.sign == SIGN_NEG))
    {
        struct fp256 c = fp_uadd256(a, b);
        c.sign = a.sign;
        return c;
    }

    //assert((a.sign == SIGN_POS && b.sign == SIGN_NEG) ||
    //       (a.sign == SIGN_NEG && b.sign == SIGN_POS));

    enum cmp cmp = fp_ucmp256(a, b);
    if (cmp == CMP_SAME)
        return (struct fp256) { SIGN_ZERO, {0} };
    
    if (a.sign == SIGN_POS && b.sign == SIGN_NEG)
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_POS;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_NEG;
            return c;
        }
    }
    else
    {
        if (cmp == CMP_A_BIG)
        {
            struct fp256 c = fp_usub256(a, b);
            c.sign = SIGN_NEG;
            return c;
        }
        else
        {
            struct fp256 c = fp_usub256(b, a);
            c.sign = SIGN_POS;
            return c;
        }
    }
}

struct fp256 fp_sinv256(struct fp256 a)
{
    if (a.sign == SIGN_POS) a.sign = SIGN_NEG;
    else if (a.sign == SIGN_NEG) a.sign = SIGN_POS;
    return a;
}

struct fp256 fp_ssub256(struct fp256 a, struct fp256 b)
{
    return fp_sadd256(a, fp_sinv256(b));
}

struct fp256 fp_smul256(struct fp256 a, struct fp256 b)
{
    if (a.sign == SIGN_ZERO || b.sign == SIGN_ZERO)
        return (struct fp256) { SIGN_ZERO, {0} };

    enum sign sign;
    if (a.sign == SIGN_NEG && b.sign == SIGN_NEG)
        sign = SIGN_POS;
    else if (a.sign == SIGN_NEG || b.sign == SIGN_NEG)
        sign = SIGN_NEG;
    else
        sign = SIGN_POS;

    struct fp512 c = {0};
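    // Schoolbook multiplication: each of the 64 32x32 -> 64-bit partial
    // products is widened to a full fp512 and folded in with a 512-bit
    // carry chain, so these adds dominate the cost of the whole routine.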
    for (int i = 7; i >= 0; i--) // a
    {
        for (int j = 7; j >= 0; j--) // b
        {
            int low_offset = 15 - (7 - i) - (7 - j);
            //assert(low_offset >= 1);
            int high_offset = low_offset - 1;

            CL_ULONG mult = (CL_ULONG)a.man[i] * (CL_ULONG)b.man[j];
            struct fp512 temp = {0};
            temp.man[low_offset] = (CL_UINT)mult;
            temp.man[high_offset] = mult >> 32;

            c = fp_uadd512(c, temp);
        }
    }

    struct fp256 c256;
    c256.sign = sign;
    for (int i = 1; i <= 8; i++)
        c256.man[i - 1] = c.man[i];

    return c256;
}

struct fp256 fp_ssqr256(struct fp256 a)
{
    return fp_smul256(a, a);
}

// Shift the magnitude right by one bit (a divide-by-two); the sign field is
// untouched, which is what "arithmetic" shift means for sign-magnitude.
struct fp256 fp_asr256(struct fp256 a)
{
    for (int i = 7; i >= 1; i--)
    {
        a.man[i] >>= 1;
        a.man[i] |= (a.man[i - 1] & 0x1) << 31;
    }
    a.man[0] >>= 1;
    return a;
}

// Convert a small int to fp256, placing |a| in the most significant limb.
struct fp256 int_to_fp256(int a)
{
    if (a == 0)
        return (struct fp256){ SIGN_ZERO, {0} };
    
    struct fp256 b = {0};
    if (a < 0)
    {
        b.sign = SIGN_NEG;
        a = -a;
    }
    else
        b.sign = SIGN_POS;
    
    b.man[0] = (CL_UINT)a;
    return b;
}
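
For reference, here is a minimal sketch of how a per-pixel kernel can drive these routines. It is not the exact kernel from the gist linked in the comments below; the argument layout and the escape test are illustrative assumptions. cx0/cy0 are the coordinates of pixel (0, 0) and step is the per-pixel spacing, all in the fp256 format above.

// Sketch only: per-pixel Mandelbrot iteration over the fp256 routines above.
__kernel void mandelbrot(__global uint *iters, uint width, uint max_iter,
                         struct fp256 cx0, struct fp256 cy0, struct fp256 step)
{
    uint px = get_global_id(0);
    uint py = get_global_id(1);

    // c = top-left corner + pixel index * step, per axis.
    struct fp256 cx = fp_sadd256(cx0, fp_smul256(step, int_to_fp256((int)px)));
    struct fp256 cy = fp_sadd256(cy0, fp_smul256(step, int_to_fp256((int)py)));

    struct fp256 zx = int_to_fp256(0);
    struct fp256 zy = int_to_fp256(0);
    struct fp256 four = int_to_fp256(4);

    uint i;
    for (i = 0; i < max_iter; i++)
    {
        struct fp256 zx2 = fp_ssqr256(zx);
        struct fp256 zy2 = fp_ssqr256(zy);

        // Escape test: |z|^2 > 4. The magnitude is non-negative, so the
        // unsigned compare is safe here.
        struct fp256 mag = fp_sadd256(zx2, zy2);
        if (mag.sign == SIGN_POS && fp_ucmp256(mag, four) == CMP_A_BIG)
            break;

        struct fp256 xy = fp_smul256(zx, zy);
        zy = fp_sadd256(fp_sadd256(xy, xy), cy);   // zy' = 2*zx*zy + cy
        zx = fp_sadd256(fp_ssub256(zx2, zy2), cx); // zx' = zx^2 - zy^2 + cx
    }

    iters[py * width + px] = i;
}
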
Ignis Incendio
  • But what here is parallelizable? These are almost all shift and mask operations, which your CPU can do in 1 cycle (or less). Your GPUs can run in parallel, but they're not faster than your CPU. – Tim Roberts Dec 13 '21 at 04:59
  • Hi @TimRoberts, thanks for responding. From my tests on the CPU, a lot of the time was spent on mul operations. I might be missing something, but isn't fractal rendering very parallelizable per-pixel? – Ignis Incendio Dec 13 '21 at 06:08
  • Well, yes, but it's not automatic. You have to organize your code in a way that the compiler knows it can take advantage of the parallelism. Where did you get the Mandelbrot code from? – Tim Roberts Dec 13 '21 at 06:21
  • Hmm, I'm still quite confused, actually. I thought that I would just have to write the kernel such that it operates per-pixel. I wrote the Mandelbrot code myself. Here's the OpenCL kernel: https://gist.github.com/limdingwen/472c9af204df298d91aad328ffb8974f – Ignis Incendio Dec 13 '21 at 06:25
  • Well, maybe I'm fundamentally wrong. I'm not as good at OpenCL as I should be. GPUs are special-purpose processors -- they are very, very good at things within their problem sphere, like doing arithmetic on single floats. I just don't think the kind of ops you're doing are a natural fit. It may be that each pixel takes much longer on GPU than CPU, but even as I'm saying that I'm not sure I buy it. – Tim Roberts Dec 13 '21 at 06:38
  • Right, I kind of suspected that too; my code is honestly terrible with lots of branches, which I think GPUs might hate. I think I'll just go back to CPU and try to improve my algorithm in other ways... plus, I don't really feel like dealing with Mac's OpenCL tooling anymore :( Thanks for the help! – Ignis Incendio Dec 13 '21 at 06:41

1 Answer


That's quite surprising as I thought that Mandelbrot was an embarrassingly parallel problem.

It is, but the amount of work is not uniform across work-items. Some work-items complete in only 5 iterations while others require 100. Those that complete early simply idle until the last (200 iterations?) pixel in the same tile is computed.

To reclaim the idle cycles spent waiting on busy (high-iteration) pixels, you can do this (a sketch follows the list):

  • have 1 GPU work-item take an 8x8 tile of pixels (10k work-items for an 800x800 image)
  • "sample" the 4 corners of the tile using the GPU work-item running the kernel
  • if the sampled iteration values are relatively close to each other, spawn a mini-kernel from that work-item (using dynamic parallelism, i.e. device-side enqueue) and compute the 8x8 region with that mini-kernel on 64 work-items (GPU threads)
  • if the sampled iteration values are very different from each other, have the parent work-item compute the whole 8x8 region itself, with no idle cycles between pixel computations
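
A rough sketch of that corner test (illustrative only; iterate(px, py) is a hypothetical helper returning the iteration count for one pixel, and the threshold is arbitrary):

uint iterate(uint px, uint py); // hypothetical per-pixel Mandelbrot helper

__kernel void classify_tiles(__global uint *iters, uint width, uint max_iter)
{
    // Each work-item owns one 8x8 tile of pixels.
    uint tx = get_global_id(0) * 8;
    uint ty = get_global_id(1) * 8;

    uint c0 = iterate(tx,     ty);
    uint c1 = iterate(tx + 7, ty);
    uint c2 = iterate(tx,     ty + 7);
    uint c3 = iterate(tx + 7, ty + 7);

    uint lo = min(min(c0, c1), min(c2, c3));
    uint hi = max(max(c0, c1), max(c2, c3));

    if (hi - lo < max_iter / 16)
    {
        // "Steady" tile: corner counts agree, so the 64 pixels will likely
        // finish in similar time. Enqueue a child kernel over the 8x8 region
        // here (OpenCL 2.x device-side enqueue) or mark it for a uniform pass.
    }
    else
    {
        // "Divergent" tile: this work-item walks all 64 pixels itself, so no
        // sibling threads sit idle waiting on the slow ones.
        for (uint y = 0; y < 8; y++)
            for (uint x = 0; x < 8; x++)
                iters[(ty + y) * width + (tx + x)] = iterate(tx + x, ty + y);
    }
}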

Another option could be sending those "divergent" tiles to the CPU and computing only the "steady" tiles on the GPU. This way the best of both worlds can be combined while wasting relatively few GPU cycles. Or you can sort all tiles by their estimated average iteration values and run the highest ones only on the GPU and the lowest ones only on the CPU.

Sampling the corners of an 8x8 tile takes roughly 1/16 of its total compute effort (4 of 64 pixels), so this relatively cheap preprocessing lets you distribute the work fairly.

huseyin tugrul buyukisik