2

I am working on optimizing my Mandelbrot renderer in pyOpenCL and want to split the iterations in chunks so i can better utilize my GPU.
Example with max iterations=1000 and 2 "chunks":
1. Run the mandelbrot escape algorithm for iterations 0-500.
2. Save the needed iterations for every point where iterations < 500 and run again for all other points with iterations from 500 - 1000.

The first loop works like expected but every chunk after that leads to wrong results. I'd really like to be more specific but i don't know where the real problem lies (starring at the code for > 2 days now).
I suspect something going wrong when copying the old x,y (real, imaginary) parts from the kernel, but i am out of ideas how to debug it.
I get the same results running on my GPU and CPU so i'd guess its nothing GPU specific.

Example image with iterations=2000 and 10 chunks:
enter image description here

this is almost only the first chunk (plus some few "wrong" pixel).
All done in one chunk (iterations=200 and 1 chunk):
enter image description here

And the expected result with iterations=2000 and chunks = 1:
enter image description here

import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)
    # build program
    code = """
    #if real_t == double
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter    
    ){
        uint id = get_global_id(0);         
        real_t2 my_coords = vload2(id, coords);           
        real_t x = my_coords.x;
        real_t y = my_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }        
    """
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))
    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)
    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
    buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes)
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))

    start_max_iter = 0
    to_do = coord_map.size / 2
    steps_size = int(max_iter / float(iter_steps))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size )
        print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)

        # copy x/y pairs to device 
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()        
        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do,), None,
            buffer_in_cl,                   
            buffer_out_cl,
            buffer_out_coords_cl,
            np.uint32(end_max_iter),
            np.uint32(start_max_iter)
        ).wait()
        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do]<end_max_iter)[0]
        # and write the iterations to the coresponding cell
        index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]        
        #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]        

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
        # and write them back to our "job" maps for the next loop
        tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
        coord_map[:undone.size*2] = tmp[undone].flatten()
        index_map[:undone.size*2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print "%i done. %i unknown" % (done.size, undone.size)                            

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))


if __name__ == '__main__':
    ctx = cl.create_some_context(interactive=True)
    img = mandel(ctx,
          x=Decimal("-0.7546546453361122021732941811"),
          y=Decimal("0.05020518634419688663435986387"),
          zoom=Decimal("0.0002046859427855630601247281079"),
          max_iter=2000,
          iter_steps=1,
          width=500,
          height=400,
          use_double=False
    )
    Image.fromarray(img).show()

edit: Here is another version where the real/imaginary parts never leave the GPU memory.
The results are the same.
I am completely out of ideas.

SleepProgger
  • 364
  • 6
  • 20
  • 1
    The third image, that you say is correct, does not look right to me either. It lacks the detail you would expect (and the aspect scaling is incorrect). Was it done with 32-bit floating point (inadequate when 2000 iterations)? Is there a clue in *plus some few "wrong" pixel* under the first image? Why are there wrong pixels? – Weather Vane Apr 09 '17 at 16:03
  • Thank you @WeatherVane. The aspect ratio is in deed wrong, that shouldn't matter for this test tho.I fixed the zoom calculation (still wrong, but better). Float image: http://i.imgur.com/2rShDVl.png , double image: http://i.imgur.com/SvwmhWg.png . Same(ish point with another renderer http://guciek.github.io/web_mandelbrot.html#-0.7546546453361122;0.050205186344196885;0.000204685942785563;2000 (yes my rendering is flipped vertically). I will try to give you a better image of what wrong pixels are calculated in a bit. – SleepProgger Apr 09 '17 at 18:47
  • But IMHO the real problem is that i just take the temporary x,y (real/imaginary) from the kernel and supply them again if the point didn't escape (at least that is what i try to do). So i should get the same image as if i calculated it all in one run, or am i wrong here ? – SleepProgger Apr 09 '17 at 18:51
  • Agreed that the aspect ratio and image flipping are not relevant here. I don't use Python, but where is `const uint start_iter` initialised, or set to resume from the previous partial iteration? This all seems a very strange way to farm out the calculations for parallel processing. Do you detect if each point has already escaped before passing it to the next `chunk`? I would have thought it would be more efficient to pass a whole row to each thread, whatever, than to resume a partial iteration. When any thread is complete, you can give it the next row. – Weather Vane Apr 09 '17 at 19:07
  • @WeatherVane i set/update start_iter in line 70 & 107. Yes, i do detect the escaped points and only continue with the non escapes. My idea is to minimize early escape threads waiting for others which haven't escaped (when using workgroup sizes > 1 as far as i understand). The goal is to reduce the time it needs to calculate high iteration points (2 million and up). I might be totally wrong here, but can't compare the performance because i don't get it to run properly ,) – SleepProgger Apr 09 '17 at 19:38
  • Those two are `start_max_iter` not `start_iter`. Whatever is `start_max_iter = 0` for? What about the crucial `start_iter`? In `for(iter=start_iter; iter – Weather Vane Apr 09 '17 at 19:48
  • `start_iter` is the opencl side of things, `start_max_iter` is the python side. The current `start_max_iter` is passed to the OCL kernel as parameter (`start_iter`) in each (python) loop (row 80-85). Maybe my naming isn't the best there. – SleepProgger Apr 09 '17 at 19:53
  • In your code `start_iter` is never set. The other limit stuff seem to involve a lot of fumbling. End of. – Weather Vane Apr 09 '17 at 19:54
  • Again, start_iter is an argument for the opencl kernel. It is provided from the python side when calling the kernel. *Thanks for trying to help tho* – SleepProgger Apr 09 '17 at 19:55

1 Answers1

2

You are using the updated "co-ords" from buffer_out_coords instead of the original coords as the c value when doing the Z squared plus c calculation. The buffer_out_coords contains current Z values rather than the original c co-ord, so these are the values you want to begin from but you also want the original co-ords.

Theres only 4 changes you need:

  • make buffer_out_coords_cl READ_WRITE
  • copy buffer_out_coords into buffer_out_coords_cl before each run
  • filter buffer_out_coords and coord_map by "undone"
  • in the opencl code, load the start x and y from output_coord rather than coords

Im not getting the same output as you were with the code you presented so Im not sure if there is something else wrong here or not but this change gave me consistent output:

1 chunk = 153052 unknown

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 500 for 200000 numbers
46948 done. 153052 unknown

5 chunks = 153052 unknown

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 100 for 200000 numbers
0 done. 200000 unknown
Iterations from iteration 100 to 200 for 200000 numbers
11181 done. 188819 unknown
Iterations from iteration 200 to 300 for 188819 numbers
9627 done. 179192 unknown
Iterations from iteration 300 to 400 for 179192 numbers
16878 done. 162314 unknown
Iterations from iteration 400 to 500 for 162314 numbers
9262 done. 153052 unknown

Heres the code:

import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)
    # build program
    code = """
    #if real_t == double
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter    
    ){
        uint id = get_global_id(0);         
        real_t2 my_coords = vload2(id, coords);           
        real_t2 my_value_coords = vload2(id, output_coord);           
        real_t x = my_value_coords.x;
        real_t y = my_value_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }        
    """
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))
    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)
    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
    buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes)
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))

    start_max_iter = 0
    to_do = coord_map.size / 2
    steps_size = int(max_iter / float(iter_steps))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size )
        print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)

        # copy x/y pairs to device 
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()        
        cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()        
        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do,), None,
            buffer_in_cl,                   
            buffer_out_cl,
            buffer_out_coords_cl,
            np.uint32(end_max_iter),
            np.uint32(start_max_iter)
        ).wait()
        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do]<end_max_iter)[0]
        # and write the iterations to the coresponding cell
        index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]        
        #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]        

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
        # and write them back to our "job" maps for the next loop
        tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
        buffer_out_coords[:undone.size*2] = tmp[undone].flatten()
        tmp = coord_map[:to_do*2].reshape((to_do, 2))
        coord_map[:undone.size*2] = tmp[undone].flatten()
        index_map[:undone.size*2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print "%i done. %i unknown" % (done.size, undone.size)                            

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))


if __name__ == '__main__':
    ctx = cl.create_some_context(interactive=True)
    img = mandel(ctx,
          x=Decimal("-0.7546546453361122021732941811"),
          y=Decimal("0.05020518634419688663435986387"),
          zoom=Decimal("0.0002046859427855630601247281079"),
          max_iter=2000,
          iter_steps=1,
          width=500,
          height=400,
          use_double=False
    )
    Image.fromarray(img).show()
spacepickle
  • 2,678
  • 16
  • 21
  • Oh damn. Thank you. Didn't test it but as soon as i started reading your answer i knew i was stupid. I searched so unbelievable long for the error. Thanks again. Will accept as soon as i tested it but i am pretty sure thats the error. – SleepProgger Apr 15 '17 at 08:44