3

I'm rewriting my old rendering pipeline. I created a very lean prototype of what I'd like, and I'm stunned that my old fairly complex and badly optimized pipeline has the exact same performance as the super simple prototype.

The task is to render 1024 arbitrarily sized meshes (14 million triangles in total), each with a different set of uniforms.

What I do now is use uniform buffers together with glMultiDrawElementsIndirect, indexing into the uniform buffer with gl_DrawIDARB. This is the render loop:

"""
    renderloop(window, N, frame_times, program, commandbuff)

Render up to `N` frames with one `glMultiDrawElementsIndirect` call per frame and append
the duration of each frame, as reported by `toq()`, to `frame_times`.

The loop stops early when `window` is closed. `program` is made current and
`commandbuff` is bound once before the loop; `length(commandbuff)` is passed as the
draw count. Returns `frame_times`.
"""
function renderloop(window, N, frame_times, program, commandbuff)
    glUseProgram(program)
    glEnable(GL_DEPTH_TEST)
    glClearColor(1, 1, 1, 1)
    GLAbstraction.bind(commandbuff)
    # Drain whatever GPU work is still pending from setup so it is not billed to the
    # first timed frame.
    glFinish()
    n = 0
    # `n < N` (not `<=`) so that exactly N samples are recorded.
    while isopen(window) && n < N
        tic()
        GLWindow.poll_glfw()
        # No vertex array is bound here; the original note says rebinding it every
        # frame "doesn't change timing much".
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        glMultiDrawElementsIndirect(
            GL_TRIANGLES,
            GL_UNSIGNED_INT,
            C_NULL, length(commandbuff), 0
        )
        GLWindow.swapbuffers(window)
        # GL commands are only queued by the calls above. Wait for the GPU to finish
        # *this* frame before stopping the timer, so each sample covers its own work
        # instead of the previous frame's.
        glFinish()
        push!(frame_times, toq())
        n += 1
    end
    return frame_times
end

My other pipeline is too complex to write down here, but in short it's unoptimized Julia code, GLSL 3.0 drawing code with uniforms + ray picking + fxaa + a couple of render targets and so on. Shaders are pretty much the same, besides the modernization with uniform blocks etc.

The new (almost) complete code can be seen here:

# Vertex shader source (GLSL 4.50, uses GL_ARB_shader_draw_parameters).
#
# Per-draw data (color + model matrix) is fetched from the `VertexArguments` uniform
# block using `gl_DrawIDARB`, i.e. the index of the current sub-draw of the multi-draw
# call. The position is transformed by `model`, then `scene.view`, then `scene.proj`.
#
# Outputs: `vertex` is the negated camera-space position, `normal` is the input normal
# passed through unchanged, `lightdir` is derived from the constant `vec3(-10)` and the
# untransformed input position, `color` is the per-draw color.
#
# NOTE(review): `scene.lightposition`, `scene.projview` and `scene.resolution` are
# declared but never read in `main` — confirm whether `lightdir` was meant to use
# `scene.lightposition`.
# NOTE(review): `normal` and `lightdir` are not transformed by `model`/`view` while
# `vertex` is in camera space — confirm the mixed spaces are intended.
# NOTE(review): 1024 x (vec4 + mat4) should come to 80 bytes per element under std140,
# i.e. 80 KiB for the block — TODO confirm this is within GL_MAX_UNIFORM_BLOCK_SIZE on
# the target driver.
vert = """
#version 450
#extension GL_ARB_shader_draw_parameters : enable

struct VertexArgument{
    vec4 color;
    mat4 model;
};

layout (location = 0) in vec3 position;
layout (location = 1) in vec3 normal;

layout (std140) uniform Scene{
    vec4 lightposition;
    mat4 proj;
    mat4 view;
    mat4 projview;
    vec2 resolution;
} scene;

layout (std140) uniform VertexArguments{
    VertexArgument[1024] args;
} vertex_arguments;


out VertexOut{
    vec3 vertex;
    vec3 normal;
    vec3 lightdir;
    vec4 color;
} vertex_out;

void main(){
    VertexArgument arg = vertex_arguments.args[gl_DrawIDARB];
    vec4 position_camspace = scene.view * arg.model * vec4(position,  1.0);
    gl_Position = scene.proj * position_camspace;
    vertex_out.lightdir = normalize(vec3(-10) - position.xyz);
    vertex_out.vertex = -position_camspace.xyz;
    vertex_out.normal = normal;
    vertex_out.color = arg.color;
}
"""

# Fragment shader source (GLSL 4.50).
#
# `blinnphong(V, N, L, color)` returns a constant ambient term (0.1 * 0.3), a diffuse
# term scaled by `color`, and a specular term with a fixed exponent of 8.0; the specular
# term is zeroed when the diffuse coefficient is not positive.
#
# `main` evaluates the model twice, once with the interpolated light direction and once
# with its negation, and writes the sum with alpha fixed to 1.0 (the alpha of the
# incoming color is ignored).
#
# NOTE(review): `vertex_in.vertex` is passed as `V` without being normalized, while `L`
# is normalized, so the half vector `normalize(L+V)` is dominated by `V` when `V` is
# long — confirm this is intended.
frag = """
#version 450

vec3 blinnphong(vec3 V, vec3 N, vec3 L, vec3 color){

    float diff_coeff = max(dot(L,N), 0.0);

    // specular coefficient
    vec3 H = normalize(L+V);

    float spec_coeff = pow(max(dot(H,N), 0.0), 8.0);
    if (diff_coeff <= 0.0)
        spec_coeff = 0.0;

    // final lighting model
    return vec3(
        vec3(0.1) * vec3(0.3)  +
        vec3(0.9) * color * diff_coeff +
        vec3(0.3) * spec_coeff
    );
}

in VertexOut{
    vec3 vertex;
    vec3 normal;
    vec3 lightdir;
    vec4 color;
} vertex_in;

layout (location = 0) out vec4 frag_color;

void main(){
    vec3 L = normalize(vertex_in.lightdir);
    vec3 N = normalize(vertex_in.normal);
    vec3 light1 = blinnphong(vertex_in.vertex, N, L, vertex_in.color.rgb);
    vec3 light2 = blinnphong(vertex_in.vertex, N, -L, vertex_in.color.rgb);
    frag_color = vec4(light1 + light2, 1.0);
}
"""

# Create the window / GL context: version 4.5 requested, debugging off, no
# multisampling, 32 depth bits, 8 bits per RGBA channel, no stencil or aux buffers.
window = create_glcontext(
    major = 4, minor = 5, debugging = false,
    windowhints = [
        (GLFW.SAMPLES,      0),
        (GLFW.DEPTH_BITS,   32),

        (GLFW.ALPHA_BITS,   8),
        (GLFW.RED_BITS,     8),
        (GLFW.GREEN_BITS,   8),
        (GLFW.BLUE_BITS,    8),

        (GLFW.STENCIL_BITS, 0),
        (GLFW.AUX_BUFFERS,  0)
    ]
)

# Event source for the window; the camera below takes its `Area` from it.
events = WindowEvents(Window => window)

# Perspective camera placed at (6, 6, 8) and looking at the origin.
cam = PerspectiveCamera(
    TranslationSpeed => 1f0,
    LookAt => Vec3f0(0),
    EyePosition => Vec3f0(6, 6, 8),
    Rotation => Vec3f0(0),
    Area => events[Area],
    RotationSpeed => 0.1f0
)

# Compile both shader stages from the source strings (passed as byte vectors) and
# combine them into the program used by the render loop.
vertshader = compile_shader(Vector{UInt8}(vert), GL_VERTEX_SHADER, :vertexshader)
fragshader = compile_shader(Vector{UInt8}(frag), GL_FRAGMENT_SHADER, :fragshader)

program = compile_program(vertshader, fragshader)

# Initial contents of the `Scene` uniform block. The tuple order mirrors the block
# declaration in the vertex shader: lightposition, proj, view, projview, resolution.
scene = (
    Vec4f0(10),
    cam[Projection],
    cam[View],
    cam[ProjectionView],
    Vec2f0(widths(cam[Area]))
)

# NOTE(review): the trailing comment says GL_STATIC_DRAW, yet the buffer is rewritten
# in the callback below — confirm the usage hint UniformBuffer actually applies.
scene_buff = UniformBuffer(scene) # create UniformBuffer GL_STATIC_DRAW

# Callback on the camera's ProjectionView (presumably invoked whenever it changes):
# rebuild the whole scene tuple and store it as element 1 of the scene buffer.
FieldTraits.on(cam, ProjectionView) do projview
    # write new values to scene buffer.. if not doing this, timings stay the same
    scene_buff[1] = (
        Vec4f0(10),
        cam[Projection],
        cam[View],
        projview,
        Vec2f0(widths(cam[Area]))
    )
end

# Prototype element (color, model matrix) whose type defines the element type of the
# per-mesh uniform buffer; it matches the `VertexArgument` struct in the vertex shader.
# The buffer starts without elements here and is filled by `loadmeshes` via `push!`.
vals = (Vec4f0(1, 0, 0, 1), eye(Mat4f0))
uniform_array = UniformBuffer(typeof(vals))

"""
    loadmeshes(folder)

Load up to 1024 `.ifs` meshes from `folder`, concatenate them into one vertex array and
build one indirect draw command per mesh.

For every mesh a `(color, model matrix)` entry is pushed onto the global
`uniform_array`: a random opaque color, and a matrix that moves the mesh's minimum
corner to the origin, scales it by the inverse of its largest extent and places it on a
32x32 grid cell chosen by the mesh index.

Returns `(vbo, ibuff)`: the vertex array and the buffer of draw commands (created with
`buffertype = GL_DRAW_INDIRECT_BUFFER`). Raises an error if `folder` contains no
`.ifs` files.
"""
function loadmeshes(folder)
    allpaths = filter(x -> endswith(x, ".ifs"), readdir(folder))
    isempty(allpaths) && error("no .ifs meshes found in $folder")
    # The shader's `args` array and the 32x32 placement grid both hold 1024 entries.
    # Take fewer when the folder has fewer, instead of failing with a BoundsError.
    meshpaths = allpaths[1:min(length(allpaths), 1024)]
    faces = GLTriangle[]
    vertices = Tuple{Point3f0, Normal{3, Float32}}[]
    # Running offsets of the current mesh inside the shared index / vertex data.
    fidx = 0; vidx = 0
    drawcommands = Vector{Command}(length(meshpaths))
    for (i, meshpath) in enumerate(meshpaths)
        mesh = read_ifs(joinpath(folder, meshpath))
        fs, vs = mesh.indexes[1], mesh.parent
        append!(faces, fs)
        ns = normals(vs, fs)
        append!(vertices, zip(vs, ns))
        mini, maxi = extrema(vs)
        # Grid cell for this mesh.
        x, y = ind2sub((32, 32), i)
        trans = translationmatrix(Vec3f0(x, y, 0f0))
        s = maximum(maxi .- mini)
        scale = scalematrix(Vec3f0(1f0 ./ s))
        # Add the per-mesh uniform attributes to the buffer.
        push!(uniform_array, (
            Vec4f0(rand(Vec3f0)..., 1f0),
            trans * scale * translationmatrix(-Vec3f0(mini))
        ))
        nindices = length(fs) * 3 # three indices per triangle
        # Presumably (count, instance count, first index, base vertex, base instance),
        # as in GL's indirect draw command layout — TODO confirm against `Command`.
        drawcommands[i] = Command(nindices, 1, fidx, vidx, 0)
        fidx += nindices; vidx += length(vs)
    end

    vbo = VertexArray(view(vertices, faces)) # vertexarray
    ibuff = GLBuffer(drawcommands, buffertype = GL_DRAW_INDIRECT_BUFFER)
    return vbo, ibuff
end

# Load the meshes; as a side effect `loadmeshes` also fills `uniform_array`.
vbo, commandbuff = loadmeshes(homedir() * "/3dstuff/models")
# Look up the two uniform blocks declared in the vertex shader.
sceneidx = glGetUniformBlockIndex(program, "Scene")
vertex_arts_idx = glGetUniformBlockIndex(program, "VertexArguments")

# Route block "Scene" to binding point 0 and "VertexArguments" to binding point 1,
# then attach the matching buffers to those same binding points.
glUniformBlockBinding(program, sceneidx, 0)
glUniformBlockBinding(program, vertex_arts_idx, 1)
glBindBufferBase(GL_UNIFORM_BUFFER, 0, scene_buff.buffer.id)
glBindBufferBase(GL_UNIFORM_BUFFER, 1, uniform_array.buffer.id)


"""
    renderloop(window, N, frame_times, commandbuff)

Render up to `N` frames with one `glMultiDrawElementsIndirect` call per frame and append
the duration of each frame, as reported by `toq()`, to `frame_times`.

The loop stops early when `window` is closed. The global `program` is made current and
`commandbuff` is bound once before the loop; `length(commandbuff)` is passed as the
draw count. Returns `frame_times`.
"""
function renderloop(window, N, frame_times, commandbuff)
    glUseProgram(program)
    glEnable(GL_DEPTH_TEST)
    glClearColor(1, 1, 1, 1)
    GLAbstraction.bind(commandbuff)
    # Drain whatever GPU work is still pending from setup so it is not billed to the
    # first timed frame.
    glFinish()
    n = 0
    # `n < N` (not `<=`) so that exactly N samples are recorded.
    while isopen(window) && n < N
        tic()
        GLWindow.poll_glfw()
        # No vertex array is bound here; the original note says rebinding it every
        # frame "doesn't change timing much".
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        glMultiDrawElementsIndirect(
            GL_TRIANGLES,
            GL_UNSIGNED_INT,
            C_NULL, length(commandbuff), 0
        )
        GLWindow.swapbuffers(window)
        # GL commands are only queued by the calls above. Wait for the GPU to finish
        # *this* frame before stopping the timer, so each sample covers its own work
        # instead of the previous frame's.
        glFinish()
        push!(frame_times, toq())
        n += 1
    end
    return frame_times
end
# Benchmark driver: collect per-frame timings into `times` and report the mean frame
# time in milliseconds (the `* 1000` assumes the samples are in seconds).
times = Float64[]
renderloop(window, 2000, times, commandbuff)
mean(times) * 1000 # ~ 14 ms

GPU is a FirePro 9100.

Timings of old pipeline: ~13ms per frame. New prototype: ~15ms and 0.2ms without the glMultiDrawElementsIndirect call.

I also tried turning vsync on and off and moved the code around a little, with no difference in timing whatsoever. The new prototype feels less smooth as well, so it seems like it's not just a measuring problem.

Simon Danisch
  • 554
  • 3
  • 8
  • My only explanation is that this must be completely vertex-shader bound, leaving lots of fragment units idle (duh). So having more work for the fragment shader and unoptimized Julia code feeding the GPU doesn't change anything, since the OpenGL commands get scheduled asynchronously and the vertex stage is saturated. But this can't explain everything: it still shouldn't be slower, and it should be possible to get over 60 frames per second with such a powerful GPU and only one draw call. Also, with 2048 models the perf gap becomes even worse! So there must also be a regression in the shader! Or a driver bug? – Simon Danisch Apr 28 '17 at 15:19
  • Perhaps your timing measures bus and shaders processing at once. [This](https://www.khronos.org/opengl/wiki/Performance) and [this](http://www.lighthouse3d.com/tutorials/opengl-timer-query/) may give you some ideas. – Ripi2 Apr 28 '17 at 16:17

1 Answers1

1

glMultiDrawElementsIndirect( GL_TRIANGLES, GL_UNSIGNED_INT, C_NULL, length(commandbuff), 0 )

This parameter should be how many elements you want to draw. Put 1024 here to see if it fixes the performance issue.

Alundaio
  • 551
  • 4
  • 8