I'm rewriting my old rendering pipeline. I created a very lean prototype of what I'd like, and I'm stunned that my old fairly complex and badly optimized pipeline has the exact same performance as the super simple prototype.
The task is to render 1024 arbitrarily sized meshes (14 million triangles in total), each with its own set of uniforms.
What I do now is use uniform buffers together with `glMultiDrawElementsIndirect`, indexing into the uniform buffer with `gl_DrawIDARB`. This is the render loop:
"""
    renderloop(window, N, frame_times, program, commandbuff)

Render at most `N` frames with a single `glMultiDrawElementsIndirect` call per
frame and record how long each frame took.

# Arguments
- `window`: window that is polled for events and whose buffers are swapped.
- `N`: maximum number of frames to render; the loop also stops once the window
  is closed.
- `frame_times`: collection that receives one `toq()` sample (seconds) per
  rendered frame.
- `program`: GL program object made current before drawing.
- `commandbuff`: indirect draw command buffer; its `length` is used as the
  draw count.

# Returns
`frame_times`, with one new entry per rendered frame.
"""
function renderloop(window, N, frame_times, program, commandbuff)
    glUseProgram(program)
    glEnable(GL_DEPTH_TEST)
    glClearColor(1, 1, 1, 1)
    GLAbstraction.bind(commandbuff)
    # Wait for all previously issued GL work (e.g. buffer uploads) so that it
    # is not charged to the first timed frame.
    glFinish()
    n = 0
    # `n < N` renders exactly N frames (the previous `n <= N` rendered N + 1).
    while isopen(window) && n < N
        tic()
        GLWindow.poll_glfw()
        # NOTE(review): no vertex array is bound here; this assumes the VAO
        # created by the caller is still bound. Re-binding it per frame via
        # glBindVertexArray was reported not to change timing much.
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        glMultiDrawElementsIndirect(
            GL_TRIANGLES,
            GL_UNSIGNED_INT,
            C_NULL, length(commandbuff), 0
        )
        GLWindow.swapbuffers(window)
        # Block until the GPU has finished this frame, so the sample covers
        # this frame's own work rather than the previous frame's.
        glFinish()
        push!(frame_times, toq())
        n += 1
    end
    return frame_times
end
My other pipeline is too complex to reproduce here, but in short it consists of unoptimized Julia code and GLSL 3.0 drawing code with plain uniforms, ray picking, FXAA, a couple of render targets, and so on. The shaders are pretty much the same, apart from the modernization with uniform blocks etc.
The new (almost) complete code can be seen here:
# GLSL 4.50 vertex shader source.
#
# Inputs: `position` (location 0) and `normal` (location 1).
# Uniform blocks (std140):
#   * `Scene`           - light position, projection, view, projection*view
#                         matrices and resolution. Only `view` and `proj` are
#                         read by this shader.
#   * `VertexArguments` - array of 1024 `VertexArgument` structs (color + model
#                         matrix), indexed by `gl_DrawIDARB`, i.e. one entry per
#                         draw command of a multi-draw call.
# Outputs (`VertexOut`): negated camera-space position, the unmodified input
# normal, a light direction derived from the untransformed `position`, and the
# per-draw color.
#
# NOTE(review): 1024 entries of vec4 + mat4 should occupy 1024 * 80 bytes under
# std140 - verify this fits GL_MAX_UNIFORM_BLOCK_SIZE on the target driver.
# NOTE(review): `normal` and `lightdir` are not transformed by `arg.model` or
# `scene.view`, while `vertex` is in camera space - confirm this mix of
# coordinate spaces is intended.
vert = """
#version 450
#extension GL_ARB_shader_draw_parameters : enable
struct VertexArgument{
vec4 color;
mat4 model;
};
layout (location = 0) in vec3 position;
layout (location = 1) in vec3 normal;
layout (std140) uniform Scene{
vec4 lightposition;
mat4 proj;
mat4 view;
mat4 projview;
vec2 resolution;
} scene;
layout (std140) uniform VertexArguments{
VertexArgument[1024] args;
} vertex_arguments;
out VertexOut{
vec3 vertex;
vec3 normal;
vec3 lightdir;
vec4 color;
} vertex_out;
void main(){
VertexArgument arg = vertex_arguments.args[gl_DrawIDARB];
vec4 position_camspace = scene.view * arg.model * vec4(position, 1.0);
gl_Position = scene.proj * position_camspace;
vertex_out.lightdir = normalize(vec3(-10) - position.xyz);
vertex_out.vertex = -position_camspace.xyz;
vertex_out.normal = normal;
vertex_out.color = arg.color;
}
"""
# GLSL 4.50 fragment shader source.
#
# `blinnphong(V, N, L, color)` combines a constant ambient term
# (0.1 * 0.3), a diffuse term (0.9 * color * max(dot(L, N), 0)) and a specular
# term (0.3 * max(dot(H, N), 0)^8, suppressed when the diffuse term is zero),
# where H is the normalized half vector of L and V.
#
# `main` evaluates the model twice, once with the interpolated light direction
# and once with its negation, and writes the sum with alpha 1.0 to output
# location 0.
#
# The view vector is normalized before use: the half vector
# `normalize(L + V)` is only meaningful for unit-length L and V, and the
# interpolated `vertex_in.vertex` is a (negated) camera-space position of
# arbitrary length.
frag = """
#version 450
vec3 blinnphong(vec3 V, vec3 N, vec3 L, vec3 color){
    float diff_coeff = max(dot(L,N), 0.0);
    // specular coefficient
    vec3 H = normalize(L+V);
    float spec_coeff = pow(max(dot(H,N), 0.0), 8.0);
    if (diff_coeff <= 0.0)
        spec_coeff = 0.0;
    // final lighting model
    return vec3(
        vec3(0.1) * vec3(0.3) +
        vec3(0.9) * color * diff_coeff +
        vec3(0.3) * spec_coeff
    );
}
in VertexOut{
    vec3 vertex;
    vec3 normal;
    vec3 lightdir;
    vec4 color;
} vertex_in;
layout (location = 0) out vec4 frag_color;
void main(){
    vec3 L = normalize(vertex_in.lightdir);
    vec3 N = normalize(vertex_in.normal);
    // unit-length view vector, required for a correct half vector
    vec3 V = normalize(vertex_in.vertex);
    vec3 light1 = blinnphong(V, N, L, vertex_in.color.rgb);
    vec3 light2 = blinnphong(V, N, -L, vertex_in.color.rgb);
    frag_color = vec4(light1 + light2, 1.0);
}
"""
# Framebuffer hints handed to the context creation call: no multisampling,
# a 32-bit depth buffer, 8 bits for each of the alpha/red/green/blue channels,
# and neither stencil nor auxiliary buffers.
glfw_hints = [
    (GLFW.SAMPLES, 0),
    (GLFW.DEPTH_BITS, 32),
    (GLFW.ALPHA_BITS, 8),
    (GLFW.RED_BITS, 8),
    (GLFW.GREEN_BITS, 8),
    (GLFW.BLUE_BITS, 8),
    (GLFW.STENCIL_BITS, 0),
    (GLFW.AUX_BUFFERS, 0)
]

# Create the window with a version 4.5 context and debugging switched off.
window = create_glcontext(
    major = 4,
    minor = 5,
    debugging = false,
    windowhints = glfw_hints
)
# Event source attached to the window created above.
events = WindowEvents(Window => window)
# Perspective camera positioned at (6, 6, 8) and looking at Vec3f0(0).
# Its `Area` is taken from the window events, presumably so the camera follows
# window resizes - confirm against the PerspectiveCamera implementation.
cam = PerspectiveCamera(
TranslationSpeed => 1f0,
LookAt => Vec3f0(0),
EyePosition => Vec3f0(6, 6, 8),
Rotation => Vec3f0(0),
Area => events[Area],
RotationSpeed => 0.1f0
)
# Compile both shader sources (passed as byte vectors) and combine them into
# the program used for rendering.
vertshader = compile_shader(Vector{UInt8}(vert), GL_VERTEX_SHADER, :vertexshader)
fragshader = compile_shader(Vector{UInt8}(frag), GL_FRAGMENT_SHADER, :fragshader)
program = compile_program(vertshader, fragshader)
# Assemble the values for the `Scene` uniform block in the order the shader
# declares them: light position, projection, view, projection*view, resolution.
# Everything except `projview` is read from the current camera state.
scenefields(projview) = (
    Vec4f0(10),
    cam[Projection],
    cam[View],
    projview,
    Vec2f0(widths(cam[Area]))
)

scene = scenefields(cam[ProjectionView])
scene_buff = UniformBuffer(scene) # create UniformBuffer GL_STATIC_DRAW

# Refresh the scene buffer whenever the camera's projection*view changes.
# Skipping this update does not change the measured timings.
FieldTraits.on(cam, ProjectionView) do projview
    scene_buff[1] = scenefields(projview)
end
# Prototype of one per-mesh uniform entry: a color and a 4x4 matrix, matching
# the (color, model) fields of the shader's `VertexArgument` struct.
vals = (Vec4f0(1, 0, 0, 1), eye(Mat4f0))
# Uniform buffer typed after that prototype; `loadmeshes` pushes one entry per
# mesh into it.
uniform_array = UniformBuffer(typeof(vals))
"""
    loadmeshes(folder)

Load up to 1024 `.ifs` meshes from `folder` and merge them into one vertex
array plus one indirect draw command per mesh.

For every mesh this
- appends its faces and its (vertex, normal) pairs to shared arrays,
- pushes a `(color, model matrix)` entry onto the global `uniform_array`, where
  the color is random and the model matrix moves the mesh's minimum corner to
  the origin, scales its largest extent to 1 and places it on a 32x32 grid,
- records a `Command` holding the mesh's index count and its running index and
  vertex offsets.

If `folder` contains fewer than 1024 `.ifs` files, all of them are loaded
instead of failing with a `BoundsError`.

Note that `uniform_array` is mutated: calling this function more than once
appends additional entries.

# Returns
A tuple `(vbo, ibuff)`: the vertex array and the `GL_DRAW_INDIRECT_BUFFER`
holding the draw commands.
"""
function loadmeshes(folder)
    # Matches the size of the `args` array in the vertex shader and the
    # 32x32 placement grid used below.
    maxmeshes = 1024
    allpaths = filter(x -> endswith(x, ".ifs"), readdir(folder))
    # Clamp the range so that a folder with fewer files does not throw.
    meshpaths = allpaths[1:min(length(allpaths), maxmeshes)]
    faces = GLTriangle[]
    vertices = Tuple{Point3f0, Normal{3, Float32}}[]
    # Running offsets into the merged index and vertex arrays.
    fidx = 0; vidx = 0
    drawcommands = Vector{Command}(length(meshpaths))
    for (i, meshpath) in enumerate(meshpaths)
        mesh = read_ifs(joinpath(folder, meshpath))
        fs, vs = mesh.indexes[1], mesh.parent
        append!(faces, fs)
        ns = normals(vs, fs)
        append!(vertices, zip(vs, ns))
        mini, maxi = extrema(vs)
        # Grid cell of the i-th mesh.
        x, y = ind2sub((32, 32), i)
        trans = translationmatrix(Vec3f0(x, y, 0f0))
        # Largest bounding-box extent, used to scale the mesh uniformly.
        s = maximum(maxi .- mini)
        scale = scalematrix(Vec3f0(1f0 ./ s))
        # add uniform attributes to buffer
        push!(uniform_array, (
            Vec4f0(rand(Vec3f0)..., 1f0),
            trans * scale * translationmatrix(-Vec3f0(mini))
        ))
        nindices = length(fs) * 3 # three indices per triangle
        # Presumably (count, instance count, first index, base vertex,
        # base instance) - confirm against the `Command` definition.
        drawcommands[i] = Command(nindices, 1, fidx, vidx, 0)
        fidx += nindices; vidx += length(vs)
    end
    vbo = VertexArray(view(vertices, faces)) # vertexarray
    ibuff = GLBuffer(drawcommands, buffertype = GL_DRAW_INDIRECT_BUFFER)
    return vbo, ibuff
end
# Load the meshes; this also fills `uniform_array` with one entry per mesh.
vbo, commandbuff = loadmeshes(homedir() * "/3dstuff/models")

# `Scene` uniform block: binding point 0, backed by the scene buffer.
sceneidx = glGetUniformBlockIndex(program, "Scene")
glUniformBlockBinding(program, sceneidx, 0)
glBindBufferBase(GL_UNIFORM_BUFFER, 0, scene_buff.buffer.id)

# `VertexArguments` uniform block: binding point 1, backed by the per-mesh
# uniform array.
vertex_arts_idx = glGetUniformBlockIndex(program, "VertexArguments")
glUniformBlockBinding(program, vertex_arts_idx, 1)
glBindBufferBase(GL_UNIFORM_BUFFER, 1, uniform_array.buffer.id)
"""
    renderloop(window, N, frame_times, commandbuff)

Render at most `N` frames with a single `glMultiDrawElementsIndirect` call per
frame and record how long each frame took. The GL program is taken from the
global `program`.

# Arguments
- `window`: window that is polled for events and whose buffers are swapped.
- `N`: maximum number of frames to render; the loop also stops once the window
  is closed.
- `frame_times`: collection that receives one `toq()` sample (seconds) per
  rendered frame.
- `commandbuff`: indirect draw command buffer; its `length` is used as the
  draw count.

# Returns
`frame_times`, with one new entry per rendered frame.
"""
function renderloop(window, N, frame_times, commandbuff)
    glUseProgram(program)
    glEnable(GL_DEPTH_TEST)
    glClearColor(1, 1, 1, 1)
    GLAbstraction.bind(commandbuff)
    # Wait for all previously issued GL work (e.g. buffer uploads) so that it
    # is not charged to the first timed frame.
    glFinish()
    n = 0
    # `n < N` renders exactly N frames (the previous `n <= N` rendered N + 1).
    while isopen(window) && n < N
        tic()
        GLWindow.poll_glfw()
        # NOTE(review): no vertex array is bound here; this assumes the VAO
        # created during mesh loading is still bound. Re-binding it per frame
        # via glBindVertexArray was reported not to change timing much.
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        glMultiDrawElementsIndirect(
            GL_TRIANGLES,
            GL_UNSIGNED_INT,
            C_NULL, length(commandbuff), 0
        )
        GLWindow.swapbuffers(window)
        # Block until the GPU has finished this frame, so the sample covers
        # this frame's own work rather than the previous frame's.
        glFinish()
        push!(frame_times, toq())
        n += 1
    end
    return frame_times
end
# Collect per-frame timings (seconds) for a run requesting 2000 frames.
times = Float64[]
renderloop(window, 2000, times, commandbuff)
# Average frame time converted from seconds to milliseconds.
mean(times) * 1000 # ~ 14 ms
GPU is a FirePro 9100.
Timings of the old pipeline: ~13 ms per frame. The new prototype takes ~15 ms per frame, or 0.2 ms without the glMultiDrawElementsIndirect call.
I also tried turning vsync on and off and moved the code around a little, with no difference in timing whatsoever. The new prototype feels less smooth as well, so it seems like it's not just a measuring problem.