3

I have written a class that renders 2D objects, modeled on Dear ImGui's DrawList, because that design can draw many different kinds of objects thanks to its index vector (a dynamic array) and still stay well optimized. Dear ImGui can render 30k unfilled rects at ~36 fps using ~70 MB in debug mode, without antialiasing (on my computer). My very limited version draws 30k unfilled rects at ~3 fps using ~130 MB in debug mode.

// Batching 2D renderer: geometry is accumulated on the CPU in
// `vertexBuffer` / `elementBuffer` and uploaded and drawn by Render().
class Renderer
{
public:
    Renderer();
    ~Renderer();

    // Generates the vertex/element buffer objects and passes the shader
    // sources to `shader`.
    void Create();

    // Queues the outline of a rectangle at (x, y) with size (w, h).
    void DrawRect(float x, float y, float w, float h, GLuint color, float thickness);

    // Uploads everything queued so far and draws it as indexed triangles,
    // using an orthographic projection spanning (0..w, 0..h).
    void Render(float w, float h);

    // Empties all CPU-side buffers and resets the write cursors.
    void Clear();

    // Grows the CPU-side buffers by the given amounts and points the write
    // cursors (`mappedVertex`, `mappedElement`) at the newly added range.
    void ReserveData(int numVertices, int numElements);

    // Queues a closed outline through `vertices`, one quad per edge.
    void CreatePolygon(const Vector2* vertices, const GLuint verticesCount, GLuint color, float thickness);

    // OpenGL object names. Zero-initialised so they never hold indeterminate
    // values before Create()/Render() assign them.
    GLuint vao = 0;
    GLuint vbo = 0;
    GLuint ebo = 0;
    GLShader shader;

    // Write cursors into `vertexBuffer` / `elementBuffer`; set by
    // ReserveData() and advanced by CreatePolygon().
    Vertex* mappedVertex = nullptr;
    GLuint* mappedElement = nullptr;
    // Index of the next vertex to be written. This is a plain integer, not a
    // pointer; it was previously declared in the same statement as
    // `mappedElement`.
    GLuint currentVertexIndex = 0;

    std::vector<Vertex> vertexBuffer;   // vertices uploaded to `vbo` by Render()
    std::vector<GLuint> elementBuffer;  // indices uploaded to `ebo` by Render()
    std::vector<Vector2> vertices;      // scratch corner list filled by DrawRect()

};

// GLSL vertex shader source. Transforms `a_position` by the `projection`
// uniform and forwards the per-vertex colour to the fragment stage.
// Note: `v_position` is declared as an output but never written in main().
const char* vtx =
R"(

#version 460 core

layout(location = 0) in vec3 a_position;
layout(location = 1) in vec4 a_color;

out vec3 v_position;
out vec4 v_color;

uniform mat4 projection;

void main()
{
    gl_Position = projection * vec4(a_position, 1.0);

    v_color = a_color;
}

)";

// GLSL fragment shader source. Writes the interpolated vertex colour
// `v_color` unchanged to colour output 0.
const char* frag =
R"(
#version 460 core

layout (location = 0) out vec4 outColor;

in vec4 v_color;

void main()
{
    outColor = v_color;
}
)";

// Discards all geometry queued so far so that a new frame can be built.
void Renderer::Clear()
{
    // Rewind the write cursors; they are re-established by ReserveData().
    currentVertexIndex = 0;
    mappedElement = nullptr;
    mappedVertex = nullptr;

    // Drop the contents of every CPU-side buffer.
    vertices.clear();
    elementBuffer.clear();
    vertexBuffer.clear();
}

// One-time GPU setup: allocates the buffer object names and supplies the
// shader sources.
void Renderer::Create()
{
    // Names for the vertex buffer and the element (index) buffer.
    glGenBuffers(1, &vbo);
    glGenBuffers(1, &ebo);

    // NOTE(review): `vtx` / `frag` hold GLSL source text, not file paths -
    // confirm that VtxFromFile / FragFromFile accept source strings.
    shader.VtxFromFile(vtx);
    shader.FragFromFile(frag);
}

void Renderer::DrawRect(float x, float y, float w, float h, GLuint color,     float thickness)
{
    // Add vertices
    vertices.push_back({ x, y });
    vertices.push_back(Vector2(x, y + w));
    vertices.push_back(Vector2( x, y ) + Vector2(w, h));
    vertices.push_back(Vector2(x + w, y));
    // Create rect
    CreatePolygon(vertices.data(), vertices.size(), color, thickness);
}

void Renderer::Render(float w, float h)
{
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);

    shader.UseProgram();
    shader.UniformMatrix4fv("projection", glm::ortho(0.0f, w, 0.0f, h));

    GLuint elemCount = elementBuffer.size();

    glGenVertexArrays(1, &vao);
    glBindVertexArray(vao);

    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glEnableVertexAttribArray(0);
    glEnableVertexAttribArray(1);

    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, sizeof(Vertex), (const void*)offsetof(Vertex, position));
    glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(Vertex), (const void*)offsetof(Vertex, position));

    glBufferData(GL_ARRAY_BUFFER, vertexBuffer.size() * sizeof(Vertex), vertexBuffer.data(), GL_STREAM_DRAW);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo);
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, elementBuffer.size() * sizeof(GLuint), elementBuffer.data(), GL_STREAM_DRAW);

    const unsigned short* idxBufferOffset = 0;

    glDrawElements(GL_TRIANGLES, elemCount, GL_UNSIGNED_INT, idxBufferOffset);

    idxBufferOffset += elemCount;

    glDeleteVertexArrays(1, &vao);

    glDisable(GL_BLEND);
}

// Queues a closed outline through the given points. Every edge (including
// the closing edge from the last point back to the first) becomes one thin
// quad: 4 vertices and 6 indices.
//   vertices       - points of the outline, in order
//   verticesCount  - number of points in `vertices`
//   color          - packed colour value stored in every generated vertex
//   thickness      - width of each edge quad
// The member scratch list `this->vertices` is cleared on exit.
void Renderer::CreatePolygon(const Vector2* vertices, const GLuint verticesCount, GLuint color, float thickness)
{
    // One quad per edge: 4 vertices and 6 indices each.
    ReserveData(verticesCount * 4, verticesCount * 6);

    // Same for every edge, so compute it once.
    const float halfThickness = thickness * 0.5f;

    for (GLuint i = 0; i < verticesCount; ++i)
    {
        // Wrap around so the final edge closes the outline.
        const GLuint j = (i + 1 == verticesCount) ? 0 : i + 1;

        const Vector2& position1 = vertices[i];
        const Vector2& position2 = vertices[j];

        // Unit direction of the edge; a zero-length edge is left unscaled.
        Vector2 difference = position2 - position1;
        const auto length = difference.Magnitude();   // evaluated once per edge
        difference *= length > 0 ? 1.0f / length : 1.0f;

        const float dx = difference.x * halfThickness;
        const float dy = difference.y * halfThickness;

        // Offset both end points by (dy, -dx) and (-dy, dx), i.e.
        // perpendicular to the edge, to give the quad its width.
        mappedVertex[0].position = Vector2(position1.x + dy, position1.y - dx);
        mappedVertex[1].position = Vector2(position2.x + dy, position2.y - dx);
        mappedVertex[2].position = Vector2(position2.x - dy, position2.y + dx);
        mappedVertex[3].position = Vector2(position1.x - dy, position1.y + dx);

        mappedVertex[0].color = color;
        mappedVertex[1].color = color;
        mappedVertex[2].color = color;
        mappedVertex[3].color = color;

        mappedVertex += 4;

        // Two triangles per quad: (0, 1, 2) and (2, 3, 0).
        mappedElement[0] = currentVertexIndex;
        mappedElement[1] = currentVertexIndex + 1;
        mappedElement[2] = currentVertexIndex + 2;
        mappedElement[3] = currentVertexIndex + 2;
        mappedElement[4] = currentVertexIndex + 3;
        mappedElement[5] = currentVertexIndex;

        mappedElement += 6;
        currentVertexIndex += 4;
    }

    // The parameter shadows the member of the same name, hence `this->`.
    this->vertices.clear();
}

void Renderer::ReserveData(int numVertices, int numElements)
{
    currentVertexIndex = vertexBuffer.size();

    // Map vertex buffer
    int oldVertexSize = vertexBuffer.size();
    vertexBuffer.resize(oldVertexSize + numVertices);
    mappedVertex = vertexBuffer.data() + oldVertexSize;

    // Map element buffer
    int oldIndexSize = elementBuffer.size();
    elementBuffer.resize(oldIndexSize + numElements);
    mappedElement = elementBuffer.data() + oldIndexSize;
}


// Benchmark sketch: rebuilds and draws 30000 rectangle outlines every frame.
int main()
{
    // Window creation and OpenGL initialisation are elided here.
    Renderer renderer;
    renderer.Create();

    for (bool quit = false; !quit; )
    {
        // Event handling and clearing the colour buffer are elided here.

        // Start the frame with empty batches.
        renderer.Clear();

        int remaining = 30000;
        while (remaining-- > 0)
        {
            renderer.DrawRect(100.0f, 100.0f, 50.0f, 50.0f, 0xffff0000, 1.5f);
        }

        // NOTE(review): windowW / windowH are not defined in this snippet -
        // presumably the window size from the elided setup code.
        renderer.Render(windowW, windowH);

        // Buffer swap is elided here.
    }
    return 0;
}

Why is it that much slower? How can I make it faster and less memory-consuming?

Shout
  • 338
  • 5
  • 14
  • The whole data handling looks suboptimal. Given that whenever resize is called, the whole vector content might be copied, it doesn't seem a good idea to do that for every vertex. – BDL Aug 03 '18 at 11:37
  • I'm also not so sure that drawing four quads for an unfilled rect is a good idea. (It definitely isn't memory wise). Why not draw the full rect and discard fragments in the open area in the fragment shader? Also, sending just one vec4 containing x, y, width, height to the vertex shader and creating the quad in a geometry shader might be an option. – BDL Aug 03 '18 at 11:44
  • I meant: Manually resizing vectors and then accessing them through raw points is not a good idea. Use push_back directly instead. – BDL Aug 03 '18 at 11:45
  • `The whole data handling looks suboptimal. Given that whenever resize is called, the whole vector content might be copied, it doesn't seem a good idea to do that for every vertex.` I do not exactly understand, could you explain it differently? 2. It is another draw call and another shader. This renderer is not only for unfilled rects. It is also for filled, circles, etc. etc.. – Shout Aug 03 '18 at 11:47
  • So, I've just changed it to push back (I tried it before, because making games with ben made video about debug renderer and he pushed back instead of accessing through raw pointers) and it makes it worse. 1fps sometimes 2, ~2 seconds white screen waiting to load a data. Memory dropped down to 90, but final result... – Shout Aug 03 '18 at 11:58
  • In debug or in release mode? Vectors are terribly slow in debug due to memory checking. Profiling should always be done in Release. (which might also be a reason why yours is slower in debug than imgui. They are afaik using their own vector implementation) – BDL Aug 03 '18 at 12:07
  • `which might also be a reason why yours is slower in debug than imgui. They are afaik using their own vector implementation` - hmm.. that could be the case; `In debug or in release mode? ` - in release mode it is 150-170fps vs 300 fps imgui. Maybe I should try it with its vector implementation. – Shout Aug 03 '18 at 12:16
  • From looking at their source code, it seems that they are drawing non-filled rectangles as line strips. But I could be wrong. – BDL Aug 03 '18 at 12:17
  • Nope. Check `imgui_impl_opengl3.cpp` - file; `void ImGui_ImplOpenGL3_RenderDrawData(ImDrawData* draw_data)`- method. Prim is gl_triangle and there are no any more draw calls. – Shout Aug 03 '18 at 12:18
  • So, I copied and pasted ImGui's vector and ran the app with it. In debug mode it is faster, though not as fast as in the original project, but release mode stays the same when accessing through raw pointers (~150fps) and drops to ~116 fps when pushing back elements. It is still far from its performance (~300). It has to be something else – Shout Aug 03 '18 at 12:46
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/177371/discussion-between-scouteeer-and-bdl). – Shout Aug 03 '18 at 17:23
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/177421/discussion-between-shout-and-bdl). – Shout Aug 04 '18 at 16:26

2 Answers2

0

The biggest bottleneck in that code looks to be that your allocations are never amortized across frames: you are clearing the buffers' capacity instead of reusing it, leading to lots of reallocs/copies (probably log2(n) reallocs/copies if your vector implementation grows by a factor of 2). Try replacing your .clear() call with .resize(0), and maybe make a lazier/rarer call to .clear() when things become unused.

In debug or in release mode? Vectors are terribly slow in debug due to memory checking. Profiling should always be done in Release.

Profiling should be done both in Release and Debug/Unoptimized mode if you intend to ever use and work with your application in Debug/Unoptimized mode. The gross "zero-cost abstraction" lie of modern C++ is that it makes it a pain to work with a debugger because large applications don't run at correct frame-rate in "Debug" mode any more. Ideally you should always run all your applications in Debug mode. Do yourself a productivity favour and ALSO do some profiling/optimization for your worse case.

Good luck with your learning quest! :)

Omar
  • 627
  • 4
  • 6
  • Only destructor calls `.clear()` (so only at the end the application at the moment) and at the end of `CreatePolygon(...)` method. I've just changed `vertices.clear()` to `vertices.resize(0)` in that method. So there are only .resize() things and performance is still the same . `Do yourself a productivity favour and ALSO do some profiling/optimization for your worse case` I'm going to do it right now. – Shout Aug 03 '18 at 15:06
  • @Omar `std::vector::clear` does NOT decrease capacity, which makes it no different from `.resize(0)` in this case. – HolyBlackCat Aug 03 '18 at 15:26
  • The biggest bottleneck after using standard vector was setting position like this: `mappedVertex[0].position = Vector2(...)`. Instead, I should do it like so: `mappedVertex[0].position.x = ; mappedVertex[0].position.y =` and so on... Fps grew up from ~16 to ~32. Thanks for writing back and big help and explanation! – Shout Aug 03 '18 at 15:31
  • Do you know guys how can I mark this post as solved? – Shout Aug 03 '18 at 15:32
  • 2
    @Scouteeer Write your own answer, explaining what did you do, and press the green tick next to it. – HolyBlackCat Aug 03 '18 at 16:27
  • @HolyBlackCat My bad! Haven't been using STL recently so I mixed up some of those guarantee! I thought that was the difference between clear() and resize(0) in STL world tho. Scouteer In that case my comment about reallocating/copying is void, apologies, but maybe double check that your subsequent frames don't do any alloc/copy if you want. – Omar Aug 03 '18 at 19:26
  • I know it's not related to the post, but I want to build GUI applications in c++? Which framework should i use (free and easy to setup)? – rog Jul 07 '21 at 13:48
0

Solution

  1. I do not use std::vector anymore. I use ImVector instead (it may be your own implementation as well),
  2. I set the position by assigning directly to Vector2.x/.y
Shout
  • 338
  • 5
  • 14
  • 1
    It would be interesting to compare your performance in a fully optimized "Release" build. Many of the small optimizations done on ImDrawList are effectively there to avoid overhead in non-optimized builds. – Omar Aug 03 '18 at 19:27
  • @Omar I have not optimized it fully yet, I have got some vector class problems. But in release mode: `Maximum Optimization (Favor Speed) (/O2)`, `Any Suitable (/Ob2)`, `Yes (/Oi)`, `Favor fast code (/Ot)`, `No (/Oy-)`, `No`, `Yes (/GL)` - I have got 200 fps while rendering 30k unfilled rects without antialiasing at the moment. While yours has ~300 on my pc. I have to find this fps killer and remove it from my code. If you want, when it will be done, I can write you back somewhere. – Shout Aug 03 '18 at 22:56