
I have a compute-heavy OpenGL vertex shader whose performance I'm trying to profile.

Following the conventional wisdom ¹ ² ³, I'm computing frames per second in my GLFW app by waiting over one second and dividing the number of frames by the elapsed time. My FPS counter claims ≈30 FPS, but it's clearly more like 1 FPS. Notice the grass blowing in the breeze behind the screen.


My minimal example, below and in this gist, animates a densely tessellated grid and performs dummy computation in the vertex shader until the issue appears.

Is there a way to measure the FPS or the performance of this shader so that the measurement accurately reflects its real behavior?

// Controls how much (dummy) computation happens in the vertex shader.
const int m = 20000;

#define GL_SILENCE_DEPRECATION
#include <OpenGL/gl3.h>
#define __gl_h_


#include <Eigen/Core>
#include <Eigen/Geometry>
#define GLFW_INCLUDE_GLU
#include <GLFW/glfw3.h>

#include <chrono>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <string>
#include <thread>

std::string vertex_shader = R"(
#version 330 core
uniform mat4 proj;
uniform mat4 model;
uniform float t;
uniform int m;
in vec3 position;
out vec4 position_eye;
void main()
{
  vec4 deformed = 
    vec4(
      position.x,
      position.y,
      sin(t*3.14159)*
      cos(position.x*3.14159)*
      cos(position.y*3.14159)
      ,
      1.);
  for(int j = 0;j<m;j++)
  {
    deformed.z = deformed.z + 0.000001*float(j)/float(m);
  }
  position_eye = proj * model * deformed;
  gl_Position = position_eye;
}
)";
std::string fragment_shader = R"(
#version 330 core
in vec4 position_eye;
out vec3 color;
void main()
{
  vec3 xTangent = dFdx(position_eye.xyz);
  vec3 yTangent = dFdy(position_eye.xyz);
  color = normalize( cross( yTangent, xTangent ) )*0.5 + 0.5;
}
)";

// width, height, shader id, vertex array object
int w=800,h=600;
double highdpi=1;
GLuint prog_id=0;
GLuint VAO;
// Mesh data: RowMajor is important to directly use in OpenGL
Eigen::Matrix< float,Eigen::Dynamic,3,Eigen::RowMajor> V;
Eigen::Matrix<GLuint,Eigen::Dynamic,3,Eigen::RowMajor> F;
int main(int argc, char * argv[])
{
  using namespace std;
  const auto get_seconds = []()
  {
    return 
      std::chrono::duration<double>(
        std::chrono::system_clock::now().time_since_epoch()).count();
  };
  if(!glfwInit())
  {
     cerr<<"Could not initialize glfw"<<endl;
     return EXIT_FAILURE;
  }
  const auto & error = [] (int error, const char* description)
  {
    cerr<<description<<endl;
  };
  glfwSetErrorCallback(error);
  glfwWindowHint(GLFW_SAMPLES, 4);
  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
  glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
  glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
  GLFWwindow* window = glfwCreateWindow(w, h, "WebGL", NULL, NULL);
  if(!window)
  {
    glfwTerminate();
    cerr<<"Could not create glfw window"<<endl;
    return EXIT_FAILURE;
  }

  glfwMakeContextCurrent(window);

  int major, minor, rev;
  major = glfwGetWindowAttrib(window, GLFW_CONTEXT_VERSION_MAJOR);
  minor = glfwGetWindowAttrib(window, GLFW_CONTEXT_VERSION_MINOR);
  rev = glfwGetWindowAttrib(window, GLFW_CONTEXT_REVISION);
  printf("OpenGL version received: %d.%d.%d\n", major, minor, rev);
  printf("Supported OpenGL is %s\n", (const char*)glGetString(GL_VERSION));
  printf("Supported GLSL is %s\n", (const char*)glGetString(GL_SHADING_LANGUAGE_VERSION));

  glfwSetInputMode(window,GLFW_CURSOR,GLFW_CURSOR_NORMAL);
  const auto & reshape = [] (GLFWwindow* window, int w, int h)
  {
    ::w=w,::h=h;
  };
  glfwSetWindowSizeCallback(window,reshape);

  {
    int width, height;
    glfwGetFramebufferSize(window, &width, &height);
    int width_window, height_window;
    glfwGetWindowSize(window, &width_window, &height_window);
    highdpi = double(width)/double(width_window);
    reshape(window,width_window,height_window);
  }


  // Compile each shader
  const auto & compile_shader = [](const GLint type,const char * str) -> GLuint
  {
    GLuint id = glCreateShader(type);
    glShaderSource(id,1,&str,NULL);
    glCompileShader(id);
    return id;
  };
  GLuint vid = compile_shader(GL_VERTEX_SHADER,vertex_shader.c_str());
  GLuint fid = compile_shader(GL_FRAGMENT_SHADER,fragment_shader.c_str());
  // attach shaders and link
  prog_id = glCreateProgram();
  glAttachShader(prog_id,vid);
  glAttachShader(prog_id,fid);
  glLinkProgram(prog_id);
  GLint status;
  glGetProgramiv(prog_id, GL_LINK_STATUS, &status);
  if(status != GL_TRUE)
  {
    cerr<<"Failed to link shader program"<<endl;
  }
  glDeleteShader(vid);
  glDeleteShader(fid);



  // construct a regular grid mesh
  const int nx = 300;
  const int ny = 305;
  V.resize(nx*ny,3);
  for(int i = 0;i<nx;i++)
  {
    for(int j = 0;j<ny;j++)
    {
      const float x = float(i)/(nx-1);
      const float y = float(j)/(ny-1);
      V.row(j*nx+i) << x,y, 0;
    }
  }
  F.resize((nx-1)*(ny-1)*2,3);
  for(int y = 0;y<ny-1;y++)
  {
    for(int x = 0;x<nx-1;x++)
    {
      // index of southwest corner
      const int sw = (x  +nx*(y+0));
      const int se = (x+1+nx*(y+0));
      const int ne = (x+1+nx*(y+1));
      const int nw = (x  +nx*(y+1));
      // Index of first triangle in this square
      const int gf = 2*(x+(nx-1)*y);
      F(gf+0,0) = sw;
      F(gf+0,1) = se;
      F(gf+0,2) = nw;
      F(gf+1,0) = se;
      F(gf+1,1) = ne;
      F(gf+1,2) = nw;
    }
  }


  V.rowwise() -= V.colwise().mean();
  V /= (V.colwise().maxCoeff()-V.colwise().minCoeff()).maxCoeff();
  V /= 1.2;

  // Generate and attach buffers to vertex array
  glGenVertexArrays(1, &VAO);
  GLuint VBO, EBO;
  glGenBuffers(1, &VBO);
  glGenBuffers(1, &EBO);
  glBindVertexArray(VAO);
  glBindBuffer(GL_ARRAY_BUFFER, VBO);
  glBufferData(GL_ARRAY_BUFFER, sizeof(float)*V.size(), V.data(), GL_STATIC_DRAW);
  glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, EBO);
  glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(GLuint)*F.size(), F.data(), GL_STATIC_DRAW);
  glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(GLfloat), (GLvoid*)0);
  glEnableVertexAttribArray(0);
  glBindBuffer(GL_ARRAY_BUFFER, 0); 
  glBindVertexArray(0);

  double t0 = get_seconds();
  const auto draw = [&]()
  {
    double tic = get_seconds();
    // clear screen and set viewport
    glClearColor(0.1,0.1,0.1,0.);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glViewport(0,0,w*highdpi,h*highdpi);
    // Projection and modelview matrices
    Eigen::Matrix4f proj;
    float near = 0.01;
    float far = 100;
    float top = tan(35./360.*M_PI)*near;
    float right = top * (double)::w/(double)::h;
    float left = -right;
    float bottom = -top;
    proj.setConstant(4,4,0.);
    proj(0,0) = (2.0 * near) / (right - left);
    proj(1,1) = (2.0 * near) / (top - bottom);
    proj(0,2) = (right + left) / (right - left);
    proj(1,2) = (top + bottom) / (top - bottom);
    proj(2,2) = -(far + near) / (far - near);
    proj(3,2) = -1.0;
    proj(2,3) = -(2.0 * far * near) / (far - near);
    Eigen::Affine3f model = Eigen::Affine3f::Identity();
    model.translate(Eigen::Vector3f(0,0,-1.5));
    // select program and attach uniforms
    glUseProgram(prog_id);
    GLint proj_loc = glGetUniformLocation(prog_id,"proj");
    glUniformMatrix4fv(proj_loc,1,GL_FALSE,proj.data());
    GLint model_loc = glGetUniformLocation(prog_id,"model");
    glUniformMatrix4fv(model_loc,1,GL_FALSE,model.matrix().data());
    GLint t_loc = glGetUniformLocation(prog_id,"t");
    glUniform1f(t_loc,tic-t0);
    GLint m_loc = glGetUniformLocation(prog_id,"m");
    glUniform1i(m_loc,m);
    // Draw mesh as filled triangles
    glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
    glBindVertexArray(VAO);
    glDrawElements(GL_TRIANGLES, F.size(), GL_UNSIGNED_INT, 0);
    glBindVertexArray(0);
  };

  // Main display routine
  while (!glfwWindowShouldClose(window))
  {
    double tic = get_seconds();
    static size_t count = 0;
    static double t_prev = get_seconds();
    if(tic-t_prev > 1)
    {
      const double fps = double(count)/(tic-t_prev);
      std::stringstream ss;
      ss <<  fps <<" FPS";
      glfwSetWindowTitle(window, ss.str().c_str());
      count = 0;
      t_prev = tic;
    }
    count++;
    draw();

    glfwSwapBuffers(window);
    glfwPollEvents();
  }
  glfwDestroyWindow(window);
  glfwTerminate();
  return EXIT_SUCCESS;
}
  • not sure, but could be the implementation of "get_seconds"? Try using "glfwGetTime" – tuket Apr 30 '22 at 17:14
  • thanks, unfortunately that didn't change anything. – Alec Jacobson Apr 30 '22 at 18:51
  • I do not understand the votes to close. "The question should be updated to include desired behavior [accurate fps], a specific problem or error [conventional answers fail to give accurate fps on this code], and the shortest code necessary to reproduce the problem [code is provided]." – Alec Jacobson Apr 30 '22 at 18:53
  • 1
    Anything change if you drop a `glFinish()` after `glfwSwapBuffers()`? – genpfault Apr 30 '22 at 20:31
  • A related explanation on the topic of [Sync Objects](https://www.khronos.org/opengl/wiki/Sync_Object) was posted over [there](https://stackoverflow.com/a/32958524/5872574). – rel Apr 30 '22 at 20:48
  • 1
    @genpfault, wow, yes! a lot changes. It actually makes this example run (visually) at 30 fps. And if I crank up the dummy computation, the visual FPS drops and so does the computed FPS. Could you explain why this worked (perhaps in an answer so I can accept it?)? – Alec Jacobson Apr 30 '22 at 23:14
  • @genpfault, also, any reason not to _always_ have `glFinish()` after `glfwSwapBuffers()` ? – Alec Jacobson Apr 30 '22 at 23:35
  • 1
    To properly measure shaders You have to forget CPU side time and fps ... instead use this [simple `GL_TIMESTAMP` example](https://stackoverflow.com/a/37490211/2521214) which is time measuring api provided directly by GL for exactly purposes like this ... – Spektre May 02 '22 at 11:52
  • Does this answer your question? [Benchmarking GLSL shaders to compare speed of alternative implementations](https://stackoverflow.com/questions/37482393/benchmarking-glsl-shaders-to-compare-speed-of-alternative-implementations) – Spektre May 02 '22 at 11:54
  • Thanks, Spektre. I'll take a look. At the end of the day, I really do care about the framerate. So, it's useful to have finer grain detail on the speed of the shader, but ideally I'd like to know the (full) time to display a frame. – Alec Jacobson May 02 '22 at 22:02
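
To make the fix from the comments concrete, here is a minimal sketch of the main loop with an explicit CPU/GPU sync, as genpfault suggests above. Only the `glFinish()` call is new; everything else is unchanged from the question, so the CPU-side counter now reflects frames the GPU has actually finished rather than frames merely queued.

  // Main display routine, now measuring completed frames
  while (!glfwWindowShouldClose(window))
  {
    double tic = get_seconds();
    static size_t count = 0;
    static double t_prev = get_seconds();
    if(tic-t_prev > 1)
    {
      const double fps = double(count)/(tic-t_prev);
      std::stringstream ss;
      ss << fps << " FPS";
      glfwSetWindowTitle(window, ss.str().c_str());
      count = 0;
      t_prev = tic;
    }
    count++;
    draw();
    glfwSwapBuffers(window);
    // Block until the GPU has executed everything submitted so far.
    // Without this, the loop only measures how fast commands are queued,
    // not how fast they are executed, hence the ~30 FPS reading while the
    // screen updates at ~1 FPS.
    glFinish();
    glfwPollEvents();
  }

Note that `glFinish()` serializes the CPU and GPU and removes the overlap the driver normally exploits, so it is best treated as a measurement aid rather than something to keep in a release render loop.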

1 Answer


GPU execution is highly parallelised and asynchronous, so timing it the way you would time CPU code is not going to work. Your GPU vendor provides profiling tools you can download, which give far better insight than this kind of simple time measurement.

  • 1
    Thanks for the hint! On my mac, I tried Instruments.app > Metal System Trace then looked at Display (Average Frame Time). Indeed, when the computation is too large and I start to see dropped frames the time is listed ≈700ms (1.4 fps). I wish there was a way to instrument this directly in my app... – Alec Jacobson Apr 30 '22 at 19:54
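
For instrumenting the GPU cost directly in the app, as wished for in the comment above, here is a rough sketch of the timer-query approach Spektre's comment links to. This assumes the created context actually exposes OpenGL 3.3+ / ARB_timer_query (the 3.2 core-profile hint on macOS typically yields a 4.1 context); `draw()` and the render loop are the ones from the question, and `time_query` is just an illustrative name.

  // One-time setup after the context is current: create a query object.
  GLuint time_query = 0;
  glGenQueries(1, &time_query);

  // Inside the render loop: bracket the draw call with a GL_TIME_ELAPSED query.
  glBeginQuery(GL_TIME_ELAPSED, time_query);
  draw();
  glEndQuery(GL_TIME_ELAPSED);
  glfwSwapBuffers(window);

  // Read back the GPU time (nanoseconds) for the bracketed commands.
  // GL_QUERY_RESULT blocks until the result is ready, which is acceptable
  // for profiling; to avoid the stall, poll GL_QUERY_RESULT_AVAILABLE or
  // rotate through a small ring of query objects.
  GLuint64 gpu_ns = 0;
  glGetQueryObjectui64v(time_query, GL_QUERY_RESULT, &gpu_ns);
  printf("GPU time for draw(): %.3f ms\n", double(gpu_ns)*1e-6);

Unlike the `glFinish()` approach, this reports only the GPU-side cost of the bracketed commands, so it isolates the shader workload from swap and compositor overhead.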