1

I've been trying to run the compute shader - prefix sum demo provided at:

https://github.com/openglsuperbible/sb7code/blob/master/src/prefixsum/prefixsum.cpp

I used the exact code:

#define NUM_ELEMENTS 2048

float random_float()
{
    static unsigned int seed = 0x13371337;

    float res;
    unsigned int tmp;

    seed *= 16807;

    tmp = seed ^ (seed >> 4) ^ (seed << 15);

    *((unsigned int *)&res) = (tmp >> 9) | 0x3F800000;

    return (res - 1.0f);
}

static int PrefixSum(int programHandle)
{
    GLuint  data_buffer[2];

    float input_data[NUM_ELEMENTS];
    float output_data[NUM_ELEMENTS];

    glGenBuffers(2, data_buffer);

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, data_buffer[0]);
    glBufferData(GL_SHADER_STORAGE_BUFFER, NUM_ELEMENTS * sizeof(float), NULL, GL_DYNAMIC_DRAW);

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, data_buffer[1]);
    glBufferData(GL_SHADER_STORAGE_BUFFER, NUM_ELEMENTS * sizeof(float), NULL, GL_DYNAMIC_COPY);

    int i;

    for (i = 0; i < NUM_ELEMENTS; i++)
    {
        input_data[i] = random_float();
    }

    glShaderStorageBlockBinding(programHandle, 0, 0);
    glShaderStorageBlockBinding(programHandle, 1, 1);

    float * ptr;

    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, data_buffer[0], 0, sizeof(float) * NUM_ELEMENTS);
    glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(float) * NUM_ELEMENTS, input_data);

    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, data_buffer[1], 0, sizeof(float) * NUM_ELEMENTS);

    glUseProgram(programHandle);
    glDispatchCompute(1, 1, 1);

    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    glFinish();

    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, data_buffer[1], 0, sizeof(float) * NUM_ELEMENTS);
    ptr = (float *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, sizeof(float) * NUM_ELEMENTS, GL_MAP_READ_BIT);

    char buffer[1024];
    sprintf(buffer, "SUM: %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f "
    "%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f %2.2f",
    ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7],
    ptr[8], ptr[9], ptr[10], ptr[11], ptr[12], ptr[13], ptr[14], ptr[15]);

    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
} 

And this is the shader:

#version 430 core

layout (local_size_x = 1024) in;

layout (binding = 0) coherent readonly buffer block1
{
    float input_data[gl_WorkGroupSize.x];
};

layout (binding = 1) coherent writeonly buffer block2
{
    float output_data[gl_WorkGroupSize.x];
};

shared float shared_data[gl_WorkGroupSize.x * 2];

void main(void)
{
    uint id = gl_LocalInvocationID.x;
    uint rd_id;
    uint wr_id;
    uint mask;

    const uint steps = uint(log2(gl_WorkGroupSize.x)) + 1;
    uint step = 0;

    shared_data[id * 2] = input_data[id * 2];
    shared_data[id * 2 + 1] = input_data[id * 2 + 1];

    barrier();

    for (step = 0; step < steps; step++)
    {
        mask = (1 << step) - 1;
        rd_id = ((id >> step) << (step + 1)) + mask;
        wr_id = rd_id + 1 + (id & mask);

        shared_data[wr_id] += shared_data[rd_id];

        barrier();
    }

    output_data[id * 2] = shared_data[id * 2];
    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
}

The problem is that the output is written in 1 out of 4 locations:

SUM: 0.70 0.00 0.00 0.00 1.69 0.00 0.00 0.00 1.81 0.00 0.00 0.00 2.59 0.00 0.00 0.00

This is the input:

    [0] 0.700959682 float
    [1] 0.837353945 float
    [2] 0.403481007 float
    [3] 0.856583834 float
    [4] 0.993326187 float
    [5] 0.727316380 float
    [6] 0.768217087 float
    [7] 0.0675410032    float
    [8] 0.112720609 float
    [9] 0.703838706 float
    [10]    0.365846157 float
    [11]    0.504367113 float
    [12]    0.778576016 float
    [13]    0.217134356 float
    [14]    0.944752693 float
    [15]    0.575236082 float
    [16]    0.795839429 float
    [17]    0.707037449 float
    [18]    0.181974053 float
    [19]    0.745973587 float
    [20]    0.281350732 float
markwalberg
  • 311
  • 2
  • 10
  • FYI: I don't believe you are using SSBOs in your compute shader in such a way that requires `coherent`. That would primarily be for cross-talk among work items though buffers, and all your cross-talk is happening through shared memory. That's not your problem, but it would probably improve performance. – Nicol Bolas Apr 19 '16 at 17:31

2 Answers2

1
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

The memory barrier specifies how you intend to access the object after the write, not how you wrote to it. You're going to read from the object by mapping it for reading, so you should say that. Specifically, you should use GL_BUFFER_UPDATE_BARRIER_BIT.

Also:

glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, data_buffer[1], 0, sizeof(float) * NUM_ELEMENTS);

That should just be glBindBuffer(GL_SHADER_STORAGE_BUFFER). You're binding it to map it, not to use in a storage operation.

Nicol Bolas
  • 449,505
  • 63
  • 781
  • 982
1

Solved: specifying the packing standard for the buffer solved the problem:

layout (std430, binding = 1) coherent writeonly buffer block2
{
    float output_data[gl_WorkGroupSize.x];
};
markwalberg
  • 311
  • 2
  • 10