0

It seems to me that I have to create a new GPUCommandBuffer every time I want to run the computation with different input values (either uniform or storage).

I tried loading new data into an existing staging buffer (with mapAsync) and running an existing command buffer (which starts with a copyBufferToBuffer). But the kernel only sees the old data, the new buffer never gets copied. Either that, or the computation never gets executed a second time.

If instead I create a new GPUCommandBuffer, identical to the old one, then it re-runs the computation using the new data.

What am I missing?

Here is the code. I'm using passEncoder.endPass() because I'm on Firefox trunk; if you're on Chromium, you may have to replace it with passEncoder.end().

// Acquire the WebGPU adapter and device, failing fast when unavailable.
// alert() alone does not stop a script, so each failure also throws —
// otherwise execution would continue and crash later with confusing errors.
if (!navigator.gpu) {
  alert('WebGPU not supported.')
  throw new Error('WebGPU not supported.')
}

const adapter = await navigator.gpu.requestAdapter()
if (!adapter) {
  alert('Couldn’t request WebGPU adapter.')
  throw new Error('Couldn’t request WebGPU adapter.')
}

const device = await adapter.requestDevice()
if (!device) {
  alert('Couldn’t request WebGPU device.')
  throw new Error('Couldn’t request WebGPU device.')
}

// ------------------------------------------ GPU Kernel ------------------------------------------

// WGSL compute shader: writes b1_input[i] * 10.0 into b2_output[i].
// Binding 1 is the read-only input array, binding 2 the writable output,
// matching the bind group layout created below. The arrayLength guard is
// needed because the dispatch launches 64 invocations (workgroup_size 64)
// while the buffers only hold 10 elements.
// language=WGSL
const code = `
  @group(0) @binding(1) var<storage, read>       b1_input: array<f32>;
  @group(0) @binding(2) var<storage, read_write> b2_output: array<f32>;

  @compute @workgroup_size(64)
  fn main(
    @builtin(global_invocation_id) global_id: vec3<u32>,
  ) {
    if (global_id.x >= arrayLength(&b1_input)) {
      return;
    }
    b2_output[global_id.x] = b1_input[global_id.x] * 10.0;
  }
`

// ---------------------------------------- Create buffers ----------------------------------------

const { MAP_READ, MAP_WRITE, COPY_DST, STORAGE, COPY_SRC } = GPUBufferUsage

// Tiny helper so each buffer declaration below fits on one line.
const makeBuffer = (size, usage) => device.createBuffer({ size, usage })

// Each buffer holds ten 32-bit floats.
const b1_input_size = 10 * 4
const b1_input_gpu = makeBuffer(b1_input_size, STORAGE | COPY_DST)     // GPU-side input
const b1_input_stage = makeBuffer(b1_input_size, MAP_WRITE | COPY_SRC) // CPU → GPU staging

const b2_output_size = 10 * 4
const b2_output_gpu = makeBuffer(b2_output_size, STORAGE | COPY_SRC)     // GPU-side output
const b2_output_stage = makeBuffer(b2_output_size, MAP_READ | COPY_DST)  // GPU → CPU readback

// --------------------------------------- Create pipeline ----------------------------------------

// Explicit layout: binding 1 is the read-only input, binding 2 the writable
// output — mirroring the @binding attributes declared in the WGSL source.
const layoutEntries = [
  { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } },
  { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } },
]
const bindGroupLayout = device.createBindGroupLayout({ entries: layoutEntries })

// Attach the concrete GPU-side buffers to those binding slots.
const bindGroupEntries = [
  { binding: 1, resource: { buffer: b1_input_gpu } },
  { binding: 2, resource: { buffer: b2_output_gpu } },
]
const bindGroup = device.createBindGroup({ layout: bindGroupLayout, entries: bindGroupEntries })

// Compile the WGSL and build the compute pipeline on top of the layout.
const pipelineLayout = device.createPipelineLayout({ bindGroupLayouts: [bindGroupLayout] })
const shaderModule = device.createShaderModule({ code })
const pipeline = device.createComputePipeline({
  layout: pipelineLayout,
  compute: { module: shaderModule, entryPoint: 'main' },
})

// -------------------------- Function to create a new GPUCommandBuffer ---------------------------

const workgroupCount = 1

/**
 * Records a fresh GPUCommandBuffer: upload staging → input buffer, run the
 * compute pass, then copy the output buffer into the readback staging buffer.
 *
 * WebGPU command buffers are one-shot — once submitted to a queue they
 * cannot be submitted again — so one of these must be recorded per run.
 *
 * @returns {GPUCommandBuffer} command buffer ready for device.queue.submit()
 */
function createCommandBuffer() {
  const commandEncoder = device.createCommandEncoder()

  // Upload: CPU-written staging buffer → GPU-side input storage buffer.
  commandEncoder.copyBufferToBuffer(b1_input_stage, 0, b1_input_gpu, 0, b1_input_size)

  const passEncoder = commandEncoder.beginComputePass()
  passEncoder.setPipeline(pipeline)
  passEncoder.setBindGroup(0, bindGroup)
  passEncoder.dispatchWorkgroups(workgroupCount)
  // end() is the standard method name; endPass() is the older name still
  // used by Firefox trunk. Feature-detect so both browsers work unmodified.
  if (typeof passEncoder.end === 'function') {
    passEncoder.end()
  } else {
    passEncoder.endPass()
  }

  // Readback: GPU-side output storage buffer → mappable staging buffer.
  commandEncoder.copyBufferToBuffer(b2_output_gpu, 0, b2_output_stage, 0, b2_output_size)

  return commandEncoder.finish()
}

// ------------------------------------------ Executions ------------------------------------------

document.write('<pre>')

// --- Run 1: upload the first input and submit a freshly recorded command buffer.
let input = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
document.write(`Loading input = ${input}\n`)
// Map the staging buffer for CPU writes, fill it, then unmap so the GPU can
// use it as a copy source.
await b1_input_stage.mapAsync(GPUMapMode.WRITE)
new Float32Array(b1_input_stage.getMappedRange()).set(input)
b1_input_stage.unmap()

document.write('\nExecuting GPUCommandBuffer\n')
let commands = createCommandBuffer()
device.queue.submit([commands])

// mapAsync resolves only after the submitted work completes, so the readback
// below observes the results of the dispatch above.
await b2_output_stage.mapAsync(GPUMapMode.READ)
let output = new Float32Array(b2_output_stage.getMappedRange()).slice()
b2_output_stage.unmap()
document.write(`Output: ${output}\n\n`)

// --- Run 2: upload new input, but re-submit the SAME command buffer.
input = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
document.write(`Loading input = ${input}\n`)
await b1_input_stage.mapAsync(GPUMapMode.WRITE)
new Float32Array(b1_input_stage.getMappedRange()).set(input)
b1_input_stage.unmap()

// NOTE: per the WebGPU spec, a submitted command buffer cannot be used again,
// so this second submit is invalid and the dispatch never re-runs — which is
// why the stale output is observed below.
document.write('\nExecuting the same GPUCommandBuffer\n')
device.queue.submit([commands])

await b2_output_stage.mapAsync(GPUMapMode.READ)
output = new Float32Array(b2_output_stage.getMappedRange()).slice()
b2_output_stage.unmap()
document.write(`Output: ${output} (WRONG?)\n`)

// --- Run 3: same new input, but with a freshly recorded command buffer — works.
document.write('\nExecuting a new GPUCommandBuffer\n')
commands = createCommandBuffer()
device.queue.submit([commands])

await b2_output_stage.mapAsync(GPUMapMode.READ)
output = new Float32Array(b2_output_stage.getMappedRange()).slice()
b2_output_stage.unmap()
document.write(`Output: ${output}\n\n`)

Here is what I get:

Loading input = 1,2,3,4,5,6,7,8,9,10

Executing GPUCommandBuffer
Output: 10,20,30,40,50,60,70,80,90,100

Loading input = 2,4,6,8,10,12,14,16,18,20

Executing the same GPUCommandBuffer
Output: 10,20,30,40,50,60,70,80,90,100 (WRONG?)

Executing a new GPUCommandBuffer
Output: 20,40,60,80,100,120,140,160,180,200

By the way, the problem is not with read-only-storage. I tried with all 3 buffer types and I get the same results.

Tobia
  • 17,856
  • 6
  • 74
  • 93

1 Answer

1

Only a GPURenderBundle lets you record GPU commands once and replay them across submissions, but render bundles work only inside render passes, not compute passes.

If you look at submit(...) in the WebGPU spec, it is explicitly stated that a submitted command buffer cannot be reused:

Schedules the execution of the command buffers by the GPU on this queue.

Submitted command buffers cannot be used again.

Jinlei Li
  • 240
  • 6