It seems to me that I have to create a new GPUCommandBuffer every time I want to run the computation with different input values (either uniform or storage).
I tried loading new data into an existing staging buffer (with mapAsync) and running an existing command buffer (which starts with a copyBufferToBuffer). But the kernel only sees the old data, the new buffer never gets copied. Either that, or the computation never gets executed a second time.
If instead I create a new GPUCommandBuffer, identical to the old one, then it re-runs the computation using the new data.
What am I missing?
Here is the code. I'm using passEncoder.endPass()
because I'm on Firefox trunk, if you're on Chromium you may have to replace it with passEncoder.end()
if (!navigator.gpu) alert('WebGPU not supported.')
const adapter = await navigator.gpu.requestAdapter()
if (!adapter) alert('Couldn’t request WebGPU adapter.')
const device = await adapter.requestDevice()
if (!device) alert('Couldn’t request WebGPU device.')
// ------------------------------------------ GPU Kernel ------------------------------------------
// language=WGSL
const code = `
@group(0) @binding(1) var<storage, read> b1_input: array<f32>;
@group(0) @binding(2) var<storage, read_write> b2_output: array<f32>;
@compute @workgroup_size(64)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
) {
if (global_id.x >= arrayLength(&b1_input)) {
return;
}
b2_output[global_id.x] = b1_input[global_id.x] * 10.0;
}
`
// ---------------------------------------- Create buffers ----------------------------------------
const { MAP_READ, MAP_WRITE, COPY_DST, STORAGE, COPY_SRC } = GPUBufferUsage
const b1_input_size = 4 * 10
const b1_input_gpu = device.createBuffer({ size: b1_input_size, usage: STORAGE | COPY_DST })
const b1_input_stage = device.createBuffer({ size: b1_input_size, usage: MAP_WRITE | COPY_SRC })
const b2_output_size = 4 * 10
const b2_output_gpu = device.createBuffer({ size: b2_output_size, usage: STORAGE | COPY_SRC })
const b2_output_stage = device.createBuffer({ size: b2_output_size, usage: MAP_READ | COPY_DST })
// --------------------------------------- Create pipeline ----------------------------------------
const bindGroupLayout = device.createBindGroupLayout({
entries: [
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } },
],
})
const bindGroup = device.createBindGroup({
layout: bindGroupLayout,
entries: [
{ binding: 1, resource: { buffer: b1_input_gpu } },
{ binding: 2, resource: { buffer: b2_output_gpu } },
],
})
const pipeline = device.createComputePipeline({
layout: device.createPipelineLayout({ bindGroupLayouts: [bindGroupLayout] }),
compute: { module: device.createShaderModule({ code }), entryPoint: 'main' },
})
// -------------------------- Function to create a new GPUCommandBuffer ---------------------------
const workgroupCount = 1
function createCommandBuffer() {
const commandEncoder = device.createCommandEncoder()
commandEncoder.copyBufferToBuffer(b1_input_stage, 0, b1_input_gpu, 0, b1_input_size)
const passEncoder = commandEncoder.beginComputePass()
passEncoder.setPipeline(pipeline)
passEncoder.setBindGroup(0, bindGroup)
passEncoder.dispatchWorkgroups(workgroupCount)
passEncoder.endPass()
commandEncoder.copyBufferToBuffer(b2_output_gpu, 0, b2_output_stage, 0, b2_output_size)
return commandEncoder.finish()
}
// ------------------------------------------ Executions ------------------------------------------
document.write('<pre>')
let input = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
document.write(`Loading input = ${input}\n`)
await b1_input_stage.mapAsync(GPUMapMode.WRITE)
new Float32Array(b1_input_stage.getMappedRange()).set(input)
b1_input_stage.unmap()
document.write('\nExecuting GPUCommandBuffer\n')
let commands = createCommandBuffer()
device.queue.submit([commands])
await b2_output_stage.mapAsync(GPUMapMode.READ)
let output = new Float32Array(b2_output_stage.getMappedRange()).slice()
b2_output_stage.unmap()
document.write(`Output: ${output}\n\n`)
input = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
document.write(`Loading input = ${input}\n`)
await b1_input_stage.mapAsync(GPUMapMode.WRITE)
new Float32Array(b1_input_stage.getMappedRange()).set(input)
b1_input_stage.unmap()
document.write('\nExecuting the same GPUCommandBuffer\n')
device.queue.submit([commands])
await b2_output_stage.mapAsync(GPUMapMode.READ)
output = new Float32Array(b2_output_stage.getMappedRange()).slice()
b2_output_stage.unmap()
document.write(`Output: ${output} (WRONG?)\n`)
document.write('\nExecuting a new GPUCommandBuffer\n')
commands = createCommandBuffer()
device.queue.submit([commands])
await b2_output_stage.mapAsync(GPUMapMode.READ)
output = new Float32Array(b2_output_stage.getMappedRange()).slice()
b2_output_stage.unmap()
document.write(`Output: ${output}\n\n`)
Here is what I get:
Loading input = 1,2,3,4,5,6,7,8,9,10
Executing GPUCommandBuffer
Output: 10,20,30,40,50,60,70,80,90,100
Loading input = 2,4,6,8,10,12,14,16,18,20
Executing the same GPUCommandBuffer
Output: 10,20,30,40,50,60,70,80,90,100 (WRONG?)
Executing a new GPUCommandBuffer
Output: 20,40,60,80,100,120,140,160,180,200
By the way, the problem is not with read-only-storage
. I tried with all 3 buffer types and I get the same results.