2

I’m working on a Metal, MTKView based app that takes advantage of the A11 TBDR architecture to do deferred shading in a single render pass. I used Apple’s Deferred Lighting sample code as reference, and it works great.

I’d like to try changing the geometry buffer pass to be GPU-driven, using the Indirect Command Buffer feature of Metal 2 on A11 hardware.

I’ve been using Apple’s Encoding Indirect Command Buffers on the GPU sample code as my main point of reference for this. I’m able to run this sample on my iPhone XR (although, probably off-topic, the scrolling is not smooth, it judders).

I’m running into difficulties however with my own code, when I try to move my geometry buffer pass into an indirect command buffer. When I set supportIndirectCommandBuffers to true on the MTLRenderPipelineDescriptor of the Geometry Buffer pipeline, device.makeRenderPipelineState fails with the error

AGXMetalA12 Code=3 "Fragment shader cannot be used with indirect command buffers"

I’ve not been able to find any information in the documentation on this error. I’m wondering, are there certain kinds of fragment operation that are not allowed in indirect pipelines, or some kind of limit to GPU-driven drawing that I've overlooked (the number of color attachments perhaps)?

SharedTypes.h

Header shared by Metal and Swift

#ifndef SharedTypes_h
#define SharedTypes_h

#ifdef __METAL_VERSION__

#define NS_CLOSED_ENUM(_type, _name) enum _name : _type _name; enum _name : _type
#define NSInteger metal::int32_t

#else

#import <Foundation/Foundation.h>

#endif

#include <simd/simd.h>

typedef struct {
    uint32_t meshId;
    matrix_float3x3 normalViewMatrix;
    matrix_float4x4 modelMatrix;
    matrix_float4x4 shadowMVPTransformMatrix;
} InstanceData;

typedef struct {
    vector_float3 cameraPosition;
    float voxelScale;
    float blockScale;
    vector_float3 lightDirection;
    matrix_float4x4 viewMatrix;
    matrix_float4x4 projectionMatrix;
    matrix_float4x4 projectionMatrixInverse;
    matrix_float4x4 shadowViewProjectionMatrix;
} VoxelUniforms;

typedef NS_CLOSED_ENUM(NSInteger, BufferIndex)
{
    BufferIndexInstances  = 0,
    BufferIndexVertices = 1,
    BufferIndexIndices = 2,
    BufferIndexVoxelUniforms = 3,
};

typedef NS_CLOSED_ENUM(NSInteger, RenderTarget)
{
    RenderTargetLighting = 0,
    RenderTargetNormal_shadow = 1,
    RenderTargetVoxelIndex = 2,
    RenderTargetDepth = 3,
};

#endif /* SharedTypes_h */

GBuffer shader

#include <metal_stdlib>
using namespace metal;
#include "../SharedTypes.h"

struct VertexIn {
    packed_half3 position;
    packed_half3 texCoord3D;
    half ambientOcclusion;
    uchar normalIndex;
};

struct VertexInOut {
    float4 position [[ position ]];
    half3 worldPos;
    half3 eyeNormal;
    half3 localPosition;
    half3 localNormal;
    float eyeDepth;
    float3 shadowCoord;
    half3 texCoord3D;
};

vertex VertexInOut gBufferVertex(device InstanceData* instances [[ buffer( BufferIndexInstances ) ]],
                                 device VertexIn* vertices [[ buffer( BufferIndexVertices ) ]],
                                 constant VoxelUniforms &uniforms [[ buffer( BufferIndexVoxelUniforms ) ]],
                                 uint vid [[ vertex_id ]],
                                 ushort iid [[ instance_id ]])
{
    InstanceData instance = instances[iid];
    VertexIn vert = vertices[vid];
    VertexInOut out;
    float4 position = float4(float3(vert.position), 1);
    float4 worldPos = instance.modelMatrix * position;
    float4 eyePosition = uniforms.viewMatrix * worldPos;
    out.position = uniforms.projectionMatrix * eyePosition;
    out.worldPos = half3(worldPos.xyz);
    out.eyeDepth = eyePosition.z;

    half3 normal = normals[vert.normalIndex];
    out.eyeNormal = half3(instance.normalViewMatrix * float3(normal));
    out.shadowCoord = (instance.shadowMVPTransformMatrix * position).xyz;

    out.localPosition = half3(vert.position);
    out.localNormal = normal;
    out.texCoord3D = half3(vert.texCoord3D);
    return out;
}

fragment GBufferData gBufferFragment(VertexInOut in [[ stage_in ]],
                                     constant VoxelUniforms &uniforms [[ buffer( BufferIndexVoxelUniforms ) ]],
                                     texture3d<ushort, access::sample> voxelMap [[ texture(0) ]],
                                     depth2d<float> shadowMap [[ texture(1) ]],
                                     texture3d<half, access::sample> fogOfWarMap [[ texture(2) ]]
                                     ) {
    // voxel index
    half3 center = round(in.texCoord3D);
    uchar voxIndex = voxelMap.read(ushort3(center)).r - 1;

    // ambient occlusion
    half3 neighborPos = center + in.localNormal;
    half3 absNormal = abs(in.localNormal);
    half2 texCoord2D = tc2d(in.localPosition / uniforms.voxelScale, absNormal);
    half ao = getAO(voxelMap, neighborPos, absNormal.yzx, absNormal.zxy, texCoord2D);

    // shadow
    constexpr sampler shadowSampler(coord::normalized,
                                    filter::linear,
                                    mip_filter::none,
                                    address::clamp_to_edge,
                                    compare_func::less);

    float shadow_sample = ambientLightingLevel;
    for (short i = 0; i < shadowSampleCount; i++){
        shadow_sample += shadowMap.sample_compare(shadowSampler, in.shadowCoord.xy + poissonDisk[i] * 0.002, in.shadowCoord.z - 0.0018) * shadowContributionPerSample;
    }
    shadow_sample = min(1.0, shadow_sample);

    //fog-of-war
    half fogOfWarSample = fogOfWarMap.sample(fogOfWarSampler, (float3(in.worldPos) / uniforms.blockScale) + float3(0.5, 0.4, 0.5)).r;
    half notVisible = max(fogOfWarSample, 0.5h);

    // output
    GBufferData out;
    out.normal_shadow = half4(in.eyeNormal, ao * half(shadow_sample) * notVisible);
    out.voxelIndex = voxIndex;
    out.depth = in.eyeDepth;
    return out;
};

Pipeline setup

extension RenderTarget {

    var pixelFormat: MTLPixelFormat {
        switch self {
        case .lighting: return .bgra8Unorm
        case .normal_shadow: return .rgba8Snorm
        case .voxelIndex: return .r8Uint
        case .depth: return .r32Float
        }
    }

    static var allCases: [RenderTarget] = [.lighting, .normal_shadow, .voxelIndex, .depth]
}

public final class GBufferRenderer {
    private let renderPipelineState: MTLRenderPipelineState
    weak var shadowMap: MTLTexture?

    public init(depthPixelFormat: MTLPixelFormat, colorPixelFormat: MTLPixelFormat, sampleCount: Int = 1) throws {
        let library = try LibraryMonad.getLibrary()
        let device = library.device
        let descriptor = MTLRenderPipelineDescriptor()
        descriptor.vertexFunction = library.makeFunction(name: "gBufferVertex")!
        descriptor.fragmentFunction = library.makeFunction(name: "gBufferFragment")!
        descriptor.depthAttachmentPixelFormat = depthPixelFormat
        descriptor.stencilAttachmentPixelFormat = depthPixelFormat
        descriptor.sampleCount = sampleCount
        for target in RenderTarget.allCases {
            descriptor.colorAttachments[target.rawValue].pixelFormat = target.pixelFormat
        }
        // uncomment below to trigger throw
        // descriptor.supportIndirectCommandBuffers = true
        renderPipelineState = try device.makeRenderPipelineState(descriptor: descriptor) // throws "Fragment shader cannot be used with indirect command buffers"
    }

    public convenience init(mtkView: MTKView) throws {
        try self.init(depthPixelFormat: mtkView.depthStencilPixelFormat, colorPixelFormat: mtkView.colorPixelFormat, sampleCount: mtkView.sampleCount)
    }
}

The above works great when triggering draws from the CPU in the usual way, but when setting supportIndirectCommandBuffers in preparation for GPU drawing it throws the error.

I've tried stripping down the fragment shader to just return constant values for the GBuffers, and then makeRenderPipelineState succeeds, but when I add texture sampling back in it begins complaining again. I can't seem to pin down what exactly it doesn't like about the frag shader.

OliverD
  • 1,074
  • 13
  • 19

1 Answers1

1

Looking through the code and through Metal documentation and Metal Shading Language specification, I think I know why you get this error.

If you look through render_command interface that is present in metal_command_buffer header in Metal, you'll find that to pass parameters to indirect render commands, you only have these functions: set_vertex_buffer and set_fragment_buffer, there is no set_vertex_texture or set_vertex_sampler like you have in MTLRenderCommandEncoder.

But, since your pipeline uses shader that in turn uses textures as arguments and you indicate by using supportIndirectCommandBuffers that you would like to use this pipeline in indirect commands, Metal has no choice but to fail pipeline creation.

Instead if you want to pass textures or samplers to indirect render commands, you should use argument buffers, that you will pass to the shader that issues indirect render commands, which in turn will bind them using set_vertex_buffer and set_fragment_buffer for each render_command.

Specification: Metal Shading Language Specification (Section 5.16)

JustSomeGuy
  • 3,677
  • 1
  • 23
  • 31
  • I think you could be right. When I switch from using textures to buffers, I am able to create the renderPipelineState. The next issue though is that Metal compiler crashes when it tries to create the indirect drawing kernel "MTLCompiler: Compilation failed with XPC_ERROR_CONNECTION_INTERRUPTED". I think that might be a topic for another Stack Overflow question tho, marking this as the correct answer. I think it's quite tricky to port an existing pipeline to being GPU driven, probably better to start over from scratch. – OliverD Apr 04 '19 at 07:39
  • It seems that it's a functionality that people rarely use since it's so in-depth, so there may be many bugs still to find. If nothing works, try submitting bug report to Apple directly at https://bugreport.apple.com/ – JustSomeGuy Apr 04 '19 at 07:43
  • One thing I don't get is how indexed drawing is meant to work if you're packing the vertices and indices for all geometries into single buffers. Compared to the CPU version, the GPU command is missing the `indexStart` property: `void draw_indexed_primitives(primitive_type type, uint index_count, device/constant ushort/uint *index_buffer, uint instance_count, uint base_vertex, uint base_instance);` – OliverD Apr 04 '19 at 07:49
  • That's a really good question. I haven't tried, but possibly you could just do pointer arithmetic on `index_buffer`, since it should be allowed. So you'll just pass `my_index_buffer + index_start` for `index_buffer` argument. Hard to tell, I want to try prototype GPU driven pipeline and other stuff in my engine, but it's far from ready for rapid prototyping. – JustSomeGuy Apr 04 '19 at 07:54
  • I managed to fix the issue where MTLCompiler crashed when compiling the kernel. It was because my argument buffer had an array of textures using C array syntax, `texture3d voxelMaps [MaxVoxelMeshCount];`, texture arrays in argument buffers apparently need to use the `array` syntax: `array, MaxVoxelMeshCount> voxelMaps;`. Probably should be a compile-time error rather than a compiler crash though, so I might file a radar for that. – OliverD Apr 04 '19 at 19:36
  • Spoke to soon, forgot I had commented out the line that sets the texture buffer `cmd.set_fragment_buffer(textureData, BufferIndexTextureData);` If I uncomment this, I get the compiler crash. Might open a new SO question for this. I cannot believe that there isn't a way to use textures in GPU-driven indirect drawing, that would be almost useless (but am also now suspicious that Apple's sample code for indirect drawing does not use textures) – OliverD Apr 04 '19 at 20:10
  • Don't forget to report a bug directly to Apple with a reproduce code, or they may never even find out that something is broken. They usually answer pretty quick – JustSomeGuy Apr 05 '19 at 11:48
  • I thought before raising a bug I'd find out whether it is intended behaviour, by asking in the Metal space on the Apple developer forums. Sometimes engineers from the GPU team answer questions: https://forums.developer.apple.com/message/355833 – OliverD Apr 05 '19 at 13:08
  • Updated forum link https://developer.apple.com/forums/thread/115442 – OliverD Jun 24 '20 at 15:47