1

!!!UPDATE!!! Using the vertex shader to generate quads via DrawInstanced() calls definitely reduced CPU overhead and increased quads drawn per second. But there was much more performance to be found by using a combination of instanced drawing via a vertex shader that generates a point list, and a geometry shader that generates quads based on those points.

Thanks to @Soonts for not only recommending a faster way, but also for reminding me of conditional moves and unrolling loops.

Here is the geometry shader I created for sprites with 2D rotation:

// Per-frame constants for the geometry shader.
cbuffer CB_PROJ {
    matrix camera;  // combined view-projection transform — presumably set CPU-side once per frame; confirm contents
};

/*  Reduced packet size -- 256x256 max atlas segments
     -------------------
FLOAT3  Sprite location                     // 12 bytes
FLOAT   Rotation                            // 16 bytes
FLOAT2  Scale                               // 24 bytes
UINT                                        // 28 bytes
    Fixed8p00  Texture X segment
    Fixed8p00  Texture X total segments
    Fixed8p00  Texture Y segment
    Fixed8p00  Texture Y total segments
.Following vertex data is only processed by the vertex shader.
UINT                                        // 32 bytes
    Fixed3p00  Squadron generation method
    Fixed7p00  Sprite stride
    Fixed8p14  X/Y distance between sprites
*/

// One point per sprite: output of the vertex shader, input to the geometry
// shader, which expands each point into a rotated, scaled quad.
struct VOut {
    float3 position : POSITION;      // sprite centre in world space
    float3 r_s : NORMAL;             // .x = rotation angle (radians — TODO confirm), .yz = half-extents used as corner offsets
    uint   bits : BLENDINDICES;      // packed atlas segment data; one byte per field (see packet layout comment above)
};

// Per-vertex output of the geometry shader / input to the pixel shader.
struct GOut {
    float4 pos : SV_Position;    // clip-space position
    float3 position : POSITION;  // world-space position (kept for lighting in the pixel shader)
    float3 n : NORMAL;           // facing normal; constant (0,0,-1) for these billboarded sprites
    float2 tex : TEXCOORD;       // atlas texture coordinates
    uint   pID : SV_PrimitiveID; // primitive id forwarded from the GS input
};

[maxvertexcount(4)]

// Expands each input point into a 2D-rotated quad (4-vertex triangle strip).
//
// Fixes a copy-paste bug in the original: the Y component of corners 2 and 3
// was computed as  v.y*sin + v.y*cos  instead of  v.x*sin + v.y*cos , so half
// of each quad was rotated incorrectly. Also computes sin/cos once per
// primitive (sincos) instead of twice per corner.
void main(point VOut gin[1], uint pID : SV_PrimitiveID, inout TriangleStream<GOut> triStream) {
    GOut output;

    // Unpack atlas segment data: one byte per field, low byte first
    // (X segment, X total, Y segment, Y total — see packet layout above).
    const uint   bits   = gin[0].bits;
    const uint   ySegs  = (bits >> 24u) & 0xFFu;
    const uint  _yOS    = (bits >> 16u) & 0xFFu;
    const uint   xSegs  = (bits >>  8u) & 0xFFu;
    const uint  _xOS    =  bits         & 0xFFu;

    // Texture-coordinate origin of this segment and per-segment deltas.
    const float  yOS    = 1.0f - float(_yOS) / float(ySegs);
    const float  yOSd   = rcp(float(ySegs));
    const float  xOS    = float(_xOS) / float(xSegs);
    const float  xOSd   = rcp(float(xSegs));

    // The rotation angle is shared by all four corners: evaluate once.
    float s, c;
    sincos(gin[0].r_s.x, s, c);
    const float2 scale = gin[0].r_s.yz;  // half-extents of the quad

    output.pID = pID;
    output.n = float3(0.0f, 0.0f, -1.0f);

    // Emit the four corners in triangle-strip order. Bit 1 of the corner
    // index selects the +X side, bit 0 selects the +Y side — matching the
    // original emission order (-,-), (-,+), (+,-), (+,+).
    [unroll]
    for (uint corner = 0u; corner < 4u; corner++) {
        const float2 v = float2((corner & 2u) ? scale.x : -scale.x,    // Scale
                                (corner & 1u) ? scale.y : -scale.y);
        output.tex = float2((corner & 2u) ? xOS + xOSd : xOS,
                            (corner & 1u) ? yOS - yOSd : yOS);
        output.position = gin[0].position;                             // Translate
        output.position.x += v.x * c - v.y * s;                        // Rotate
        output.position.y += v.x * s + v.y * c;
        output.pos = mul(float4(output.position, 1.0f), camera);       // Transform
        triStream.Append(output);
    }
}

!!!ORIGINAL TEXT!!!

Last time I was coding, I had barely started learning Direct3D9c. Currently I'm hitting about 30K single-texture quads lit with 15 lights at about 450fps. I haven't learned instancing or geometry shading at all yet, and I'm trying to prioritise the order I learn things in for my needs, so I've only taken glances at them.

My first thought was to reduce the amount of vertex data being shunted to the GPU, so I changed the vertex structure to a FLOAT2 (for texture coords) and an UINT (for indexing), relying on 4x float3 constants in the vertex shader to define the corners of the quads.

I figured I could reduce the size of the vertex data further, and reduced each vertex unit to a single UINT containing a 2bit index (to reference the real vertexes of the quad), and 2x 15bit fixed-point numbers (yes, I'm showing my age but fixed-point still has its value) representing offsets into atlas textures.

So far, so good, but I know bugger all about Direct3D11 and HLSL so I've been wondering if there's a faster way.

Here's the current state of my vertex shader:

// Per-draw constants for the vertex shader.
cbuffer CB_PROJ
{
    matrix model;          // world (model) transform of the quad
    matrix modelViewProj;  // combined model-view-projection transform
};

// Vertex-shader output / pixel-shader input.
struct VOut
{
    float3 position : POSITION;  // world-space position (used for lighting)
    float3 n : NORMAL;           // world-space normal
    float2 texcoord : TEXCOORD;  // atlas texture coordinates
    float4 pos : SV_Position;    // clip-space position
};

// The four corners of a unit quad in the XZ plane (y = 0), selected by the
// 2-bit vertex index packed into the input UINT.
static const float3 position[4] = { -0.5f, 0.0f,-0.5f,-0.5f, 0.0f, 0.5f, 0.5f, 0.0f,-0.5f, 0.5f, 0.0f, 0.5f };
    
// Index bitpattern: YYYYYYYYYYYYYYYXXXXXXXXXXXXXXXVV
//
// 00-01 .  uint2b   == Vertex index (0-3)
// 02-16 . fixed1p14 == X offset into atlas texture(s)
// 17-31 . fixed1p14 == Y offset into atlas texture(s)
//
// (Bit ranges corrected to match the code: the mask 0x7FFFu after >>2 covers
//  bits 2-16, and >>17 covers bits 17-31 — 15 bits each, as in the pattern.)
//
// Unpacks the packed vertex, looks up the quad corner, transforms it by the
// model matrix, and derives atlas texture coordinates from the fixed-point
// fields.
VOut main(uint bitField : BLENDINDICES) {
    VOut output;
    
    const uint   i        = bitField & 0x03u;
    const uint   xStep    = (bitField >> 2) & 0x7FFFu;         // 15-bit X field
    const uint   yStep    = (bitField >> 17);                  // 15-bit Y field
    const float  xDelta   = float(xStep) * 0.00006103515625f;  // * 2^-14 (fixed 1.14 -> float)
    const float  yDelta   = float(yStep) * 0.00006103515625f;
    const float2 texCoord = float2(xDelta, yDelta);
    
    output.position = (float3) mul(float4(position[i], 1.0f), model);
    output.n = mul(float3(0.0f, 1.0f, 0.0f), (float3x3) model);  // quad faces +Y in model space
    output.texcoord = texCoord;
    output.pos = mul(float4(output.position, 1.0f), modelViewProj);
    
    return output;
}

My pixel shader for completeness:

Texture2D Texture : register(t0);   // sprite atlas texture

SamplerState Sampler : register(s0);

// One point light, packed into two float4s so the 16-light array fills
// exactly 256 bytes of constant-buffer space.
struct LIGHT {
    float4 lightPos; // .w == range
    float4 lightCol; // .a == flags
};

cbuffer cbLight {
    LIGHT l[16] : register(b0); // 256 bytes
}

// Constant ambient contribution added to every non-discarded texel.
static const float3 ambient = { 0.15f, 0.15f, 0.15f };

// Pixel shader: alpha-tested diffuse shading accumulated over up to 16 lights.
float4 main(float3 position : POSITION, float3 n : NORMAL, float2 TexCoord : TEXCOORD) : SV_Target
{
    const float4 Texel = Texture.Sample(Sampler, TexCoord);

    // Alpha test (threshold ~= 1/sqrt(2)).
    if (Texel.a < 0.707106f) discard; // My source images have their alpha values inverted.

    float3 result = { 0.0f, 0.0f, 0.0f };

    // Accumulate active lights; lightCol.a doubles as an end-of-list
    // sentinel — presumably written CPU-side; confirm the flag convention
    // (comparing a float against the integer literal 0xFFFFFFFF relies on
    // implicit conversion).
    for (uint xx = 0 ; xx < 16 && l[xx].lightCol.a != 0xFFFFFFFF; xx++)
    {
        const float3 lCol    = l[xx].lightCol.rgb;
        const float  range   = l[xx].lightPos.w;
        const float3 vToL    = l[xx].lightPos.xyz - position;  // surface-to-light vector
        const float  distToL = length(vToL);
        
        // Skip lights beyond twice their stated range.
        if (distToL < range * 2.0f)
        {
            // Attenuation: mean of two distance terms (scaled by range and
            // range^2), clamped to 1 so lum is never amplified.
            const float  att = min(1.0f, (distToL / range + distToL / (range * range)) * 0.5f);
            // Lambertian diffuse: N dot normalized light direction.
            const float3 lum = Texel.rgb * saturate(dot(vToL / distToL, n)) * lCol;
            result += lum * (1.0f - att);
        }
    }
    return float4(ambient * Texel.rgb + result, Texel.a);
}

And the rather busy looking C function to generate the vertex data (all non-relevant functions removed):

al16 struct CLASS_PRIMITIVES {
    ID3D11Buffer* pVB = { NULL, NULL }, * pIB = { NULL, NULL };
    const UINT strideV1 = sizeof(VERTEX1);

    // Builds an immutable vertex buffer of packed per-vertex UINTs for a grid
    // of xSegs * ySegs atlas-mapped quads (4 strip-ordered vertices each).
    // Per-vertex packing: bits 0-1 = corner index, bits 2-16 = X atlas offset
    // (fixed 1.14), bits 17-31 = Y atlas offset (fixed 1.14).
    //
    // Changes vs. the original: the per-axis step sizes and the row's Y steps
    // are hoisted out of the inner loop (they do not depend on xx), the
    // _aligned_malloc result is checked, and the staging buffer is freed
    // before ThrowError() so a failed CreateBuffer no longer leaks it.
    void CreateQuadSet1(ui32 xSegs, ui32 ySegs) {
        al16 D3D11_BUFFER_DESC bd = {};
             D3D11_SUBRESOURCE_DATA srd = {};
             ui32 index = 0, totalVerts = xSegs * ySegs * 4;

        if (pVB) return;  // already created
        al16 VERTEX1* vBuf = (VERTEX1*)_aligned_malloc(strideV1 * totalVerts, 16);
        if (!vBuf) ThrowError();  // allocation failure

        // Fixed 1.14 step per segment along each axis; loop-invariant.
        const double xStepSize = 16384.0 / double(xSegs);
        const double yStepSize = 16384.0 / double(ySegs);
        for (ui32 yy = ySegs; yy; yy--) {
            // Y offsets depend only on the row: compute once per row.
            // Conversions are kept separate so the compiler cannot reorder
            // the shift ahead of the float-to-int truncation.
            const ui32 yStep1 = ui32(yStepSize * double(yy)) << 17;
            const ui32 yStep2 = ui32(yStepSize * double(yy - 1)) << 17;
            for (ui32 xx = 0; xx < xSegs; xx++) {
                const ui32 xStep1 = ui32(xStepSize * double(xx)) << 2;
                const ui32 xStep2 = ui32(xStepSize * double(xx + 1)) << 2;
                vBuf[index++].b = 0 + xStep1 + yStep1;
                vBuf[index++].b = 1 + xStep1 + yStep2;
                vBuf[index++].b = 2 + xStep2 + yStep1;
                vBuf[index++].b = 3 + xStep2 + yStep2;
            }
        }
        bd.Usage = D3D11_USAGE_IMMUTABLE;
        bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
        bd.CPUAccessFlags = 0;
        bd.ByteWidth = strideV1 * totalVerts;
        bd.StructureByteStride = strideV1;
        srd.pSysMem = vBuf;
        hr = dev->CreateBuffer(&bd, &srd, &pVB);
        _aligned_free(vBuf);  // GPU owns its copy now; free before any throw
        if (hr != S_OK) ThrowError();
    };

    // Draws one quad from the packed set. 'offset' is a quad index; each quad
    // occupies four packed vertices in the immutable vertex buffer.
    void DrawQuadFromSet1(ui32 offset) {
        UINT byteOffset = offset * UINT(sizeof(VERTEX1)) * 4u;  // quad index -> byte offset
        devcon->IASetVertexBuffers(0, 1, &pVB, &strideV1, &byteOffset);
        devcon->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
        devcon->Draw(4, 0);
    };

    void DestroyQuadSet() {
        if (pVB) pVB->Release();
    };

It's all functioning as it should, but it just seems like I'm resorting to hacks to achieve my goal. Surely there's a faster way? Using DrawIndexed() consistently dropped the frame-rate by 1% so I switched back to non-indexed Draw calls.

Zenefess
  • 13
  • 6
  • Can you narrow this down a bit more? This is a huge amount of code to review. – tadman Oct 25 '22 at 14:05
  • Have you profiled your code to look for obvious hot spots to focus on? – tadman Oct 25 '22 at 14:05
  • The code works fine; I just posted it in case someone is wondering how I achieved sprite rendering with nothing more than a single UINT as vertex data. I'm curious if a geometry shader or instancing would be faster? I had a look and apparently it's too early to wrap my head around, or I'm just going too hard too fast. :shrugs: – Zenefess Oct 25 '22 at 14:12
  • You're asking a lot of things at once here. Do you have an example of an implementation you like that you can unpack and understand better? There's all kinds of open-source 2D graphics libraries that use DirectX under the hood. There's other examples worth studying on [Shadertoy](https://www.shadertoy.com) as well. – tadman Oct 25 '22 at 14:13
  • I've done my own timing routines (old habit, and it's all I know at the moment), and I'm not having any performance issues GPU side. CPU side, well, who couldn't do with more quads per second... – Zenefess Oct 25 '22 at 14:15
  • I'd suggest getting a lot more familiar with the various profiling tools out there as they'll often zero in really quickly on hot spots in your code. There's no magic answer here, this stuff is intrinsically really hard and complicated. – tadman Oct 25 '22 at 14:16
  • You do have a loop inside a loop in your C code, which I'd suspect is where a lot of the pain comes from. This isn't the easiest code to follow, either. Why is `tmp` repeatedly assigned but never used? Why do you repeatedly convert to `double` the same values, over and over? – tadman Oct 25 '22 at 14:17
  • Oh, this is for 3D. I'm implementing sprites to performance over visuals, as I'm working on adaptive swarm AI and I need something to represent each unit's location in 3-space without ruining the frame-rate. I wasn't looking to use someone else's intermediate API; I prefer to learn the 'under the hood' stuff. – Zenefess Oct 25 '22 at 14:18
  • I mean I get that, but there's a lot of repetition here, which is usually a sign that something needs considerably more thinking and optimization. – tadman Oct 25 '22 at 14:19
  • Also there's stuff like `vBuf = NULL; _aligned_free(vBuf);` which is like lighting a molotov and throwing it into the engine bay. Do you want to leak memory? There's a lot of weird things going on here that I'm sure you could resolve by maybe taking some time away from this code, and coming back with fresh eyes. – tadman Oct 25 '22 at 14:19
  • Oh, my bad; I left some debug code in there for checking during breakpoints (I'll remove those now). The loop inside a loop is only for generating the "vertex" data that is sent to the GPU; it's irrelevant to the actual draw loop. The heavy casting is ensuring the compiler converts the integers separately, as the compiler was executing the bitshift out-of-order and returning zero. There's very little code in the DrawQuadFromSet1() function that draws a quad to the backbuffer. – Zenefess Oct 25 '22 at 14:24
  • I mean I get that casting might be necessary, but a good habit to get into is to convert it once at the start of the loop, and use those converted values as much as possible, like `xSegs_d` or whatever instead. – tadman Oct 25 '22 at 14:30
  • The vertex shader is down to approx. 22 instructions, which I'm pretty happy with, and reducing vertex data down to 32bits per vertex is as far as the GPU will allow, so there's not really anything more than can be done in the vertex shader. I am wondering if anyone has experience with using geometry shaders to auto-generate quads. I uploaded my vertex shader as a trade; posting the C code was just to show the packing of the data before it's sent to the GPU. – Zenefess Oct 25 '22 at 14:33
  • I get what you're trying to do, but I think this is off-topic on Stack Overflow. The good news is there's a lot of shader-focused groups that can really dig into this and can take you to places you've only dreamed of performance-wise, but you'll need to roll this over to a place like that. Shader code is a bit of a mysterious art, but there are some exceptional talents out there that can and will help if you know where to look. – tadman Oct 25 '22 at 14:35
  • 1
    I agree with your recommendation for such practice, and I do it when it's appropriate for what I'm doing, but minimising CPU cycles and memory accesses are always high priorities when I code. Having an assembler's brain is probably why I do that automatically. But I digress; the C code isn't important (as it only generates data once), and it was made for accuracy above all else. I'm interested if anyone (here & elsewhere) has a different way of implementing the shaders. – Zenefess Oct 25 '22 at 14:43
  • Thanks for the recommendation, tadman. I'm checking a few places for different ideas, and when it's all coded & completed, I'm going to share the source code... I mean, if anyone is masochistic enough to want to wade through some ASM code. – Zenefess Oct 25 '22 at 14:45
  • 1
    I should probably reword the "question" I put in the title of this post, yeah? You've just made me realise I didn't state this is about HLSL... :slaps forehead: – Zenefess Oct 25 '22 at 14:47
  • @tadman Is there some way I can buy you a coffee or something like that? Just want to show my appreciation for your assistance. – Zenefess Oct 25 '22 at 22:29
  • No worries. Just pass it on, share what you've learned! – tadman Oct 26 '22 at 14:34
  • Nice idea about the geometry shader, I’m surprised it’s faster than instancing in your use case. Anyway, try this version of the GS https://gist.github.com/Const-me/b6930103b0193b2694a3375a0760a60e I’ve refactored your HLSL for readability, and applied some math optimizations, might become slightly faster. However I can’t test in isolation, could be bugs there. – Soonts Oct 30 '22 at 19:27
  • Also, if you like assembly, download cmd_Decompiler-1.3.16.zip from there https://github.com/bo3b/3Dmigoto/releases Then run `cmd_decompiler -d OrientedQuadsGS.cso` to disassemble a compiled shader. Note that asm is for the Microsoft-defined byte code, not hardware instructions. There’s another compiler downstream, JIT in the user-mode half of your GPU driver, which produces proprietary hardware instructions from DXBC. Still, even that intermediate disassembly can be useful to find out what HLSL compiler is doing to your source code, and the ISA is documented by MS. – Soonts Oct 30 '22 at 20:02
  • @Soonts I'm still using instancing, but instead of relying on the vertex shader to create vertexes, the V.S. is used to define duplication points (via DrawInstanced) which the geometry shader builds from. After spending 2 whole days researching geometry shaders, I have a much better understanding of the shader pipeline and how to use it efficiently. Back in 2008 geometry shading was quite primitive and limited, but now it's bloody excellent stuff! – Zenefess Oct 30 '22 at 22:23
  • @Soonts Oh, I've been using Visual Studio's built-in graphics debugger. Has definitely saved me a lot of headaches, and being able to see the compiled ASM is great. Was nice to see I am successfully learning how the compiler interprets high-level code. I'm glad compilers aren't as... dumb... as they used to be. – Zenefess Oct 30 '22 at 22:27
  • @Soonts Just looked at your rewrite of my G.S.. I can see some changes that are "compiler friendly" (eg. const uint4 bytes = ( bits >> uint4( 0, 8, 16, 24 ) ) & 0xFFu; ); I'll be sure to use those. Things like sincos() I didn't write myself because the (release build) ASM code interprets my use of sin & cos as being appropriate for sincos() code generation. My current shaders are resulting in >600mil quads per second, via a single CPU thread; I am very pleased with that kind of efficiency. Thanks again, mate! – Zenefess Oct 30 '22 at 22:36

2 Answers2

2

reducing vertex data down to 32bits per vertex is as far as the GPU will allow

You seem to think that vertex buffer sizes are what's holding you back. Make no mistake here, they are not. You have many gigs of VRAM to work with, use them if it will make your code faster. Specifically, anything you're unpacking in your shaders that could otherwise be stored explicitly in your vertex buffer should probably be stored in your vertex buffer.

I am wondering if anyone has experience with using geometry shaders to auto-generate quads

I'll stop you right there, geometry shaders are very inefficient in most driver implementations, even today. They just aren't used that much so nobody bothered to optimize them.

One quick thing that jumps at me is that you're allocating and freeing your system-side vertex array every frame. Building it is fine, but cache the array, C memory allocation is about as slow as anything is going to get. A quick profiling should have shown you that.

Your next biggest problem is that you have a lot of branching in your pixel shader. Use standard functions (like clamp or mix) or blending to let the math cancel out instead of checking for ranges or fully transparent values. Branching will absolutely kill performance.

And lastly, make sure you have the correct hints and usage on your buffers. You don't show them, but they should be set to whatever the equivalent of GL_STREAM_DRAW is, and you need to ensure you don't corrupt the in-flight parts of your vertex buffer. Future frames will render at the same time as the current one as long as you don't invalidate their data by overwriting their vertex buffer, so instead use a round-robin scheme to allow as many vertices as possible to survive (again, use memory for performance). Personally I allocate a very large vertex buffer (5x the data a frame needs) and write it sequentially until I reach the end, at which point I orphan the whole thing and re-allocate it and start from the beginning again.

Blindy
  • 65,249
  • 10
  • 91
  • 131
  • Thanks for the info on geometry shading. I'll give it a miss. The vertex buffer is only created once before the main rendering loop begins. Inside the rendering loop, it's basically: ClearRenderTargetView(), ClearDepthStencilView(), then the inner transform loop (Set matrix transforms, update their constant buffer, IASetVertexBuffers(), IASetPrimitiveTopology, Draw). ...Unless one of those functions is shunting the vertex buffer to the GPU every frame? It's been a busy 2 weeks of learning... – Zenefess Oct 25 '22 at 14:59
  • Ah, you didn't show that, you only showed the create and draw functions independently so I assumed you were streaming the vertices. Never mind on that one then! – Blindy Oct 25 '22 at 15:01
  • No problem at all; my incompetence at asking questions is for me to stress about. :P I was restreaming the buffer in the beginning of my learning, but after I learned about making buffers immutable I switched to dumping data on the GPU once. Oh, as for me reducing the vertex data size; the reason was not to do with amount and/or speed of VRAM, but about minimising the data I sent over the PCIe bus. Bitshifting inside the vertex shader eliminated some pesky divides (amongst other things). I was happily surprised to have reduced the compiled shader code down to 22 instructions. – Zenefess Oct 25 '22 at 15:16
  • The problem is that those 22 instructions are very bad (TM). You need to get rid of those branches (the for loop condition, the nested if, the if at the beginning). And my point was that you don't need to minimize the PCIe bus transfer in this way, 50 bytes more or less won't make a difference. In fact, you mention indices -- I'm willing to bet money that if you removed them and did a straight draw array (or whatever the non-index draw for D3D is) you'll get a performance boost. – Blindy Oct 25 '22 at 15:43
  • The 22 instruction compile is for the vertex shader which doesn't use branching. I'll rewrite the pixel shader to eliminate the jumps and compare them, although my question is about CPU overhead; 30K lit-by-16-lights quads at about 450fps and the CPU thread is maxed-out but the GPU still has plenty of power left over. If the entire 30K quads are close enough to all 16 active lights, the GPU maxes-out and the frame-rate drops to about 78fps, so I know it's far from fast. I've only been learning shaders for about a week out of the 2 that I've been learning D3D11... – Zenefess Oct 25 '22 at 20:59
  • As for my mentioning indices: I'm not using indexed draw calls; I'm using the first 2 bits of a UINT to act as an index into the FLOAT3 constant "position[4]". I was up way too late last night, but I'll reprofile my previous vertex shader to compare the numbers again. The number of quad references created by CreateQuadSet1() has an upper limit of 16K x 16K quads (it's for generating texture offsets into atlas textures), but a more realistic 256 x 256 quads would be 1MB of vertex data. I know that still isn't much (my brain is still paranoid about code like it's still 14 years ago). – Zenefess Oct 25 '22 at 20:59
  • Anyway! Thank you for the tips. I'll let you know what proves to be faster. – Zenefess Oct 25 '22 at 20:59
  • Okay, I just tested my "packed UINT" vertex shader against my older one that uses a "FLOAT2, UINT" structure (and I removed the jumps from my pixel shader to keep GPU usage low). Well, CPU-side there was no difference in performance; number of Present() calls per second are the same for both. GPU-side, the "packed UINT" shader has better performance... by a negligible amount (for 30K quads, at least); it uses 2% less GPU-time per frame. I know; it really is bugger all improvement, but it's coded now and any performance gain is welcome. I need to work on my CPU witchcraft some more... – Zenefess Oct 25 '22 at 23:49
0

I think your code is CPU bound. While your approach has very small vertices, you have non-trivial API overhead.

A better approach is rendering all quads with a single draw call. I would probably use instancing for that.

Assuming you want arbitrary per-quad size, position, and orientation in 3D space, here’s one possible approach. Untested.

Vertex buffer elements:

// Per-instance vertex-buffer element: one entry per quad, consumed by the
// instanced vertex shader below.
struct sInstanceData
{
    // Center of the quad in 3D space
    XMFLOAT3 center;
    // XY coordinates of the sprite in the atlas
    uint16_t spriteX, spriteY;
    // Local XY vectors of the quad in 3D space
    // length of the vectors = half width/height of the quad
    XMFLOAT3 plusX, plusY;
};

Input layout:

// Input layout matching sInstanceData: every element advances per instance
// (D3D11_INPUT_PER_INSTANCE_DATA, step rate 0 = once per instance), with
// byte offsets derived automatically via D3D11_APPEND_ALIGNED_ELEMENT.
D3D11_INPUT_ELEMENT_DESC desc[ 4 ];
desc[ 0 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadCenter", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 1 ] = D3D11_INPUT_ELEMENT_DESC{ "SpriteIndex", 0, DXGI_FORMAT_R16G16_UINT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 2 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusX", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 3 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusY", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };

Vertex shader:

// Constants shared by every instance in the draw call.
cbuffer Constants
{
    matrix viewProj;       // combined view-projection transform
    // Pass [ 1.0 / xSegs, 1.0 / ySegs ] in that field
    float2 texcoordMul;
};

// Vertex-shader output / pixel-shader input.
struct VOut
{
    float3 position : POSITION;  // world-space position
    float3 n : NORMAL;           // world-space quad normal
    float2 texcoord : TEXCOORD;  // atlas texture coordinates
    float4 pos : SV_Position;    // clip-space position
};

// Instanced vertex shader: expands one sInstanceData record into a quad,
// selecting the corner from SV_VertexID (4-vertex triangle strip).
VOut main( uint index: SV_VertexID,
    float3 center : QuadCenter, uint2 texcoords : SpriteIndex,
    float3 plusX : QuadPlusX, float3 plusY : QuadPlusY )
{
    // Corner selection: bit 0 of the vertex id picks the +X side,
    // bit 1 picks the +Y side.
    const bool xSide = ( index & 1 ) != 0;
    const bool ySide = ( index & 2 ) != 0;

    // No branches are generated in release builds;
    // only conditional moves are there.
    float3 corner = center;
    corner += xSide ? plusX : -plusX;
    corner += ySide ? plusY : -plusY;

    int2 cell = ( int2 )texcoords;
    cell.x += xSide ? 1 : 0;
    cell.y += ySide ? 1 : 0;

    VOut o;
    o.position = corner;
    o.n = normalize( cross( plusX, plusY ) );         // quad facing direction
    o.texcoord = ( ( float2 )cell ) * texcoordMul;    // atlas cell -> UV
    o.pos = mul( float4( corner, 1.0f ), viewProj );
    return o;
}

Rendering:

// Render every quad in a single call: 4 strip vertices per instance,
// countQuads instances, instance data streamed from slot 0.
UINT stride = sizeof( sInstanceData );
UINT off = 0;
context->IASetVertexBuffers( 0, 1, &vb, &stride, &off );
context->IASetPrimitiveTopology( D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP );
context->DrawInstanced( 4, countQuads, 0, 0 );
Soonts
  • 20,079
  • 9
  • 57
  • 130
  • Thanks for that, mate. Yeah, I'm thinking instancing is my next step. As fast as my vertex shader is, it doesn't mean much if I can't get the draw calls done any faster. Right now I'm learning deferred command lists (and I should be in bed), but I'll definitely focus on instancing tomorrow. Feel free to steal my vertex shader if it interests you. – Zenefess Oct 26 '22 at 14:32