From 2d9e38dbe511f3b06a5b573978ba04bf4257bf88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?= <timur.kristof@gmail.com>
Date: Mon, 15 Apr 2024 12:31:01 +0200
Subject: [PATCH] radv: Calculate VRAM tess patch size independently of LDS
 size.

We recently made some effort to reduce the LDS use of TCS:
The lowering no longer uses the same output location mapping when
storing TCS outputs to LDS and VRAM. This means that the same
patch will use a different amount of LDS and VRAM.

Therefore, we need to properly calculate the patch size in VRAM
when determining the number of output patches.

Fixes: 0e481a4adcd8006256c27d100a0a0f0c01a94171
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28739>
---
 src/amd/vulkan/radv_cmd_buffer.c        |  3 ++-
 src/amd/vulkan/radv_pipeline_graphics.c |  9 +++++++++
 src/amd/vulkan/radv_shader.h            | 13 ++++++++----
 src/amd/vulkan/radv_shader_info.c       | 27 ++++++++++++++++++++++---
 src/amd/vulkan/radv_shader_info.h       |  3 +++
 5 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 0422ca70f980f..abafd7663d5de 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2649,7 +2649,8 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
       cmd_buffer->state.tess_num_patches = get_tcs_num_patches(
          d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
          tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
-         pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level, pdev->info.family);
+         tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
+         pdev->info.gfx_level, pdev->info.family);
 
       /* Compute the LDS size. */
       cmd_buffer->state.tess_lds_size =
diff --git a/src/amd/vulkan/radv_pipeline_graphics.c b/src/amd/vulkan/radv_pipeline_graphics.c
index cc90508927514..76fbbe60ea4c4 100644
--- a/src/amd/vulkan/radv_pipeline_graphics.c
+++ b/src/amd/vulkan/radv_pipeline_graphics.c
@@ -1425,7 +1425,16 @@ radv_link_tcs(const struct radv_device *device, struct radv_shader_stage *tcs_st
    const uint64_t io_mask = radv_gather_unlinked_io_mask(nir_mask);
    const unsigned num_reserved_outputs = util_last_bit64(io_mask);
 
+   /* Count the number of per-patch output slots we need to reserve for the TCS and TES.
+    * This is necessary because we need it to determine the patch size in VRAM.
+    */
+   const uint64_t patch_io_mask = radv_gather_unlinked_patch_io_mask(
+      tcs_stage->nir->info.outputs_written & tes_stage->nir->info.inputs_read,
+      tcs_stage->nir->info.patch_outputs_written & tes_stage->nir->info.patch_inputs_read);
+   const unsigned num_reserved_patch_outputs = util_last_bit64(patch_io_mask);
+
    tcs_stage->info.tcs.num_linked_outputs = num_reserved_outputs;
+   tcs_stage->info.tcs.num_linked_patch_outputs = num_reserved_patch_outputs;
    tcs_stage->info.outputs_linked = true;
 
    tes_stage->info.tes.num_linked_inputs = num_reserved_outputs;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index d3db1a635ea24..4adad6193899c 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -672,8 +672,9 @@ calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_ver
 
 static inline unsigned
 get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
-                    unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs,
-                    unsigned tess_offchip_block_dw_size, enum amd_gfx_level gfx_level, enum radeon_family family)
+                    unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs,
+                    unsigned tcs_num_vram_patch_outputs, unsigned tess_offchip_block_dw_size,
+                    enum amd_gfx_level gfx_level, enum radeon_family family)
 {
    uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
    uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
@@ -681,6 +682,10 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
    uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size;
    uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16;
 
+   uint32_t vram_output_vertex_size = tcs_num_vram_outputs * 16;
+   uint32_t vram_pervertex_output_patch_size = tcs_num_output_vertices * vram_output_vertex_size;
+   uint32_t vram_output_patch_size = vram_pervertex_output_patch_size + tcs_num_vram_patch_outputs * 16;
+
    /* Ensure that we only need one wave per SIMD so we don't need to check
     * resource usage. Also ensures that the number of tcs in and out
     * vertices per threadgroup are at most 256.
@@ -702,8 +707,8 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
    if (input_patch_size + lds_output_patch_size)
       num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + lds_output_patch_size));
    /* Make sure the output data fits in the offchip buffer */
-   if (lds_output_patch_size)
-      num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / lds_output_patch_size);
+   if (vram_output_patch_size)
+      num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_output_patch_size);
    /* Not necessary for correctness, but improves performance. The
     * specific value is taken from the proprietary driver.
     */
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
index 5808bd07518a4..21739d37b34c7 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -471,6 +471,23 @@ radv_gather_unlinked_io_mask(const uint64_t nir_io_mask)
    return radv_io_mask;
 }
 
+/* Map the NIR per-patch IO masks to a bitmask indexed by radv_map_io_driver_location (tess levels included). */
+uint64_t
+radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask)
+{
+   uint64_t patch_slots = 0;
+   /* Bit N of the patch mask is treated as slot VARYING_SLOT_PATCH0 + N. */
+   u_foreach_bit64 (patch_idx, nir_patch_io_mask) {
+      patch_slots |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_PATCH0 + patch_idx));
+   }
+   /* The tess level bits are looked up in the generic IO mask; the patch mask does not carry them. */
+   if ((nir_io_mask & VARYING_BIT_TESS_LEVEL_OUTER) != 0)
+      patch_slots |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_OUTER));
+   if ((nir_io_mask & VARYING_BIT_TESS_LEVEL_INNER) != 0)
+      patch_slots |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_INNER));
+   return patch_slots;
+}
+
 static void
 gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
                       const struct radv_graphics_state_key *gfx_state, const struct radv_shader_stage_key *stage_key,
@@ -538,16 +555,20 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,
 
    if (!info->inputs_linked)
       info->tcs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read));
-   if (!info->outputs_linked)
+   if (!info->outputs_linked) {
       info->tcs.num_linked_outputs = util_last_bit64(radv_gather_unlinked_io_mask(
          nir->info.outputs_written & ~(VARYING_BIT_TESS_LEVEL_OUTER | VARYING_BIT_TESS_LEVEL_INNER)));
+      info->tcs.num_linked_patch_outputs = util_last_bit64(
+         radv_gather_unlinked_patch_io_mask(nir->info.outputs_written, nir->info.patch_outputs_written));
+   }
 
    if (gfx_state->ts.patch_control_points) {
       /* Number of tessellation patches per workgroup processed by the current pipeline. */
       info->num_tess_patches = get_tcs_num_patches(
          gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
-         info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
-         pdev->info.gfx_level, pdev->info.family);
+         info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs,
+         info->tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level,
+         pdev->info.family);
 
       /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
       info->tcs.num_lds_blocks =
diff --git a/src/amd/vulkan/radv_shader_info.h b/src/amd/vulkan/radv_shader_info.h
index 425b50254d101..db295d0442ac5 100644
--- a/src/amd/vulkan/radv_shader_info.h
+++ b/src/amd/vulkan/radv_shader_info.h
@@ -232,6 +232,7 @@ struct radv_shader_info {
       uint32_t num_lds_blocks;
       uint8_t num_linked_inputs;          /* Number of reserved per-vertex input slots in LDS. */
       uint8_t num_linked_outputs;         /* Number of reserved per-vertex output slots in VRAM. */
+      uint8_t num_linked_patch_outputs;   /* Number of reserved per-patch output slots in VRAM. */
       uint8_t num_lds_per_vertex_outputs; /* Number of reserved per-vertex output slots in LDS. */
       uint8_t num_lds_per_patch_outputs;  /* Number of reserved per-patch output slots in LDS. */
       bool tes_reads_tess_factors : 1;
@@ -267,4 +268,6 @@ enum ac_hw_stage radv_select_hw_stage(const struct radv_shader_info *const info,
 
 uint64_t radv_gather_unlinked_io_mask(const uint64_t nir_mask);
 
+uint64_t radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask);
+
 #endif /* RADV_SHADER_INFO_H */