radv: Calculate VRAM tess patch size independently of LDS size.

We recently made some effort to reduce the LDS use of TCS: the lowering no longer uses the same output location mapping when storing TCS outputs to LDS and VRAM. This means that the same patch will use a different amount of LDS and VRAM. Therefore, we need to properly calculate the patch size in VRAM when determining the number of output patches.

Fixes: 0e481a4adc
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28739>
2024-04-15 12:31:01 +02:00 · 2024-04-15 12:31:01 +02:00 · 2d9e38dbe5
parent 8190a65c78
commit 2d9e38dbe5
5 changed files with 47 additions and 8 deletions
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2649,7 +2649,8 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
      cmd_buffer->state.tess_num_patches = get_tcs_num_patches(
         d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
         tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
-         pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level, pdev->info.family);
+         tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
+         pdev->info.gfx_level, pdev->info.family);

      /* Compute the LDS size. */
      cmd_buffer->state.tess_lds_size =
--- a/src/amd/vulkan/radv_pipeline_graphics.c
+++ b/src/amd/vulkan/radv_pipeline_graphics.c
@@ -1425,7 +1425,16 @@ radv_link_tcs(const struct radv_device *device, struct radv_shader_stage *tcs_st
   const uint64_t io_mask = radv_gather_unlinked_io_mask(nir_mask);
   const unsigned num_reserved_outputs = util_last_bit64(io_mask);

+   /* Count the number of per-patch output slots we need to reserve for the TCS and TES.
+    * This is necessary because we need it to determine the patch size in VRAM.
+    */
+   const uint64_t patch_io_mask = radv_gather_unlinked_patch_io_mask(
+      tcs_stage->nir->info.outputs_written & tes_stage->nir->info.inputs_read,
+      tcs_stage->nir->info.patch_outputs_written & tes_stage->nir->info.patch_inputs_read);
+   const unsigned num_reserved_patch_outputs = util_last_bit64(patch_io_mask);
+
   tcs_stage->info.tcs.num_linked_outputs = num_reserved_outputs;
+   tcs_stage->info.tcs.num_linked_patch_outputs = num_reserved_patch_outputs;
   tcs_stage->info.outputs_linked = true;

   tes_stage->info.tes.num_linked_inputs = num_reserved_outputs;
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -672,8 +672,9 @@ calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_ver

 static inline unsigned
 get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
-                    unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs,
-                    unsigned tess_offchip_block_dw_size, enum amd_gfx_level gfx_level, enum radeon_family family)
+                    unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs,
+                    unsigned tcs_num_vram_patch_outputs, unsigned tess_offchip_block_dw_size,
+                    enum amd_gfx_level gfx_level, enum radeon_family family)
 {
   uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
   uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
@@ -681,6 +682,10 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
   uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size;
   uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16;

+   uint32_t vram_output_vertex_size = tcs_num_vram_outputs * 16;
+   uint32_t vram_pervertex_output_patch_size = tcs_num_output_vertices * vram_output_vertex_size;
+   uint32_t vram_output_patch_size = vram_pervertex_output_patch_size + tcs_num_vram_patch_outputs * 16;
+
   /* Ensure that we only need one wave per SIMD so we don't need to check
    * resource usage. Also ensures that the number of tcs in and out
    * vertices per threadgroup are at most 256.
@@ -702,8 +707,8 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
   if (input_patch_size + lds_output_patch_size)
      num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + lds_output_patch_size));
   /* Make sure the output data fits in the offchip buffer */
-   if (lds_output_patch_size)
-      num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / lds_output_patch_size);
+   if (vram_output_patch_size)
+      num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_output_patch_size);
   /* Not necessary for correctness, but improves performance. The
    * specific value is taken from the proprietary driver.
    */
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -471,6 +471,23 @@ radv_gather_unlinked_io_mask(const uint64_t nir_io_mask)
   return radv_io_mask;
 }

+uint64_t
+radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask)
+{
+   uint64_t radv_io_mask = 0;
+   u_foreach_bit64 (semantic, nir_patch_io_mask) {
+      radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(semantic + VARYING_SLOT_PATCH0));
+   }
+
+   /* Tess levels need to be handled separately because they are not part of patch_outputs_written. */
+   if (nir_io_mask & VARYING_BIT_TESS_LEVEL_OUTER)
+      radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_OUTER));
+   if (nir_io_mask & VARYING_BIT_TESS_LEVEL_INNER)
+      radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_INNER));
+
+   return radv_io_mask;
+}
+
 static void
 gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
                      const struct radv_graphics_state_key *gfx_state, const struct radv_shader_stage_key *stage_key,
@@ -538,16 +555,20 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,

   if (!info->inputs_linked)
      info->tcs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read));
-   if (!info->outputs_linked)
+   if (!info->outputs_linked) {
      info->tcs.num_linked_outputs = util_last_bit64(radv_gather_unlinked_io_mask(
         nir->info.outputs_written & ~(VARYING_BIT_TESS_LEVEL_OUTER | VARYING_BIT_TESS_LEVEL_INNER)));
+      info->tcs.num_linked_patch_outputs = util_last_bit64(
+         radv_gather_unlinked_patch_io_mask(nir->info.outputs_written, nir->info.patch_outputs_written));
+   }

   if (gfx_state->ts.patch_control_points) {
      /* Number of tessellation patches per workgroup processed by the current pipeline. */
      info->num_tess_patches = get_tcs_num_patches(
         gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
-         info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
-         pdev->info.gfx_level, pdev->info.family);
+         info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs,
+         info->tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level,
+         pdev->info.family);

      /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
      info->tcs.num_lds_blocks =
--- a/src/amd/vulkan/radv_shader_info.h
+++ b/src/amd/vulkan/radv_shader_info.h
@@ -232,6 +232,7 @@ struct radv_shader_info {
      uint32_t num_lds_blocks;
      uint8_t num_linked_inputs;          /* Number of reserved per-vertex input slots in LDS. */
      uint8_t num_linked_outputs;         /* Number of reserved per-vertex output slots in VRAM. */
+      uint8_t num_linked_patch_outputs;   /* Number of reserved per-patch output slots in VRAM. */
      uint8_t num_lds_per_vertex_outputs; /* Number of reserved per-vertex output slots in LDS. */
      uint8_t num_lds_per_patch_outputs;  /* Number of reserved per-patch output slots in LDS. */
      bool tes_reads_tess_factors : 1;
@@ -267,4 +268,6 @@ enum ac_hw_stage radv_select_hw_stage(const struct radv_shader_info *const info,

 uint64_t radv_gather_unlinked_io_mask(const uint64_t nir_mask);

+uint64_t radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask);
+
 #endif /* RADV_SHADER_INFO_H */