From 2d9e38dbe511f3b06a5b573978ba04bf4257bf88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Mon, 15 Apr 2024 12:31:01 +0200 Subject: [PATCH] radv: Calculate VRAM tess patch size independently of LDS size. We recently made some effort to reduce the LDS use of TCS: The lowering no longer uses the same output location mapping when storing TCS outputs to LDS and VRAM. This means that the same patch will use a different amount of LDS and VRAM. Therefore, we need to properly calculate the patch size in VRAM when determining the number of output patches. Fixes: 0e481a4adcd8006256c27d100a0a0f0c01a94171 Part-of: --- src/amd/vulkan/radv_cmd_buffer.c | 3 ++- src/amd/vulkan/radv_pipeline_graphics.c | 9 +++++++++ src/amd/vulkan/radv_shader.h | 13 ++++++++---- src/amd/vulkan/radv_shader_info.c | 27 ++++++++++++++++++++++--- src/amd/vulkan/radv_shader_info.h | 3 +++ 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 0422ca70f980f..abafd7663d5de 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2649,7 +2649,8 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.tess_num_patches = get_tcs_num_patches( d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs, tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs, - pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level, pdev->info.family); + tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, + pdev->info.gfx_level, pdev->info.family); /* Compute the LDS size. */ cmd_buffer->state.tess_lds_size = diff --git a/src/amd/vulkan/radv_pipeline_graphics.c b/src/amd/vulkan/radv_pipeline_graphics.c index cc90508927514..76fbbe60ea4c4 100644 --- a/src/amd/vulkan/radv_pipeline_graphics.c +++ b/src/amd/vulkan/radv_pipeline_graphics.c @@ -1425,7 +1425,16 @@ radv_link_tcs(const struct radv_device *device, struct radv_shader_stage *tcs_st const uint64_t io_mask = radv_gather_unlinked_io_mask(nir_mask); const unsigned num_reserved_outputs = util_last_bit64(io_mask); + /* Count the number of per-patch output slots we need to reserve for the TCS and TES. + * This is necessary because we need it to determine the patch size in VRAM. + */ + const uint64_t patch_io_mask = radv_gather_unlinked_patch_io_mask( + tcs_stage->nir->info.outputs_written & tes_stage->nir->info.inputs_read, + tcs_stage->nir->info.patch_outputs_written & tes_stage->nir->info.patch_inputs_read); + const unsigned num_reserved_patch_outputs = util_last_bit64(patch_io_mask); + tcs_stage->info.tcs.num_linked_outputs = num_reserved_outputs; + tcs_stage->info.tcs.num_linked_patch_outputs = num_reserved_patch_outputs; tcs_stage->info.outputs_linked = true; tes_stage->info.tes.num_linked_inputs = num_reserved_outputs; diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index d3db1a635ea24..4adad6193899c 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -672,8 +672,9 @@ calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_ver static inline unsigned get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, - unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, - unsigned tess_offchip_block_dw_size, enum amd_gfx_level gfx_level, enum radeon_family family) + unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs, + unsigned tcs_num_vram_patch_outputs, unsigned tess_offchip_block_dw_size, + enum amd_gfx_level gfx_level, enum radeon_family family) { uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs); uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size; @@ -681,6 +682,10 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size; uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16; + uint32_t vram_output_vertex_size = tcs_num_vram_outputs * 16; + uint32_t vram_pervertex_output_patch_size = tcs_num_output_vertices * vram_output_vertex_size; + uint32_t vram_output_patch_size = vram_pervertex_output_patch_size + tcs_num_vram_patch_outputs * 16; + /* Ensure that we only need one wave per SIMD so we don't need to check * resource usage. Also ensures that the number of tcs in and out * vertices per threadgroup are at most 256. @@ -702,8 +707,8 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver if (input_patch_size + lds_output_patch_size) num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + lds_output_patch_size)); /* Make sure the output data fits in the offchip buffer */ - if (lds_output_patch_size) - num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / lds_output_patch_size); + if (vram_output_patch_size) + num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_output_patch_size); /* Not necessary for correctness, but improves performance. The * specific value is taken from the proprietary driver. */ diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 5808bd07518a4..21739d37b34c7 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -471,6 +471,23 @@ radv_gather_unlinked_io_mask(const uint64_t nir_io_mask) return radv_io_mask; } +uint64_t +radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask) +{ + uint64_t radv_io_mask = 0; + u_foreach_bit64 (semantic, nir_patch_io_mask) { + radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(semantic + VARYING_SLOT_PATCH0)); + } + + /* Tess levels need to be handled separately because they are not part of patch_outputs_written. */ + if (nir_io_mask & VARYING_BIT_TESS_LEVEL_OUTER) + radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_OUTER)); + if (nir_io_mask & VARYING_BIT_TESS_LEVEL_INNER) + radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_INNER)); + + return radv_io_mask; +} + static void gather_shader_info_vs(struct radv_device *device, const nir_shader *nir, const struct radv_graphics_state_key *gfx_state, const struct radv_shader_stage_key *stage_key, @@ -538,16 +555,20 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir, if (!info->inputs_linked) info->tcs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read)); - if (!info->outputs_linked) + if (!info->outputs_linked) { info->tcs.num_linked_outputs = util_last_bit64(radv_gather_unlinked_io_mask( nir->info.outputs_written & ~(VARYING_BIT_TESS_LEVEL_OUTER | VARYING_BIT_TESS_LEVEL_INNER))); + info->tcs.num_linked_patch_outputs = util_last_bit64( + radv_gather_unlinked_patch_io_mask(nir->info.outputs_written, nir->info.patch_outputs_written)); + } if (gfx_state->ts.patch_control_points) { /* Number of tessellation patches per workgroup processed by the current pipeline. */ info->num_tess_patches = get_tcs_num_patches( gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, - info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, pdev->hs.tess_offchip_block_dw_size, - pdev->info.gfx_level, pdev->info.family); + info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs, + info->tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level, + pdev->info.family); /* LDS size used by VS+TCS for storing TCS inputs and outputs. */ info->tcs.num_lds_blocks = diff --git a/src/amd/vulkan/radv_shader_info.h b/src/amd/vulkan/radv_shader_info.h index 425b50254d101..db295d0442ac5 100644 --- a/src/amd/vulkan/radv_shader_info.h +++ b/src/amd/vulkan/radv_shader_info.h @@ -232,6 +232,7 @@ struct radv_shader_info { uint32_t num_lds_blocks; uint8_t num_linked_inputs; /* Number of reserved per-vertex input slots in LDS. */ uint8_t num_linked_outputs; /* Number of reserved per-vertex output slots in VRAM. */ + uint8_t num_linked_patch_outputs; /* Number of reserved per-patch output slots in VRAM. */ uint8_t num_lds_per_vertex_outputs; /* Number of reserved per-vertex output slots in LDS. */ uint8_t num_lds_per_patch_outputs; /* Number of reserved per-patch output slots in LDS. */ bool tes_reads_tess_factors : 1; @@ -267,4 +268,6 @@ enum ac_hw_stage radv_select_hw_stage(const struct radv_shader_info *const info, uint64_t radv_gather_unlinked_io_mask(const uint64_t nir_mask); +uint64_t radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask); + #endif /* RADV_SHADER_INFO_H */