radv: Calculate VRAM tess patch size independently of LDS size.

We recently made some effort to reduce the LDS usage of TCS:
the lowering no longer uses the same output location mapping when
storing TCS outputs to LDS and to VRAM. As a result, the size of a
given patch in LDS may differ from its size in VRAM.

Therefore, we need to properly calculate the patch size in VRAM
when determining the number of output patches.

Fixes: 0e481a4adc
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28739>
This commit is contained in:
Timur Kristóf 2024-04-15 12:31:01 +02:00 committed by Marge Bot
parent 8190a65c78
commit 2d9e38dbe5
5 changed files with 47 additions and 8 deletions

View File

@ -2649,7 +2649,8 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->state.tess_num_patches = get_tcs_num_patches(
d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level, pdev->info.family);
tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
pdev->info.gfx_level, pdev->info.family);
/* Compute the LDS size. */
cmd_buffer->state.tess_lds_size =

View File

@ -1425,7 +1425,16 @@ radv_link_tcs(const struct radv_device *device, struct radv_shader_stage *tcs_st
const uint64_t io_mask = radv_gather_unlinked_io_mask(nir_mask);
const unsigned num_reserved_outputs = util_last_bit64(io_mask);
/* Count the number of per-patch output slots we need to reserve for the TCS and TES.
* This is necessary because we need it to determine the patch size in VRAM.
*/
const uint64_t patch_io_mask = radv_gather_unlinked_patch_io_mask(
tcs_stage->nir->info.outputs_written & tes_stage->nir->info.inputs_read,
tcs_stage->nir->info.patch_outputs_written & tes_stage->nir->info.patch_inputs_read);
const unsigned num_reserved_patch_outputs = util_last_bit64(patch_io_mask);
tcs_stage->info.tcs.num_linked_outputs = num_reserved_outputs;
tcs_stage->info.tcs.num_linked_patch_outputs = num_reserved_patch_outputs;
tcs_stage->info.outputs_linked = true;
tes_stage->info.tes.num_linked_inputs = num_reserved_outputs;

View File

@ -672,8 +672,9 @@ calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_ver
static inline unsigned
get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs,
unsigned tess_offchip_block_dw_size, enum amd_gfx_level gfx_level, enum radeon_family family)
unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs,
unsigned tcs_num_vram_patch_outputs, unsigned tess_offchip_block_dw_size,
enum amd_gfx_level gfx_level, enum radeon_family family)
{
uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
@ -681,6 +682,10 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size;
uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16;
uint32_t vram_output_vertex_size = tcs_num_vram_outputs * 16;
uint32_t vram_pervertex_output_patch_size = tcs_num_output_vertices * vram_output_vertex_size;
uint32_t vram_output_patch_size = vram_pervertex_output_patch_size + tcs_num_vram_patch_outputs * 16;
/* Ensure that we only need one wave per SIMD so we don't need to check
* resource usage. Also ensures that the number of tcs in and out
* vertices per threadgroup are at most 256.
@ -702,8 +707,8 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
if (input_patch_size + lds_output_patch_size)
num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + lds_output_patch_size));
/* Make sure the output data fits in the offchip buffer */
if (lds_output_patch_size)
num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / lds_output_patch_size);
if (vram_output_patch_size)
num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_output_patch_size);
/* Not necessary for correctness, but improves performance. The
* specific value is taken from the proprietary driver.
*/

View File

@ -471,6 +471,23 @@ radv_gather_unlinked_io_mask(const uint64_t nir_io_mask)
return radv_io_mask;
}
uint64_t
radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask)
{
uint64_t radv_io_mask = 0;
u_foreach_bit64 (semantic, nir_patch_io_mask) {
radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(semantic + VARYING_SLOT_PATCH0));
}
/* Tess levels need to be handled separately because they are not part of patch_outputs_written. */
if (nir_io_mask & VARYING_BIT_TESS_LEVEL_OUTER)
radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_OUTER));
if (nir_io_mask & VARYING_BIT_TESS_LEVEL_INNER)
radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_INNER));
return radv_io_mask;
}
static void
gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
const struct radv_graphics_state_key *gfx_state, const struct radv_shader_stage_key *stage_key,
@ -538,16 +555,20 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,
if (!info->inputs_linked)
info->tcs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read));
if (!info->outputs_linked)
if (!info->outputs_linked) {
info->tcs.num_linked_outputs = util_last_bit64(radv_gather_unlinked_io_mask(
nir->info.outputs_written & ~(VARYING_BIT_TESS_LEVEL_OUTER | VARYING_BIT_TESS_LEVEL_INNER)));
info->tcs.num_linked_patch_outputs = util_last_bit64(
radv_gather_unlinked_patch_io_mask(nir->info.outputs_written, nir->info.patch_outputs_written));
}
if (gfx_state->ts.patch_control_points) {
/* Number of tessellation patches per workgroup processed by the current pipeline. */
info->num_tess_patches = get_tcs_num_patches(
gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
pdev->info.gfx_level, pdev->info.family);
info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs,
info->tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level,
pdev->info.family);
/* LDS size used by VS+TCS for storing TCS inputs and outputs. */
info->tcs.num_lds_blocks =

View File

@ -232,6 +232,7 @@ struct radv_shader_info {
uint32_t num_lds_blocks;
uint8_t num_linked_inputs; /* Number of reserved per-vertex input slots in LDS. */
uint8_t num_linked_outputs; /* Number of reserved per-vertex output slots in VRAM. */
uint8_t num_linked_patch_outputs; /* Number of reserved per-patch output slots in VRAM. */
uint8_t num_lds_per_vertex_outputs; /* Number of reserved per-vertex output slots in LDS. */
uint8_t num_lds_per_patch_outputs; /* Number of reserved per-patch output slots in LDS. */
bool tes_reads_tess_factors : 1;
@ -267,4 +268,6 @@ enum ac_hw_stage radv_select_hw_stage(const struct radv_shader_info *const info,
uint64_t radv_gather_unlinked_io_mask(const uint64_t nir_mask);
uint64_t radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask);
#endif /* RADV_SHADER_INFO_H */