mirror of https://gitlab.freedesktop.org/mesa/mesa
radv: Calculate VRAM tess patch size independently of LDS size.
We recently made some effort to reduce the LDS use of TCS:
The lowering no longer uses the same output location mapping when
storing TCS outputs to LDS and VRAM. This means that the same
patch can occupy a different amount of space in LDS than in VRAM.
Therefore, we need to properly calculate the patch size in VRAM
when determining the number of output patches.
Fixes: 0e481a4adc
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28739>
This commit is contained in:
parent
8190a65c78
commit
2d9e38dbe5
|
@ -2649,7 +2649,8 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
|
|||
cmd_buffer->state.tess_num_patches = get_tcs_num_patches(
|
||||
d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
|
||||
tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
|
||||
pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level, pdev->info.family);
|
||||
tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
|
||||
pdev->info.gfx_level, pdev->info.family);
|
||||
|
||||
/* Compute the LDS size. */
|
||||
cmd_buffer->state.tess_lds_size =
|
||||
|
|
|
@ -1425,7 +1425,16 @@ radv_link_tcs(const struct radv_device *device, struct radv_shader_stage *tcs_st
|
|||
const uint64_t io_mask = radv_gather_unlinked_io_mask(nir_mask);
|
||||
const unsigned num_reserved_outputs = util_last_bit64(io_mask);
|
||||
|
||||
/* Count the number of per-patch output slots we need to reserve for the TCS and TES.
|
||||
* This is necessary because we need it to determine the patch size in VRAM.
|
||||
*/
|
||||
const uint64_t patch_io_mask = radv_gather_unlinked_patch_io_mask(
|
||||
tcs_stage->nir->info.outputs_written & tes_stage->nir->info.inputs_read,
|
||||
tcs_stage->nir->info.patch_outputs_written & tes_stage->nir->info.patch_inputs_read);
|
||||
const unsigned num_reserved_patch_outputs = util_last_bit64(patch_io_mask);
|
||||
|
||||
tcs_stage->info.tcs.num_linked_outputs = num_reserved_outputs;
|
||||
tcs_stage->info.tcs.num_linked_patch_outputs = num_reserved_patch_outputs;
|
||||
tcs_stage->info.outputs_linked = true;
|
||||
|
||||
tes_stage->info.tes.num_linked_inputs = num_reserved_outputs;
|
||||
|
|
|
@ -672,8 +672,9 @@ calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_ver
|
|||
|
||||
static inline unsigned
|
||||
get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
|
||||
unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs,
|
||||
unsigned tess_offchip_block_dw_size, enum amd_gfx_level gfx_level, enum radeon_family family)
|
||||
unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs,
|
||||
unsigned tcs_num_vram_patch_outputs, unsigned tess_offchip_block_dw_size,
|
||||
enum amd_gfx_level gfx_level, enum radeon_family family)
|
||||
{
|
||||
uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
|
||||
uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
|
||||
|
@ -681,6 +682,10 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
|
|||
uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size;
|
||||
uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16;
|
||||
|
||||
uint32_t vram_output_vertex_size = tcs_num_vram_outputs * 16;
|
||||
uint32_t vram_pervertex_output_patch_size = tcs_num_output_vertices * vram_output_vertex_size;
|
||||
uint32_t vram_output_patch_size = vram_pervertex_output_patch_size + tcs_num_vram_patch_outputs * 16;
|
||||
|
||||
/* Ensure that we only need one wave per SIMD so we don't need to check
|
||||
* resource usage. Also ensures that the number of tcs in and out
|
||||
* vertices per threadgroup are at most 256.
|
||||
|
@ -702,8 +707,8 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
|
|||
if (input_patch_size + lds_output_patch_size)
|
||||
num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + lds_output_patch_size));
|
||||
/* Make sure the output data fits in the offchip buffer */
|
||||
if (lds_output_patch_size)
|
||||
num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / lds_output_patch_size);
|
||||
if (vram_output_patch_size)
|
||||
num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_output_patch_size);
|
||||
/* Not necessary for correctness, but improves performance. The
|
||||
* specific value is taken from the proprietary driver.
|
||||
*/
|
||||
|
|
|
@ -471,6 +471,23 @@ radv_gather_unlinked_io_mask(const uint64_t nir_io_mask)
|
|||
return radv_io_mask;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask)
|
||||
{
|
||||
uint64_t radv_io_mask = 0;
|
||||
u_foreach_bit64 (semantic, nir_patch_io_mask) {
|
||||
radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(semantic + VARYING_SLOT_PATCH0));
|
||||
}
|
||||
|
||||
/* Tess levels need to be handled separately because they are not part of patch_outputs_written. */
|
||||
if (nir_io_mask & VARYING_BIT_TESS_LEVEL_OUTER)
|
||||
radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_OUTER));
|
||||
if (nir_io_mask & VARYING_BIT_TESS_LEVEL_INNER)
|
||||
radv_io_mask |= BITFIELD64_BIT(radv_map_io_driver_location(VARYING_SLOT_TESS_LEVEL_INNER));
|
||||
|
||||
return radv_io_mask;
|
||||
}
|
||||
|
||||
static void
|
||||
gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
|
||||
const struct radv_graphics_state_key *gfx_state, const struct radv_shader_stage_key *stage_key,
|
||||
|
@ -538,16 +555,20 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,
|
|||
|
||||
if (!info->inputs_linked)
|
||||
info->tcs.num_linked_inputs = util_last_bit64(radv_gather_unlinked_io_mask(nir->info.inputs_read));
|
||||
if (!info->outputs_linked)
|
||||
if (!info->outputs_linked) {
|
||||
info->tcs.num_linked_outputs = util_last_bit64(radv_gather_unlinked_io_mask(
|
||||
nir->info.outputs_written & ~(VARYING_BIT_TESS_LEVEL_OUTER | VARYING_BIT_TESS_LEVEL_INNER)));
|
||||
info->tcs.num_linked_patch_outputs = util_last_bit64(
|
||||
radv_gather_unlinked_patch_io_mask(nir->info.outputs_written, nir->info.patch_outputs_written));
|
||||
}
|
||||
|
||||
if (gfx_state->ts.patch_control_points) {
|
||||
/* Number of tessellation patches per workgroup processed by the current pipeline. */
|
||||
info->num_tess_patches = get_tcs_num_patches(
|
||||
gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
|
||||
info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
|
||||
pdev->info.gfx_level, pdev->info.family);
|
||||
info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs,
|
||||
info->tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level,
|
||||
pdev->info.family);
|
||||
|
||||
/* LDS size used by VS+TCS for storing TCS inputs and outputs. */
|
||||
info->tcs.num_lds_blocks =
|
||||
|
|
|
@ -232,6 +232,7 @@ struct radv_shader_info {
|
|||
uint32_t num_lds_blocks;
|
||||
uint8_t num_linked_inputs; /* Number of reserved per-vertex input slots in LDS. */
|
||||
uint8_t num_linked_outputs; /* Number of reserved per-vertex output slots in VRAM. */
|
||||
uint8_t num_linked_patch_outputs; /* Number of reserved per-patch output slots in VRAM. */
|
||||
uint8_t num_lds_per_vertex_outputs; /* Number of reserved per-vertex output slots in LDS. */
|
||||
uint8_t num_lds_per_patch_outputs; /* Number of reserved per-patch output slots in LDS. */
|
||||
bool tes_reads_tess_factors : 1;
|
||||
|
@ -267,4 +268,6 @@ enum ac_hw_stage radv_select_hw_stage(const struct radv_shader_info *const info,
|
|||
|
||||
uint64_t radv_gather_unlinked_io_mask(const uint64_t nir_mask);
|
||||
|
||||
uint64_t radv_gather_unlinked_patch_io_mask(const uint64_t nir_io_mask, const uint32_t nir_patch_io_mask);
|
||||
|
||||
#endif /* RADV_SHADER_INFO_H */
|
||||
|
|
Loading…
Reference in New Issue