radv: Implement mesh shader scratch ring.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16737>
This commit is contained in:
Timur Kristóf 2022-05-20 18:12:36 +02:00 committed by Marge Bot
parent 6056583ae1
commit 0280b526d5
7 changed files with 91 additions and 9 deletions

View File

@ -509,6 +509,7 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->gsvs_ring_size_needed = 0;
cmd_buffer->tess_rings_needed = false;
cmd_buffer->task_rings_needed = false;
cmd_buffer->mesh_scratch_ring_needed = false;
cmd_buffer->gds_needed = false;
cmd_buffer->gds_oa_needed = false;
cmd_buffer->sample_positions_needed = false;
@ -5260,6 +5261,9 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
cmd_buffer->tess_rings_needed = true;
if (mesh_shading)
cmd_buffer->mesh_scratch_ring_needed |=
pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
cmd_buffer->task_rings_needed = true;
@ -5801,6 +5805,8 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
primary->tess_rings_needed = true;
if (secondary->task_rings_needed)
primary->task_rings_needed = true;
if (secondary->mesh_scratch_ring_needed)
primary->mesh_scratch_ring_needed = true;
if (secondary->sample_positions_needed)
primary->sample_positions_needed = true;
if (secondary->gds_needed)

View File

@ -76,7 +76,8 @@
#define RING_HS_TESS_OFFCHIP 6
#define RING_TS_DRAW 7
#define RING_TS_PAYLOAD 8
#define RING_PS_SAMPLE_POSITIONS 9
#define RING_MS_SCRATCH 9
#define RING_PS_SAMPLE_POSITIONS 10
/* max number of descriptor sets */
#define MAX_SETS 32
@ -91,6 +92,28 @@
*/
#define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull
/* Number of entries in the mesh shader scratch ring.
* The required count depends on VGT_GS_MAX_WAVE_ID, which is set by the
* kernel and cannot be queried. We therefore size the ring for the maximum
* possible value; the extra memory is acceptable because real applications
* are unlikely to use the scratch ring.
*
* The maximum ID on GFX10.3 is 2047 (0x7ff), so we need 2048 entries.
*/
#define RADV_MESH_SCRATCH_NUM_ENTRIES 2048
/* Size of each entry in the mesh shader scratch ring.
* We must ensure that the absolute maximum mesh shader output fits here.
*
* Mesh shaders can create up to 256 vertices/primitives per workgroup,
* and up to the following number of outputs:
* - 32 parameters
* - 4 positions (clip/cull distance, etc.)
* - 4 per-primitive built-in outputs (layer, view index, prim id, VRS rate)
* - primitive indices, which are always kept in LDS and therefore
*   are not counted towards the ring entry size
* That is a total of 32+4+4=40 output slots x 16 bytes per slot x 256
* = 163840 bytes (160 KiB).
*/
#define RADV_MESH_SCRATCH_ENTRY_BYTES (160 * 1024)
/* Number of invocations in each subgroup. */
#define RADV_SUBGROUP_SIZE 64

View File

@ -3605,7 +3605,8 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl
uint32_t esgs_ring_size, struct radeon_winsys_bo *esgs_ring_bo,
uint32_t gsvs_ring_size, struct radeon_winsys_bo *gsvs_ring_bo,
struct radeon_winsys_bo *tess_rings_bo,
struct radeon_winsys_bo *task_rings_bo)
struct radeon_winsys_bo *task_rings_bo,
struct radeon_winsys_bo *mesh_scratch_ring_bo)
{
uint32_t *desc = &map[4];
@ -3791,6 +3792,27 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl
desc += 8;
if (mesh_scratch_ring_bo) {
uint64_t va = radv_buffer_get_va(mesh_scratch_ring_bo);
desc[0] = va;
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
desc[2] = RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES;
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
if (device->physical_device->rad_info.gfx_level >= GFX11) {
desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_UINT) |
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
} else {
assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
}
}
desc += 4;
if (add_sample_positions) {
/* add sample positions after all rings */
memcpy(desc, device->sample_locations_1x, 8);
@ -4083,6 +4105,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
struct radeon_winsys_bo *gsvs_ring_bo = queue->gsvs_ring_bo;
struct radeon_winsys_bo *tess_rings_bo = queue->tess_rings_bo;
struct radeon_winsys_bo *task_rings_bo = queue->task_rings_bo;
struct radeon_winsys_bo *mesh_scratch_ring_bo = queue->mesh_scratch_ring_bo;
struct radeon_winsys_bo *gds_bo = queue->gds_bo;
struct radeon_winsys_bo *gds_oa_bo = queue->gds_oa_bo;
struct radeon_cmdbuf *dest_cs[3] = {0};
@ -4154,6 +4177,16 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
goto fail;
}
if (!queue->ring_info.mesh_scratch_ring && needs->mesh_scratch_ring) {
assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
result =
ws->buffer_create(ws, RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES, 256,
RADEON_DOMAIN_VRAM, ring_bo_flags, RADV_BO_PRIORITY_SCRATCH, 0, &mesh_scratch_ring_bo);
if (result != VK_SUCCESS)
goto fail;
}
if (!queue->ring_info.gds && needs->gds) {
assert(device->physical_device->rad_info.gfx_level >= GFX10);
@ -4184,10 +4217,11 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
if ((queue->qf == RADV_QUEUE_COMPUTE && !descriptor_bo && task_rings_bo) ||
scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo ||
gsvs_ring_bo != queue->gsvs_ring_bo || tess_rings_bo != queue->tess_rings_bo ||
task_rings_bo != queue->task_rings_bo || add_sample_positions) {
task_rings_bo != queue->task_rings_bo || mesh_scratch_ring_bo != queue->mesh_scratch_ring_bo ||
add_sample_positions) {
uint32_t size = 0;
if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions) {
size = 144; /* 2 dword + 2 padding + 4 dword * 8 */
if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions) {
size = 160; /* 2 dword + 2 padding + 4 dword * 9 */
if (add_sample_positions)
size += 128; /* 64+32+16+8 = 120 bytes */
} else if (scratch_bo) {
@ -4220,10 +4254,10 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
map[1] = rsrc1;
}
if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions)
if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions)
radv_fill_shader_rings(device, map, add_sample_positions, needs->esgs_ring_size,
esgs_ring_bo, needs->gsvs_ring_size, gsvs_ring_bo, tess_rings_bo,
task_rings_bo);
task_rings_bo, mesh_scratch_ring_bo);
ws->buffer_unmap(descriptor_bo);
}
@ -4238,7 +4272,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
/* Continue preamble is unnecessary when no shader rings are used. */
if (!needs->scratch_size_per_wave && !needs->compute_scratch_size_per_wave &&
!needs->esgs_ring_size && !needs->gsvs_ring_size && !needs->tess_rings &&
!needs->task_rings && !needs->gds && !needs->gds_oa && !needs->sample_positions)
!needs->task_rings && !needs->mesh_scratch_ring && !needs->gds && !needs->gds_oa && !needs->sample_positions)
continue;
}
@ -4368,6 +4402,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
queue->tess_rings_bo = tess_rings_bo;
queue->task_rings_bo = task_rings_bo;
queue->mesh_scratch_ring_bo = mesh_scratch_ring_bo;
queue->gds_bo = gds_bo;
queue->gds_oa_bo = gds_oa_bo;
queue->ring_info = *needs;
@ -4539,6 +4574,7 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
needs.gsvs_ring_size = MAX2(needs.gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
needs.tess_rings |= cmd_buffer->tess_rings_needed;
needs.task_rings |= cmd_buffer->task_rings_needed;
needs.mesh_scratch_ring |= cmd_buffer->mesh_scratch_ring_needed;
needs.gds |= cmd_buffer->gds_needed;
needs.gds_oa |= cmd_buffer->gds_oa_needed;
needs.sample_positions |= cmd_buffer->sample_positions_needed;
@ -4565,7 +4601,9 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
queue->ring_info.esgs_ring_size == needs.esgs_ring_size &&
queue->ring_info.gsvs_ring_size == needs.gsvs_ring_size &&
queue->ring_info.tess_rings == needs.tess_rings &&
queue->ring_info.task_rings == needs.task_rings && queue->ring_info.gds == needs.gds &&
queue->ring_info.task_rings == needs.task_rings &&
queue->ring_info.mesh_scratch_ring == needs.mesh_scratch_ring &&
queue->ring_info.gds == needs.gds &&
queue->ring_info.gds_oa == needs.gds_oa &&
queue->ring_info.sample_positions == needs.sample_positions)
return VK_SUCCESS;

View File

@ -173,6 +173,14 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
case nir_intrinsic_load_ring_task_payload_amd:
return load_ring(b, RING_TS_PAYLOAD, s);
case nir_intrinsic_load_ring_mesh_scratch_amd:
return load_ring(b, RING_MS_SCRATCH, s);
case nir_intrinsic_load_ring_mesh_scratch_offset_amd:
/* gs_tg_info[0:11] is ordered_wave_id. Multiply by the ring entry size. */
return nir_imul_imm(b, nir_iand_imm(b, ac_nir_load_arg(b, &s->args->ac, s->args->ac.gs_tg_info), 0xfff),
RADV_MESH_SCRATCH_ENTRY_BYTES);
case nir_intrinsic_load_task_ring_entry_amd:
return ac_nir_load_arg(b, &s->args->ac, s->args->ac.task_ring_entry);
@ -230,6 +238,8 @@ filter_abi_instr(const nir_instr *instr,
intrin->intrinsic == nir_intrinsic_load_viewport_y_offset ||
intrin->intrinsic == nir_intrinsic_load_ring_task_draw_amd ||
intrin->intrinsic == nir_intrinsic_load_ring_task_payload_amd ||
intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_amd ||
intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_offset_amd ||
intrin->intrinsic == nir_intrinsic_load_task_ring_entry_amd ||
intrin->intrinsic == nir_intrinsic_load_task_ib_addr ||
intrin->intrinsic == nir_intrinsic_load_task_ib_stride ||

View File

@ -717,6 +717,7 @@ struct radv_queue_ring_info {
uint32_t gsvs_ring_size;
bool tess_rings;
bool task_rings;
bool mesh_scratch_ring;
bool gds;
bool gds_oa;
bool sample_positions;
@ -733,6 +734,7 @@ struct radv_queue_state {
struct radeon_winsys_bo *gsvs_ring_bo;
struct radeon_winsys_bo *tess_rings_bo;
struct radeon_winsys_bo *task_rings_bo;
struct radeon_winsys_bo *mesh_scratch_ring_bo;
struct radeon_winsys_bo *gds_bo;
struct radeon_winsys_bo *gds_oa_bo;
@ -1568,6 +1570,7 @@ struct radv_cmd_buffer {
uint32_t gsvs_ring_size_needed;
bool tess_rings_needed;
bool task_rings_needed;
bool mesh_scratch_ring_needed;
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
bool gds_oa_needed; /* for GFX10 streamout */
bool sample_positions_needed;

View File

@ -1242,6 +1242,7 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_
} else if (nir->info.stage == MESA_SHADER_MESH) {
bool scratch_ring = false;
NIR_PASS_V(nir, ac_nir_lower_ngg_ms, &scratch_ring, info->wave_size, pl_key->has_multiview_view_index);
ngg_stage->info.ms.needs_ms_scratch_ring = scratch_ring;
} else {
unreachable("invalid SW stage passed to radv_lower_ngg");
}

View File

@ -364,6 +364,7 @@ struct radv_shader_info {
struct {
struct radv_vs_output_info outinfo;
enum shader_prim output_prim;
bool needs_ms_scratch_ring;
} ms;
struct radv_streamout_info so;