radv: Implement mesh shader scratch ring.
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16737>
This commit is contained in:
parent
6056583ae1
commit
0280b526d5
|
@ -509,6 +509,7 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
|
|||
cmd_buffer->gsvs_ring_size_needed = 0;
|
||||
cmd_buffer->tess_rings_needed = false;
|
||||
cmd_buffer->task_rings_needed = false;
|
||||
cmd_buffer->mesh_scratch_ring_needed = false;
|
||||
cmd_buffer->gds_needed = false;
|
||||
cmd_buffer->gds_oa_needed = false;
|
||||
cmd_buffer->sample_positions_needed = false;
|
||||
|
@ -5260,6 +5261,9 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
|
|||
|
||||
if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL))
|
||||
cmd_buffer->tess_rings_needed = true;
|
||||
if (mesh_shading)
|
||||
cmd_buffer->mesh_scratch_ring_needed |=
|
||||
pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring;
|
||||
|
||||
if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) {
|
||||
cmd_buffer->task_rings_needed = true;
|
||||
|
@ -5801,6 +5805,8 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
|
|||
primary->tess_rings_needed = true;
|
||||
if (secondary->task_rings_needed)
|
||||
primary->task_rings_needed = true;
|
||||
if (secondary->mesh_scratch_ring_needed)
|
||||
primary->mesh_scratch_ring_needed = true;
|
||||
if (secondary->sample_positions_needed)
|
||||
primary->sample_positions_needed = true;
|
||||
if (secondary->gds_needed)
|
||||
|
|
|
@ -76,7 +76,8 @@
|
|||
#define RING_HS_TESS_OFFCHIP 6
|
||||
#define RING_TS_DRAW 7
|
||||
#define RING_TS_PAYLOAD 8
|
||||
#define RING_PS_SAMPLE_POSITIONS 9
|
||||
#define RING_MS_SCRATCH 9
|
||||
#define RING_PS_SAMPLE_POSITIONS 10
|
||||
|
||||
/* max number of descriptor sets */
|
||||
#define MAX_SETS 32
|
||||
|
@ -91,6 +92,28 @@
|
|||
*/
|
||||
#define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull
|
||||
|
||||
/* Number of entries in the mesh shader scratch ring.
|
||||
* This depends on VGT_GS_MAX_WAVE_ID which is set by the kernel
|
||||
* and is impossible to query. We leave it on its maximum value
|
||||
* because real applications are unlikely to use it.
|
||||
*
|
||||
* The maximum ID on GFX10.3 is 2047 (0x7ff), so we need 2048 entries.
|
||||
*/
|
||||
#define RADV_MESH_SCRATCH_NUM_ENTRIES 2048
|
||||
|
||||
/* Size of each entry in the mesh shader scratch ring.
|
||||
* We must ensure that the absolute maximum mesh shader output fits here.
|
||||
*
|
||||
* Mesh shaders can create up to 256 vertices/primitives per workgroup,
|
||||
* and up to the following amount of outputs:
|
||||
* - 32 parameters
|
||||
* - 4 positions (clip/cull distance, etc.)
|
||||
* - 4 per-primitive built-in outputs (layer, view index, prim id, VRS rate)
|
||||
* - primitive indices which are always kept in LDS
|
||||
* That is a total of 32+4+4=40 output slots x 16 bytes per slot x 256 vertices/primitives = 160 KiB (163840 bytes).
|
||||
*/
|
||||
#define RADV_MESH_SCRATCH_ENTRY_BYTES (160 * 1024)
|
||||
|
||||
/* Number of invocations in each subgroup. */
|
||||
#define RADV_SUBGROUP_SIZE 64
|
||||
|
||||
|
|
|
@ -3605,7 +3605,8 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl
|
|||
uint32_t esgs_ring_size, struct radeon_winsys_bo *esgs_ring_bo,
|
||||
uint32_t gsvs_ring_size, struct radeon_winsys_bo *gsvs_ring_bo,
|
||||
struct radeon_winsys_bo *tess_rings_bo,
|
||||
struct radeon_winsys_bo *task_rings_bo)
|
||||
struct radeon_winsys_bo *task_rings_bo,
|
||||
struct radeon_winsys_bo *mesh_scratch_ring_bo)
|
||||
{
|
||||
uint32_t *desc = &map[4];
|
||||
|
||||
|
@ -3791,6 +3792,27 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl
|
|||
|
||||
desc += 8;
|
||||
|
||||
if (mesh_scratch_ring_bo) {
|
||||
uint64_t va = radv_buffer_get_va(mesh_scratch_ring_bo);
|
||||
|
||||
desc[0] = va;
|
||||
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
|
||||
desc[2] = RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES;
|
||||
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
|
||||
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
|
||||
|
||||
if (device->physical_device->rad_info.gfx_level >= GFX11) {
|
||||
desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_UINT) |
|
||||
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED);
|
||||
} else {
|
||||
assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
|
||||
desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
|
||||
S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
|
||||
}
|
||||
}
|
||||
|
||||
desc += 4;
|
||||
|
||||
if (add_sample_positions) {
|
||||
/* add sample positions after all rings */
|
||||
memcpy(desc, device->sample_locations_1x, 8);
|
||||
|
@ -4083,6 +4105,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
|
|||
struct radeon_winsys_bo *gsvs_ring_bo = queue->gsvs_ring_bo;
|
||||
struct radeon_winsys_bo *tess_rings_bo = queue->tess_rings_bo;
|
||||
struct radeon_winsys_bo *task_rings_bo = queue->task_rings_bo;
|
||||
struct radeon_winsys_bo *mesh_scratch_ring_bo = queue->mesh_scratch_ring_bo;
|
||||
struct radeon_winsys_bo *gds_bo = queue->gds_bo;
|
||||
struct radeon_winsys_bo *gds_oa_bo = queue->gds_oa_bo;
|
||||
struct radeon_cmdbuf *dest_cs[3] = {0};
|
||||
|
@ -4154,6 +4177,16 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
|
|||
goto fail;
|
||||
}
|
||||
|
||||
if (!queue->ring_info.mesh_scratch_ring && needs->mesh_scratch_ring) {
|
||||
assert(device->physical_device->rad_info.gfx_level >= GFX10_3);
|
||||
result =
|
||||
ws->buffer_create(ws, RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES, 256,
|
||||
RADEON_DOMAIN_VRAM, ring_bo_flags, RADV_BO_PRIORITY_SCRATCH, 0, &mesh_scratch_ring_bo);
|
||||
|
||||
if (result != VK_SUCCESS)
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (!queue->ring_info.gds && needs->gds) {
|
||||
assert(device->physical_device->rad_info.gfx_level >= GFX10);
|
||||
|
||||
|
@ -4184,10 +4217,11 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
|
|||
if ((queue->qf == RADV_QUEUE_COMPUTE && !descriptor_bo && task_rings_bo) ||
|
||||
scratch_bo != queue->scratch_bo || esgs_ring_bo != queue->esgs_ring_bo ||
|
||||
gsvs_ring_bo != queue->gsvs_ring_bo || tess_rings_bo != queue->tess_rings_bo ||
|
||||
task_rings_bo != queue->task_rings_bo || add_sample_positions) {
|
||||
task_rings_bo != queue->task_rings_bo || mesh_scratch_ring_bo != queue->mesh_scratch_ring_bo ||
|
||||
add_sample_positions) {
|
||||
uint32_t size = 0;
|
||||
if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions) {
|
||||
size = 144; /* 2 dword + 2 padding + 4 dword * 8 */
|
||||
if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions) {
|
||||
size = 160; /* 2 dword + 2 padding + 4 dword * 9 */
|
||||
if (add_sample_positions)
|
||||
size += 128; /* 64+32+16+8 = 120 bytes */
|
||||
} else if (scratch_bo) {
|
||||
|
@ -4220,10 +4254,10 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
|
|||
map[1] = rsrc1;
|
||||
}
|
||||
|
||||
if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions)
|
||||
if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions)
|
||||
radv_fill_shader_rings(device, map, add_sample_positions, needs->esgs_ring_size,
|
||||
esgs_ring_bo, needs->gsvs_ring_size, gsvs_ring_bo, tess_rings_bo,
|
||||
task_rings_bo);
|
||||
task_rings_bo, mesh_scratch_ring_bo);
|
||||
|
||||
ws->buffer_unmap(descriptor_bo);
|
||||
}
|
||||
|
@ -4238,7 +4272,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
|
|||
/* Continue preamble is unnecessary when no shader rings are used. */
|
||||
if (!needs->scratch_size_per_wave && !needs->compute_scratch_size_per_wave &&
|
||||
!needs->esgs_ring_size && !needs->gsvs_ring_size && !needs->tess_rings &&
|
||||
!needs->task_rings && !needs->gds && !needs->gds_oa && !needs->sample_positions)
|
||||
!needs->task_rings && !needs->mesh_scratch_ring && !needs->gds && !needs->gds_oa && !needs->sample_positions)
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -4368,6 +4402,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi
|
|||
|
||||
queue->tess_rings_bo = tess_rings_bo;
|
||||
queue->task_rings_bo = task_rings_bo;
|
||||
queue->mesh_scratch_ring_bo = mesh_scratch_ring_bo;
|
||||
queue->gds_bo = gds_bo;
|
||||
queue->gds_oa_bo = gds_oa_bo;
|
||||
queue->ring_info = *needs;
|
||||
|
@ -4539,6 +4574,7 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
|
|||
needs.gsvs_ring_size = MAX2(needs.gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
|
||||
needs.tess_rings |= cmd_buffer->tess_rings_needed;
|
||||
needs.task_rings |= cmd_buffer->task_rings_needed;
|
||||
needs.mesh_scratch_ring |= cmd_buffer->mesh_scratch_ring_needed;
|
||||
needs.gds |= cmd_buffer->gds_needed;
|
||||
needs.gds_oa |= cmd_buffer->gds_oa_needed;
|
||||
needs.sample_positions |= cmd_buffer->sample_positions_needed;
|
||||
|
@ -4565,7 +4601,9 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
|
|||
queue->ring_info.esgs_ring_size == needs.esgs_ring_size &&
|
||||
queue->ring_info.gsvs_ring_size == needs.gsvs_ring_size &&
|
||||
queue->ring_info.tess_rings == needs.tess_rings &&
|
||||
queue->ring_info.task_rings == needs.task_rings && queue->ring_info.gds == needs.gds &&
|
||||
queue->ring_info.task_rings == needs.task_rings &&
|
||||
queue->ring_info.mesh_scratch_ring == needs.mesh_scratch_ring &&
|
||||
queue->ring_info.gds == needs.gds &&
|
||||
queue->ring_info.gds_oa == needs.gds_oa &&
|
||||
queue->ring_info.sample_positions == needs.sample_positions)
|
||||
return VK_SUCCESS;
|
||||
|
|
|
@ -173,6 +173,14 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
|
|||
case nir_intrinsic_load_ring_task_payload_amd:
|
||||
return load_ring(b, RING_TS_PAYLOAD, s);
|
||||
|
||||
case nir_intrinsic_load_ring_mesh_scratch_amd:
|
||||
return load_ring(b, RING_MS_SCRATCH, s);
|
||||
|
||||
case nir_intrinsic_load_ring_mesh_scratch_offset_amd:
|
||||
/* gs_tg_info[0:11] is ordered_wave_id. Multiply by the ring entry size. */
|
||||
return nir_imul_imm(b, nir_iand_imm(b, ac_nir_load_arg(b, &s->args->ac, s->args->ac.gs_tg_info), 0xfff),
|
||||
RADV_MESH_SCRATCH_ENTRY_BYTES);
|
||||
|
||||
case nir_intrinsic_load_task_ring_entry_amd:
|
||||
return ac_nir_load_arg(b, &s->args->ac, s->args->ac.task_ring_entry);
|
||||
|
||||
|
@ -230,6 +238,8 @@ filter_abi_instr(const nir_instr *instr,
|
|||
intrin->intrinsic == nir_intrinsic_load_viewport_y_offset ||
|
||||
intrin->intrinsic == nir_intrinsic_load_ring_task_draw_amd ||
|
||||
intrin->intrinsic == nir_intrinsic_load_ring_task_payload_amd ||
|
||||
intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_amd ||
|
||||
intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_offset_amd ||
|
||||
intrin->intrinsic == nir_intrinsic_load_task_ring_entry_amd ||
|
||||
intrin->intrinsic == nir_intrinsic_load_task_ib_addr ||
|
||||
intrin->intrinsic == nir_intrinsic_load_task_ib_stride ||
|
||||
|
|
|
@ -717,6 +717,7 @@ struct radv_queue_ring_info {
|
|||
uint32_t gsvs_ring_size;
|
||||
bool tess_rings;
|
||||
bool task_rings;
|
||||
bool mesh_scratch_ring;
|
||||
bool gds;
|
||||
bool gds_oa;
|
||||
bool sample_positions;
|
||||
|
@ -733,6 +734,7 @@ struct radv_queue_state {
|
|||
struct radeon_winsys_bo *gsvs_ring_bo;
|
||||
struct radeon_winsys_bo *tess_rings_bo;
|
||||
struct radeon_winsys_bo *task_rings_bo;
|
||||
struct radeon_winsys_bo *mesh_scratch_ring_bo;
|
||||
struct radeon_winsys_bo *gds_bo;
|
||||
struct radeon_winsys_bo *gds_oa_bo;
|
||||
|
||||
|
@ -1568,6 +1570,7 @@ struct radv_cmd_buffer {
|
|||
uint32_t gsvs_ring_size_needed;
|
||||
bool tess_rings_needed;
|
||||
bool task_rings_needed;
|
||||
bool mesh_scratch_ring_needed;
|
||||
bool gds_needed; /* for GFX10 streamout and NGG GS queries */
|
||||
bool gds_oa_needed; /* for GFX10 streamout */
|
||||
bool sample_positions_needed;
|
||||
|
|
|
@ -1242,6 +1242,7 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_
|
|||
} else if (nir->info.stage == MESA_SHADER_MESH) {
|
||||
bool scratch_ring = false;
|
||||
NIR_PASS_V(nir, ac_nir_lower_ngg_ms, &scratch_ring, info->wave_size, pl_key->has_multiview_view_index);
|
||||
ngg_stage->info.ms.needs_ms_scratch_ring = scratch_ring;
|
||||
} else {
|
||||
unreachable("invalid SW stage passed to radv_lower_ngg");
|
||||
}
|
||||
|
|
|
@ -364,6 +364,7 @@ struct radv_shader_info {
|
|||
struct {
|
||||
struct radv_vs_output_info outinfo;
|
||||
enum shader_prim output_prim;
|
||||
bool needs_ms_scratch_ring;
|
||||
} ms;
|
||||
|
||||
struct radv_streamout_info so;
|
||||
|
|
Loading…
Reference in New Issue