radv: Add task shader arguments.

Mostly the same as for compute shaders, but with a few extras:

task_ring_offsets:
Plays the same role for task shaders that ring_offsets plays for graphics shaders.
Contains the address of a buffer that holds
the ring buffer descriptors.

task_ring_entry:
Index that can be used to address the draw and payload rings.

draw_id:
Same meaning as in graphics shaders.

task_ib_addr/task_ib_stride:
Indirect buffer address and stride from the draw calls.
These are used to emulate the firstTask feature of NV_mesh_shader.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14929>
This commit is contained in:
Timur Kristóf 2022-01-23 18:35:12 +01:00 committed by Marge Bot
parent 101a7321c4
commit b3ea6c6103
5 changed files with 60 additions and 5 deletions

View File

@ -74,7 +74,9 @@
#define RING_GSVS_GS 4
#define RING_HS_TESS_FACTOR 5
#define RING_HS_TESS_OFFCHIP 6
#define RING_PS_SAMPLE_POSITIONS 7
#define RING_TS_DRAW 7
#define RING_TS_PAYLOAD 8
#define RING_PS_SAMPLE_POSITIONS 9
/* max number of descriptor sets */
#define MAX_SETS 32

View File

@ -3707,6 +3707,10 @@ radv_fill_shader_rings(struct radv_queue *queue, uint32_t *map, bool add_sample_
desc += 8;
/* Reserved for task shader rings. */
desc += 8;
if (add_sample_positions) {
/* add sample positions after all rings */
memcpy(desc, queue->device->sample_locations_1x, 8);
@ -4004,7 +4008,7 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
add_sample_positions) {
uint32_t size = 0;
if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || add_sample_positions) {
size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
size = 144; /* 2 dword + 2 padding + 4 dword * 8 */
if (add_sample_positions)
size += 128; /* 64+32+16+8 = 120 bytes */
} else if (scratch_bo) {

View File

@ -146,7 +146,8 @@ enum radv_ud_index {
AC_UD_NGG_CULLING_SETTINGS = 7,
AC_UD_NGG_VIEWPORT = 8,
AC_UD_FORCE_VRS_RATES = 9,
AC_UD_SHADER_START = 10,
AC_UD_TASK_RING_ENTRY = 10,
AC_UD_SHADER_START = 11,
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
AC_UD_VS_PROLOG_INPUTS,
@ -155,6 +156,9 @@ enum radv_ud_index {
AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
AC_UD_CS_SBT_DESCRIPTORS,
AC_UD_CS_RAY_LAUNCH_SIZE,
AC_UD_CS_TASK_RING_OFFSETS,
AC_UD_CS_TASK_DRAW_ID,
AC_UD_CS_TASK_IB,
AC_UD_CS_MAX_UD,
AC_UD_GS_MAX_UD,
AC_UD_TCS_MAX_UD,

View File

@ -50,7 +50,8 @@ set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx, uint8_
static void
set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
{
bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS;
bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS &&
idx != AC_UD_CS_TASK_RING_OFFSETS;
set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
}
@ -157,18 +158,26 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i
/* 2 user sgprs will always be allocated for scratch/rings */
user_sgpr_count += 2;
if (stage == MESA_SHADER_TASK)
user_sgpr_count += 2; /* task descriptors */
/* prolog inputs */
if (info->vs.has_prolog)
user_sgpr_count += 2;
switch (stage) {
case MESA_SHADER_COMPUTE:
case MESA_SHADER_TASK:
if (info->cs.uses_sbt)
user_sgpr_count += 1;
if (info->cs.uses_grid_size)
user_sgpr_count += args->load_grid_size_from_user_sgpr ? 3 : 2;
if (info->cs.uses_ray_launch_size)
user_sgpr_count += 3;
if (info->vs.needs_draw_id)
user_sgpr_count += 1;
if (info->cs.uses_task_rings)
user_sgpr_count += 4; /* ring_entry, 2x ib_addr, ib_stride */
break;
case MESA_SHADER_FRAGMENT:
break;
@ -212,7 +221,8 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i
if (info->so.num_outputs)
user_sgpr_count++;
uint32_t available_sgprs = chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16;
uint32_t available_sgprs =
chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_TASK ? 32 : 16;
uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
uint32_t num_desc_set = util_bitcount(info->desc_set_used_mask);
@ -527,6 +537,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
if (args->explicit_scratch_args) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ring_offsets);
}
if (stage == MESA_SHADER_TASK) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->task_ring_offsets);
}
/* To ensure prologs match the main VS, VS specific input SGPRs have to be placed before other
* sgprs.
@ -534,6 +547,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
switch (stage) {
case MESA_SHADER_COMPUTE:
case MESA_SHADER_TASK:
declare_global_input_sgprs(info, &user_sgpr_info, args);
if (info->cs.uses_sbt) {
@ -551,6 +565,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.ray_launch_size);
}
if (info->vs.needs_draw_id) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
}
if (info->cs.uses_task_rings) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry);
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->task_ib_addr);
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->task_ib_stride);
}
for (int i = 0; i < 3; i++) {
if (info->cs.uses_block_id[i]) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
@ -750,6 +774,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
uint8_t user_sgpr_idx = 0;
set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx);
if (stage == MESA_SHADER_TASK) {
set_loc_shader_ptr(args, AC_UD_CS_TASK_RING_OFFSETS, &user_sgpr_idx);
}
/* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
* the rw_buffers at s0/s1). With user SGPR0 = s8, let's restart the count from 0. */
@ -765,6 +792,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
switch (stage) {
case MESA_SHADER_COMPUTE:
case MESA_SHADER_TASK:
if (args->ac.sbt_descriptors.used) {
set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx);
}
@ -775,6 +803,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
if (args->ac.ray_launch_size.used) {
set_loc_shader(args, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3);
}
if (args->ac.draw_id.used) {
set_loc_shader(args, AC_UD_CS_TASK_DRAW_ID, &user_sgpr_idx, 1);
}
if (args->ac.task_ring_entry.used) {
set_loc_shader(args, AC_UD_TASK_RING_ENTRY, &user_sgpr_idx, 1);
}
if (args->task_ib_addr.used) {
assert(args->task_ib_stride.used);
set_loc_shader(args, AC_UD_CS_TASK_IB, &user_sgpr_idx, 3);
}
break;
case MESA_SHADER_VERTEX:
if (args->ac.view_index.used)

View File

@ -36,7 +36,10 @@ struct radv_shader_args {
struct ac_shader_args ac;
struct ac_arg descriptor_sets[MAX_SETS];
/* User data 0/1. GFX: descriptor list, Compute: scratch BO */
struct ac_arg ring_offsets;
/* User data 2/3. Same as the descriptor list above, but for task shaders. */
struct ac_arg task_ring_offsets;
/* Streamout */
struct ac_arg streamout_buffers;
@ -47,6 +50,10 @@ struct radv_shader_args {
struct ac_arg ngg_viewport_scale[2];
struct ac_arg ngg_viewport_translate[2];
/* Task shaders */
struct ac_arg task_ib_addr;
struct ac_arg task_ib_stride;
struct ac_arg prolog_inputs;
struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS];