radv: Add task shader arguments.
Mostly the same as for compute shaders, but with a few extras:

- task_ring_offsets: The task shader counterpart of ring_offsets in graphics shaders. Contains the address of a buffer that holds the ring buffer descriptors.
- task_ring_entry: Index that can be used to address the draw and payload rings.
- draw_id: Same meaning as in graphics shaders.
- task_ib_addr/task_ib_stride: Indirect buffer address and stride from the draw calls. These are used to emulate the firstTask feature of NV_mesh_shader.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14929>
This commit is contained in:
parent
101a7321c4
commit
b3ea6c6103
|
@ -74,7 +74,9 @@
|
|||
#define RING_GSVS_GS 4
|
||||
#define RING_HS_TESS_FACTOR 5
|
||||
#define RING_HS_TESS_OFFCHIP 6
|
||||
#define RING_PS_SAMPLE_POSITIONS 7
|
||||
#define RING_TS_DRAW 7
|
||||
#define RING_TS_PAYLOAD 8
|
||||
#define RING_PS_SAMPLE_POSITIONS 9
|
||||
|
||||
/* max number of descriptor sets */
|
||||
#define MAX_SETS 32
|
||||
|
|
|
@ -3707,6 +3707,10 @@ radv_fill_shader_rings(struct radv_queue *queue, uint32_t *map, bool add_sample_
|
|||
|
||||
desc += 8;
|
||||
|
||||
/* Reserved for task shader rings. */
|
||||
|
||||
desc += 8;
|
||||
|
||||
if (add_sample_positions) {
|
||||
/* add sample positions after all rings */
|
||||
memcpy(desc, queue->device->sample_locations_1x, 8);
|
||||
|
@ -4004,7 +4008,7 @@ radv_update_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave
|
|||
add_sample_positions) {
|
||||
uint32_t size = 0;
|
||||
if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || add_sample_positions) {
|
||||
size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
|
||||
size = 144; /* 2 dword + 2 padding + 4 dword * 8 */
|
||||
if (add_sample_positions)
|
||||
size += 128; /* 64+32+16+8 = 120 bytes */
|
||||
} else if (scratch_bo) {
|
||||
|
|
|
@ -146,7 +146,8 @@ enum radv_ud_index {
|
|||
AC_UD_NGG_CULLING_SETTINGS = 7,
|
||||
AC_UD_NGG_VIEWPORT = 8,
|
||||
AC_UD_FORCE_VRS_RATES = 9,
|
||||
AC_UD_SHADER_START = 10,
|
||||
AC_UD_TASK_RING_ENTRY = 10,
|
||||
AC_UD_SHADER_START = 11,
|
||||
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
|
||||
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
|
||||
AC_UD_VS_PROLOG_INPUTS,
|
||||
|
@ -155,6 +156,9 @@ enum radv_ud_index {
|
|||
AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
|
||||
AC_UD_CS_SBT_DESCRIPTORS,
|
||||
AC_UD_CS_RAY_LAUNCH_SIZE,
|
||||
AC_UD_CS_TASK_RING_OFFSETS,
|
||||
AC_UD_CS_TASK_DRAW_ID,
|
||||
AC_UD_CS_TASK_IB,
|
||||
AC_UD_CS_MAX_UD,
|
||||
AC_UD_GS_MAX_UD,
|
||||
AC_UD_TCS_MAX_UD,
|
||||
|
|
|
@ -50,7 +50,8 @@ set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx, uint8_
|
|||
static void
|
||||
set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
|
||||
{
|
||||
bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS;
|
||||
bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS &&
|
||||
idx != AC_UD_CS_TASK_RING_OFFSETS;
|
||||
|
||||
set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
|
||||
}
|
||||
|
@ -157,18 +158,26 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i
|
|||
/* 2 user sgprs will always be allocated for scratch/rings */
|
||||
user_sgpr_count += 2;
|
||||
|
||||
if (stage == MESA_SHADER_TASK)
|
||||
user_sgpr_count += 2; /* task descriptors */
|
||||
|
||||
/* prolog inputs */
|
||||
if (info->vs.has_prolog)
|
||||
user_sgpr_count += 2;
|
||||
|
||||
switch (stage) {
|
||||
case MESA_SHADER_COMPUTE:
|
||||
case MESA_SHADER_TASK:
|
||||
if (info->cs.uses_sbt)
|
||||
user_sgpr_count += 1;
|
||||
if (info->cs.uses_grid_size)
|
||||
user_sgpr_count += args->load_grid_size_from_user_sgpr ? 3 : 2;
|
||||
if (info->cs.uses_ray_launch_size)
|
||||
user_sgpr_count += 3;
|
||||
if (info->vs.needs_draw_id)
|
||||
user_sgpr_count += 1;
|
||||
if (info->cs.uses_task_rings)
|
||||
user_sgpr_count += 4; /* ring_entry, 2x ib_addr, ib_stride */
|
||||
break;
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
break;
|
||||
|
@ -212,7 +221,8 @@ allocate_user_sgprs(enum chip_class chip_class, const struct radv_shader_info *i
|
|||
if (info->so.num_outputs)
|
||||
user_sgpr_count++;
|
||||
|
||||
uint32_t available_sgprs = chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16;
|
||||
uint32_t available_sgprs =
|
||||
chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_TASK ? 32 : 16;
|
||||
uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
|
||||
uint32_t num_desc_set = util_bitcount(info->desc_set_used_mask);
|
||||
|
||||
|
@ -527,6 +537,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
|
|||
if (args->explicit_scratch_args) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ring_offsets);
|
||||
}
|
||||
if (stage == MESA_SHADER_TASK) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->task_ring_offsets);
|
||||
}
|
||||
|
||||
/* To ensure prologs match the main VS, VS specific input SGPRs have to be placed before other
|
||||
* sgprs.
|
||||
|
@ -534,6 +547,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
|
|||
|
||||
switch (stage) {
|
||||
case MESA_SHADER_COMPUTE:
|
||||
case MESA_SHADER_TASK:
|
||||
declare_global_input_sgprs(info, &user_sgpr_info, args);
|
||||
|
||||
if (info->cs.uses_sbt) {
|
||||
|
@ -551,6 +565,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
|
|||
ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.ray_launch_size);
|
||||
}
|
||||
|
||||
if (info->vs.needs_draw_id) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
|
||||
}
|
||||
|
||||
if (info->cs.uses_task_rings) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.task_ring_entry);
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->task_ib_addr);
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->task_ib_stride);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (info->cs.uses_block_id[i]) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
|
||||
|
@ -750,6 +774,9 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
|
|||
uint8_t user_sgpr_idx = 0;
|
||||
|
||||
set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx);
|
||||
if (stage == MESA_SHADER_TASK) {
|
||||
set_loc_shader_ptr(args, AC_UD_CS_TASK_RING_OFFSETS, &user_sgpr_idx);
|
||||
}
|
||||
|
||||
/* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
|
||||
* the rw_buffers at s0/s1). With user SGPR0 = s8, let's restart the count from 0. */
|
||||
|
@ -765,6 +792,7 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
|
|||
|
||||
switch (stage) {
|
||||
case MESA_SHADER_COMPUTE:
|
||||
case MESA_SHADER_TASK:
|
||||
if (args->ac.sbt_descriptors.used) {
|
||||
set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx);
|
||||
}
|
||||
|
@ -775,6 +803,16 @@ radv_declare_shader_args(enum chip_class chip_class, const struct radv_pipeline_
|
|||
if (args->ac.ray_launch_size.used) {
|
||||
set_loc_shader(args, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3);
|
||||
}
|
||||
if (args->ac.draw_id.used) {
|
||||
set_loc_shader(args, AC_UD_CS_TASK_DRAW_ID, &user_sgpr_idx, 1);
|
||||
}
|
||||
if (args->ac.task_ring_entry.used) {
|
||||
set_loc_shader(args, AC_UD_TASK_RING_ENTRY, &user_sgpr_idx, 1);
|
||||
}
|
||||
if (args->task_ib_addr.used) {
|
||||
assert(args->task_ib_stride.used);
|
||||
set_loc_shader(args, AC_UD_CS_TASK_IB, &user_sgpr_idx, 3);
|
||||
}
|
||||
break;
|
||||
case MESA_SHADER_VERTEX:
|
||||
if (args->ac.view_index.used)
|
||||
|
|
|
@ -36,7 +36,10 @@ struct radv_shader_args {
|
|||
struct ac_shader_args ac;
|
||||
|
||||
struct ac_arg descriptor_sets[MAX_SETS];
|
||||
/* User data 0/1. GFX: descriptor list, Compute: scratch BO */
|
||||
struct ac_arg ring_offsets;
|
||||
/* User data 2/3. Same as the descriptor list above, but for task shaders. */
|
||||
struct ac_arg task_ring_offsets;
|
||||
|
||||
/* Streamout */
|
||||
struct ac_arg streamout_buffers;
|
||||
|
@ -47,6 +50,10 @@ struct radv_shader_args {
|
|||
struct ac_arg ngg_viewport_scale[2];
|
||||
struct ac_arg ngg_viewport_translate[2];
|
||||
|
||||
/* Task shaders */
|
||||
struct ac_arg task_ib_addr;
|
||||
struct ac_arg task_ib_stride;
|
||||
|
||||
struct ac_arg prolog_inputs;
|
||||
struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS];
|
||||
|
||||
|
|
Loading…
Reference in New Issue