From 9dc7fff4489f2f17fbba4c47d3252b7114de6a4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sat, 8 May 2021 07:55:05 -0400
Subject: [PATCH] radeonsi: allow changing the NGG subgroup size to 256 but
 don't change it yet

Currently, 128 seems to have the best performance.

Acked-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 15 ++++++---------
 src/gallium/drivers/radeonsi/si_pipe.c | 1 +
 src/gallium/drivers/radeonsi/si_pipe.h | 1 +
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index b1611704e57..d72c72e748a 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -796,7 +796,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
    struct si_shader_selector *sel = shader->selector;
    struct si_shader_info *info = &sel->info;
    LLVMBuilderRef builder = ctx->ac.builder;
-   unsigned subgroup_size = 128;
+   unsigned subgroup_size = ctx->screen->ngg_subgroup_size;
    unsigned max_waves = ctx->ac.wave_size == 64 ? DIV_ROUND_UP(subgroup_size, 64)
                                                 : DIV_ROUND_UP(subgroup_size, 32);
 
@@ -2018,18 +2018,15 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
    /* All these are per subgroup: */
    const unsigned min_esverts = gs_sel->screen->info.chip_class >= GFX10_3 ? 29 : 24;
    bool max_vert_out_per_gs_instance = false;
-   unsigned max_gsprims_base = 128; /* default prim group size clamp */
-   unsigned max_esverts_base = 128;
+   unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */
+   unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size;
 
    if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
-      /* Exactly 1 wave32 executes culling in primitive threads (there is no
-       * divergence), other waves are idle.
-       */
-      max_gsprims_base = 32;
+      /* All lanes are filled in wave32. */
+      max_gsprims_base = ROUND_DOWN_TO(max_gsprims_base / 3, 32);
       max_esverts_base = max_gsprims_base * 3;
    } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
-      max_gsprims_base = 126;
-      max_esverts_base = 128;
+      max_gsprims_base = max_esverts_base - 2;
    }
 
    if (gs_stage == MESA_SHADER_GEOMETRY) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 6eb9f02fc92..6d6bb19e4fa 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1300,6 +1300,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
       }
    }
 
+   sscreen->ngg_subgroup_size = 128;
    sscreen->ge_wave_size = 64;
    sscreen->ps_wave_size = 64;
    sscreen->compute_wave_size = 64;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 64217e528bc..b4c37565168 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -665,6 +665,7 @@ struct si_screen {
    unsigned compute_wave_size;
    unsigned ps_wave_size;
    unsigned ge_wave_size;
+   unsigned ngg_subgroup_size;
 };
 
 struct si_sampler_view {