radeonsi: allow changing the NGG subgroup size to 256 but don't change it yet
Currently, 128 seems to have the best performance.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10813>
This commit is contained in:
parent
13acbaecd8
commit
9dc7fff448
|
@ -796,7 +796,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
|
||||||
struct si_shader_selector *sel = shader->selector;
|
struct si_shader_selector *sel = shader->selector;
|
||||||
struct si_shader_info *info = &sel->info;
|
struct si_shader_info *info = &sel->info;
|
||||||
LLVMBuilderRef builder = ctx->ac.builder;
|
LLVMBuilderRef builder = ctx->ac.builder;
|
||||||
unsigned subgroup_size = 128;
|
unsigned subgroup_size = ctx->screen->ngg_subgroup_size;
|
||||||
unsigned max_waves = ctx->ac.wave_size == 64 ? DIV_ROUND_UP(subgroup_size, 64) :
|
unsigned max_waves = ctx->ac.wave_size == 64 ? DIV_ROUND_UP(subgroup_size, 64) :
|
||||||
DIV_ROUND_UP(subgroup_size, 32);
|
DIV_ROUND_UP(subgroup_size, 32);
|
||||||
|
|
||||||
|
@ -2018,18 +2018,15 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
|
||||||
/* All these are per subgroup: */
|
/* All these are per subgroup: */
|
||||||
const unsigned min_esverts = gs_sel->screen->info.chip_class >= GFX10_3 ? 29 : 24;
|
const unsigned min_esverts = gs_sel->screen->info.chip_class >= GFX10_3 ? 29 : 24;
|
||||||
bool max_vert_out_per_gs_instance = false;
|
bool max_vert_out_per_gs_instance = false;
|
||||||
unsigned max_gsprims_base = 128; /* default prim group size clamp */
|
unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */
|
||||||
unsigned max_esverts_base = 128;
|
unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size;
|
||||||
|
|
||||||
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
|
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
|
||||||
/* Exactly 1 wave32 executes culling in primitive threads (there is no
|
/* All lanes are filled in wave32. */
|
||||||
* divergence), other waves are idle.
|
max_gsprims_base = ROUND_DOWN_TO(max_gsprims_base / 3, 32);
|
||||||
*/
|
|
||||||
max_gsprims_base = 32;
|
|
||||||
max_esverts_base = max_gsprims_base * 3;
|
max_esverts_base = max_gsprims_base * 3;
|
||||||
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
|
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
|
||||||
max_gsprims_base = 126;
|
max_gsprims_base = max_esverts_base - 2;
|
||||||
max_esverts_base = 128;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (gs_stage == MESA_SHADER_GEOMETRY) {
|
if (gs_stage == MESA_SHADER_GEOMETRY) {
|
||||||
|
|
|
@ -1300,6 +1300,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sscreen->ngg_subgroup_size = 128;
|
||||||
sscreen->ge_wave_size = 64;
|
sscreen->ge_wave_size = 64;
|
||||||
sscreen->ps_wave_size = 64;
|
sscreen->ps_wave_size = 64;
|
||||||
sscreen->compute_wave_size = 64;
|
sscreen->compute_wave_size = 64;
|
||||||
|
|
|
@ -665,6 +665,7 @@ struct si_screen {
|
||||||
unsigned compute_wave_size;
|
unsigned compute_wave_size;
|
||||||
unsigned ps_wave_size;
|
unsigned ps_wave_size;
|
||||||
unsigned ge_wave_size;
|
unsigned ge_wave_size;
|
||||||
|
unsigned ngg_subgroup_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct si_sampler_view {
|
struct si_sampler_view {
|
||||||
|
|
Loading…
Reference in New Issue