radeonsi: implement inlinable uniforms

This improves performance for uber shaders.

It is disabled by default and must be enabled using the new "inline_uniforms" driconf option.

The driver compiles the specialized shaders in another thread without stalling,
just as it does for all other shader optimizations.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7057>
This commit is contained in:
Marek Olšák 2020-09-29 17:43:24 -04:00 committed by Marge Bot
parent 6d058ac6c9
commit b7501184b9
8 changed files with 125 additions and 11 deletions

View File

@ -1,3 +1,4 @@
OPT_BOOL(inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps")

View File

@ -1173,6 +1173,11 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad
}
si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
}
if (slot == 0) {
/* Invalidate current inlinable uniforms. */
sctx->inlinable_uniforms_valid_mask &= ~(1 << shader);
}
}
slot = si_get_constbuf_slot(slot);
@ -1180,6 +1185,17 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad
si_const_and_shader_buffer_descriptors_idx(shader), slot, input);
}
/**
 * Store the current values of the inlinable uniforms for one shader stage.
 *
 * Installed as the context's set_inlinable_constants hook. The values are
 * copied into the per-stage array in si_context, and the stage's bit is set
 * in both the "dirty" and the "valid" masks.
 *
 * \param ctx         the pipe context (an si_context)
 * \param shader      shader stage whose uniforms are being set
 * \param num_values  number of dwords to copy from \p values
 * \param values      the uniform values, one dword each
 *
 * NOTE(review): num_values is not checked against MAX_INLINABLE_UNIFORMS
 * here; presumably the caller guarantees it fits - confirm.
 */
static void si_set_inlinable_constants(struct pipe_context *ctx,
                                       enum pipe_shader_type shader,
                                       uint num_values, uint32_t *values)
{
   struct si_context *sctx = (struct si_context *)ctx;
   const unsigned stage_bit = 1 << shader;

   /* Each value is one dword (4 bytes). */
   memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4);

   /* The stage now has usable values, and they differ from whatever was
    * last consumed, so flag it in both masks. */
   sctx->inlinable_uniforms_valid_mask |= stage_bit;
   sctx->inlinable_uniforms_dirty_mask |= stage_bit;
}
void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
struct pipe_constant_buffer *cbuf)
{
@ -2586,6 +2602,7 @@ void si_init_all_descriptors(struct si_context *sctx)
sctx->b.bind_sampler_states = si_bind_sampler_states;
sctx->b.set_shader_images = si_set_shader_images;
sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
sctx->b.set_inlinable_constants = si_set_inlinable_constants;
sctx->b.set_shader_buffers = si_set_shader_buffers;
sctx->b.set_sampler_views = si_set_sampler_views;
sctx->b.create_texture_handle = si_create_texture_handle;

View File

@ -1052,6 +1052,10 @@ struct si_context {
unsigned descriptors_dirty;
unsigned shader_pointers_dirty;
unsigned shader_needs_decompress_mask;
unsigned shader_has_inlinable_uniforms_mask;
unsigned inlinable_uniforms_dirty_mask;
unsigned inlinable_uniforms_valid_mask;
uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS];
struct si_buffer_resources rw_buffers;
struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS];
struct si_samplers samplers[SI_NUM_SHADERS];

View File

@ -1591,7 +1591,9 @@ static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
}
static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
const struct si_shader_key *key,
bool *free_nir)
{
nir_shader *nir;
*free_nir = false;
@ -1611,6 +1613,60 @@ static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *f
return NULL;
}
if (key && key->opt.inline_uniforms) {
assert(*free_nir);
/* Most places use shader information from the default variant, not
* the optimized variant. These are the things that the driver looks at
* in optimized variants and the list of things that we need to do.
*
* The driver takes into account these things if they suddenly disappear
* from the shader code:
* - Register usage and code size decrease (obvious)
* - Eliminated PS system values are disabled by LLVM
* (FragCoord, FrontFace, barycentrics)
* - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
* (thanks to an LLVM pass in Mesa - TODO: move it to NIR)
* The storage for eliminated outputs is also not allocated.
* - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
* - TCS output stores are eliminated
*
* TODO: These are things the driver ignores in the final shader code
* and relies on the default shader info.
* - Other system values are not eliminated
* - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
* to remove holes
* - uses_discard - if it changed to false
* - writes_memory - if it changed to false
* - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
* eliminated
* - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
* GS outputs are eliminated except for the temporary LDS.
* Clip distances, gl_PointSize, and PS outputs are eliminated based
* on current states, so we don't care about the shader code.
*
* TODO: Merged shaders don't inline uniforms for the first stage.
* VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
* (key == NULL for the first stage here)
*
* TODO: Compute shaders don't support inlinable uniforms, because they
* don't have shader variants.
*
* TODO: The driver uses a linear search to find a shader variant. This
* can be really slow if we get too many variants due to uniform inlining.
*/
NIR_PASS_V(nir, nir_inline_uniforms,
nir->info.num_inlinable_uniforms,
key->opt.inlined_uniform_values,
nir->info.inlinable_uniform_dw_offsets);
si_nir_opts(sel->screen, nir, true);
/* This must be done again. */
NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in |
nir_var_shader_out);
}
return nir;
}
@ -1697,7 +1753,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
parts[3] = ctx.main_fn;
/* VS as LS main part */
nir = get_nir_shader(ls, &free_nir);
nir = get_nir_shader(ls, NULL, &free_nir);
struct si_shader shader_ls = {};
shader_ls.selector = ls;
shader_ls.key.as_ls = 1;
@ -1759,7 +1815,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
gs_prolog = ctx.main_fn;
/* ES main part */
nir = get_nir_shader(es, &free_nir);
nir = get_nir_shader(es, NULL, &free_nir);
struct si_shader shader_es = {};
shader_es.selector = es;
shader_es.key.as_es = 1;
@ -1849,7 +1905,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
{
struct si_shader_selector *sel = shader->selector;
bool free_nir;
struct nir_shader *nir = get_nir_shader(sel, &free_nir);
struct nir_shader *nir = get_nir_shader(sel, &shader->key, &free_nir);
/* Dump NIR before doing NIR->LLVM conversion in case the
* conversion fails. */

View File

@ -418,6 +418,7 @@ struct si_shader_selector {
struct pipe_stream_output_info so;
struct si_shader_info info;
enum pipe_shader_type pipe_shader_type;
ubyte const_and_shader_buf_descriptors_index;
ubyte sampler_and_images_descriptors_index;
bool vs_needs_prolog;
@ -672,6 +673,9 @@ struct si_shader_key {
unsigned cs_cull_back : 1;
unsigned cs_cull_z : 1;
unsigned cs_halfz_clip_space : 1;
unsigned inline_uniforms:1;
uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
} opt;
};
@ -847,6 +851,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
/* si_shader_nir.c */
void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info);
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize);
/* si_state_shaders.c */

View File

@ -428,7 +428,7 @@ static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
return true;
}
static void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
bool progress;
@ -723,4 +723,7 @@ void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize)
si_lower_io(nir);
si_lower_nir(sscreen, nir);
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
if (sscreen->options.inline_uniforms)
nir_find_inlinable_uniforms(nir);
}

View File

@ -2032,6 +2032,14 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
sctx->do_update_shaders = true;
}
if (sctx->shader_has_inlinable_uniforms_mask &
sctx->inlinable_uniforms_valid_mask &
sctx->inlinable_uniforms_dirty_mask) {
sctx->do_update_shaders = true;
/* If inlinable uniforms are not valid, they are also not dirty, so clear all bits. */
sctx->inlinable_uniforms_dirty_mask = 0;
}
if (unlikely(sctx->do_update_shaders && !si_update_shaders(sctx)))
goto return_cleanup;

View File

@ -1833,6 +1833,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh
memset(key, 0, sizeof(*key));
unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms;
if (num_inlinable_uniforms &&
sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) {
key->opt.inline_uniforms = true;
memcpy(key->opt.inlined_uniform_values,
sctx->inlinable_uniforms[sel->pipe_shader_type],
num_inlinable_uniforms * 4);
}
switch (sel->info.stage) {
case MESA_SHADER_VERTEX:
si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
@ -2635,6 +2644,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
si_nir_scan_shader(sel->nir, &sel->info);
const enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->info.stage);
sel->pipe_shader_type = type;
sel->const_and_shader_buf_descriptors_index =
si_const_and_shader_buffer_descriptors_idx(type);
sel->sampler_and_images_descriptors_index =
@ -2931,7 +2941,8 @@ static void si_update_clip_regs(struct si_context *sctx, struct si_shader_select
si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
}
static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel)
static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel,
enum pipe_shader_type type)
{
si_set_active_descriptors_for_shader(sctx, sel);
@ -2945,6 +2956,15 @@ static void si_update_common_shader_state(struct si_context *sctx, struct si_sha
si_shader_uses_bindless_images(sctx->ps_shader.cso) ||
si_shader_uses_bindless_images(sctx->tcs_shader.cso) ||
si_shader_uses_bindless_images(sctx->tes_shader.cso);
if (sel && sel->info.base.num_inlinable_uniforms)
sctx->shader_has_inlinable_uniforms_mask |= 1 << type;
else
sctx->shader_has_inlinable_uniforms_mask &= ~(1 << type);
/* Invalidate inlinable uniforms. */
sctx->inlinable_uniforms_valid_mask &= ~(1 << type);
sctx->do_update_shaders = true;
}
@ -2965,7 +2985,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
if (si_update_ngg(sctx))
si_shader_change_notify(sctx);
si_update_common_shader_state(sctx, sel);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_VERTEX);
si_update_vs_viewport_state(sctx);
si_update_streamout_state(sctx);
si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
@ -3030,7 +3050,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
sctx->gs_shader.current = sel ? sel->first_variant : NULL;
sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
si_update_common_shader_state(sctx, sel);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_GEOMETRY);
sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
ngg_changed = si_update_ngg(sctx);
@ -3059,7 +3079,7 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
si_update_tess_uses_prim_id(sctx);
si_update_common_shader_state(sctx, sel);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL);
if (enable_changed)
sctx->last_tcs = NULL; /* invalidate derived tess state */
@ -3081,7 +3101,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
si_update_tess_uses_prim_id(sctx);
si_update_common_shader_state(sctx, sel);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL);
sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
bool ngg_changed = si_update_ngg(sctx);
@ -3108,7 +3128,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
sctx->ps_shader.cso = sel;
sctx->ps_shader.current = sel ? sel->first_variant : NULL;
si_update_common_shader_state(sctx, sel);
si_update_common_shader_state(sctx, sel, PIPE_SHADER_FRAGMENT);
if (sel) {
if (sctx->ia_multi_vgt_param_key.u.uses_tess)
si_update_tess_uses_prim_id(sctx);