diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h
index cf07c2fbe43..306ec92950d 100644
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
@@ -1,3 +1,4 @@
+OPT_BOOL(inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
 OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
 OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
 OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps")
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 9ea61bef2ed..8f688fa3650 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1173,6 +1173,11 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad
          }
          si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
       }
+
+      if (slot == 0) {
+         /* Invalidate current inlinable uniforms. */
+         sctx->inlinable_uniforms_valid_mask &= ~(1 << shader);
+      }
    }
 
    slot = si_get_constbuf_slot(slot);
@@ -1180,6 +1185,30 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad
                           si_const_and_shader_buffer_descriptors_idx(shader), slot, input);
 }
 
+/**
+ * Store new values of the inlinable uniforms of one shader stage.
+ *
+ * The values are copied into the context and the stage is flagged as both
+ * dirty and valid, which si_draw_vbo checks to request a shader update.
+ *
+ * At most MAX_INLINABLE_UNIFORMS values are kept, because that is the size
+ * of the per-stage storage in si_context; extra values are ignored.
+ */
+static void si_set_inlinable_constants(struct pipe_context *ctx,
+                                       enum pipe_shader_type shader,
+                                       uint num_values, uint32_t *values)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+
+   /* Never copy past the end of inlinable_uniforms[shader]. */
+   if (num_values > MAX_INLINABLE_UNIFORMS)
+      num_values = MAX_INLINABLE_UNIFORMS;
+
+   memcpy(sctx->inlinable_uniforms[shader], values, num_values * sizeof(values[0]));
+   sctx->inlinable_uniforms_dirty_mask |= 1 << shader;
+   sctx->inlinable_uniforms_valid_mask |= 1 << shader;
+}
+
 void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
                                  struct pipe_constant_buffer *cbuf)
 {
@@ -2586,6 +2615,7 @@ void si_init_all_descriptors(struct si_context *sctx)
    sctx->b.bind_sampler_states = si_bind_sampler_states;
    sctx->b.set_shader_images = si_set_shader_images;
    sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
+   sctx->b.set_inlinable_constants = si_set_inlinable_constants;
    sctx->b.set_shader_buffers = si_set_shader_buffers;
    sctx->b.set_sampler_views = si_set_sampler_views;
    sctx->b.create_texture_handle = si_create_texture_handle;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index e5c6900c1d6..d6b86dc9f18 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1052,6 +1052,10 @@ struct si_context {
    unsigned descriptors_dirty;
    unsigned shader_pointers_dirty;
    unsigned shader_needs_decompress_mask;
+   unsigned shader_has_inlinable_uniforms_mask;
+   unsigned inlinable_uniforms_dirty_mask;
+   unsigned inlinable_uniforms_valid_mask;
+   uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS];
    struct si_buffer_resources rw_buffers;
    struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS];
    struct si_samplers samplers[SI_NUM_SHADERS];
diff --git 
a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index f6a592fead9..302bd72666d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1591,7 +1591,9 @@ static bool si_should_optimize_less(struct ac_llvm_compiler *compiler, return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000; } -static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir) +static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, + const struct si_shader_key *key, + bool *free_nir) { nir_shader *nir; *free_nir = false; @@ -1611,6 +1613,60 @@ static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *f return NULL; } + if (key && key->opt.inline_uniforms) { + assert(*free_nir); + + /* Most places use shader information from the default variant, not + * the optimized variant. These are the things that the driver looks at + * in optimized variants and the list of things that we need to do. + * + * The driver takes into account these things if they suddenly disappear + * from the shader code: + * - Register usage and code size decrease (obvious) + * - Eliminated PS system values are disabled by LLVM + * (FragCoord, FrontFace, barycentrics) + * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef. + * (thanks to an LLVM pass in Mesa - TODO: move it to NIR) + * The storage for eliminated outputs is also not allocated. + * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM) + * - TCS output stores are eliminated + * + * TODO: These are things the driver ignores in the final shader code + * and relies on the default shader info. 
+ * - Other system values are not eliminated + * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs + * to remove holes + * - uses_discard - if it changed to false + * - writes_memory - if it changed to false + * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not + * eliminated + * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS) + * GS outputs are eliminated except for the temporary LDS. + * Clip distances, gl_PointSize, and PS outputs are eliminated based + * on current states, so we don't care about the shader code. + * + * TODO: Merged shaders don't inline uniforms for the first stage. + * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS. + * (key == NULL for the first stage here) + * + * TODO: Compute shaders don't support inlinable uniforms, because they + * don't have shader variants. + * + * TODO: The driver uses a linear search to find a shader variant. This + * can be really slow if we get too many variants due to uniform inlining. + */ + NIR_PASS_V(nir, nir_inline_uniforms, + nir->info.num_inlinable_uniforms, + key->opt.inlined_uniform_values, + nir->info.inlinable_uniform_dw_offsets); + + si_nir_opts(sel->screen, nir, true); + + /* This must be done again. 
*/ + NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | + nir_var_shader_out); + } + return nir; } @@ -1697,7 +1753,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com parts[3] = ctx.main_fn; /* VS as LS main part */ - nir = get_nir_shader(ls, &free_nir); + nir = get_nir_shader(ls, NULL, &free_nir); struct si_shader shader_ls = {}; shader_ls.selector = ls; shader_ls.key.as_ls = 1; @@ -1759,7 +1815,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com gs_prolog = ctx.main_fn; /* ES main part */ - nir = get_nir_shader(es, &free_nir); + nir = get_nir_shader(es, NULL, &free_nir); struct si_shader shader_es = {}; shader_es.selector = es; shader_es.key.as_es = 1; @@ -1849,7 +1905,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi { struct si_shader_selector *sel = shader->selector; bool free_nir; - struct nir_shader *nir = get_nir_shader(sel, &free_nir); + struct nir_shader *nir = get_nir_shader(sel, &shader->key, &free_nir); /* Dump NIR before doing NIR->LLVM conversion in case the * conversion fails. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index dda56066615..4c523efe657 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -418,6 +418,7 @@ struct si_shader_selector { struct pipe_stream_output_info so; struct si_shader_info info; + enum pipe_shader_type pipe_shader_type; ubyte const_and_shader_buf_descriptors_index; ubyte sampler_and_images_descriptors_index; bool vs_needs_prolog; @@ -672,6 +673,9 @@ struct si_shader_key { unsigned cs_cull_back : 1; unsigned cs_cull_z : 1; unsigned cs_halfz_clip_space : 1; + unsigned inline_uniforms:1; + + uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS]; } opt; }; @@ -847,6 +851,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, /* si_shader_nir.c */ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); +void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first); void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize); /* si_state_shaders.c */ diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 472243cfc44..eddf4383bdc 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -428,7 +428,7 @@ static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data) return true; } -static void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first) +void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first) { bool progress; @@ -723,4 +723,7 @@ void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize) si_lower_io(nir); si_lower_nir(sscreen, nir); nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + if (sscreen->options.inline_uniforms) + nir_find_inlinable_uniforms(nir); } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c index 43bfe41f53d..f9fc4728b86 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -2032,6 +2032,14 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i sctx->do_update_shaders = true; } + if (sctx->shader_has_inlinable_uniforms_mask & + sctx->inlinable_uniforms_valid_mask & + sctx->inlinable_uniforms_dirty_mask) { + sctx->do_update_shaders = true; + /* If inlinable uniforms are not valid, they are also not dirty, so clear all bits. */ + sctx->inlinable_uniforms_dirty_mask = 0; + } + if (unlikely(sctx->do_update_shaders && !si_update_shaders(sctx))) goto return_cleanup; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index b22a9d5106c..bdb151ee96a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1833,6 +1833,15 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh memset(key, 0, sizeof(*key)); + unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms; + if (num_inlinable_uniforms && + sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) { + key->opt.inline_uniforms = true; + memcpy(key->opt.inlined_uniform_values, + sctx->inlinable_uniforms[sel->pipe_shader_type], + num_inlinable_uniforms * 4); + } + switch (sel->info.stage) { case MESA_SHADER_VERTEX: si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog); @@ -2635,6 +2644,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, si_nir_scan_shader(sel->nir, &sel->info); const enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->info.stage); + sel->pipe_shader_type = type; sel->const_and_shader_buf_descriptors_index = si_const_and_shader_buffer_descriptors_idx(type); sel->sampler_and_images_descriptors_index = @@ -2931,7 +2941,8 @@ static void 
si_update_clip_regs(struct si_context *sctx, struct si_shader_select si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); } -static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel) +static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel, + enum pipe_shader_type type) { si_set_active_descriptors_for_shader(sctx, sel); @@ -2945,6 +2956,15 @@ static void si_update_common_shader_state(struct si_context *sctx, struct si_sha si_shader_uses_bindless_images(sctx->ps_shader.cso) || si_shader_uses_bindless_images(sctx->tcs_shader.cso) || si_shader_uses_bindless_images(sctx->tes_shader.cso); + + if (sel && sel->info.base.num_inlinable_uniforms) + sctx->shader_has_inlinable_uniforms_mask |= 1 << type; + else + sctx->shader_has_inlinable_uniforms_mask &= ~(1 << type); + + /* Invalidate inlinable uniforms. */ + sctx->inlinable_uniforms_valid_mask &= ~(1 << type); + sctx->do_update_shaders = true; } @@ -2965,7 +2985,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) if (si_update_ngg(sctx)) si_shader_change_notify(sctx); - si_update_common_shader_state(sctx, sel); + si_update_common_shader_state(sctx, sel, PIPE_SHADER_VERTEX); si_update_vs_viewport_state(sctx); si_update_streamout_state(sctx); si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, @@ -3030,7 +3050,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state) sctx->gs_shader.current = sel ? sel->first_variant : NULL; sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; - si_update_common_shader_state(sctx, sel); + si_update_common_shader_state(sctx, sel, PIPE_SHADER_GEOMETRY); sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ ngg_changed = si_update_ngg(sctx); @@ -3059,7 +3079,7 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) sctx->tcs_shader.current = sel ? 
sel->first_variant : NULL; si_update_tess_uses_prim_id(sctx); - si_update_common_shader_state(sctx, sel); + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL); if (enable_changed) sctx->last_tcs = NULL; /* invalidate derived tess state */ @@ -3081,7 +3101,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; si_update_tess_uses_prim_id(sctx); - si_update_common_shader_state(sctx, sel); + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL); sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ bool ngg_changed = si_update_ngg(sctx); @@ -3108,7 +3128,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) sctx->ps_shader.cso = sel; sctx->ps_shader.current = sel ? sel->first_variant : NULL; - si_update_common_shader_state(sctx, sel); + si_update_common_shader_state(sctx, sel, PIPE_SHADER_FRAGMENT); if (sel) { if (sctx->ia_multi_vgt_param_key.u.uses_tess) si_update_tess_uses_prim_id(sctx);