From 7f6262bb85cbe39472f2b26d812629368a0eae3b Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 29 Jul 2021 16:47:44 +0100 Subject: [PATCH] radv: allow holes in inline push constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a dword mask instead of a range to track which push constants to inline. fossil-db (Sienna Cichlid): Totals from 5724 (4.25% of 134621) affected shaders: CodeSize: 20894044 -> 20815748 (-0.37%); split: -0.39%, +0.02% Instrs: 4002568 -> 3988385 (-0.35%); split: -0.38%, +0.02% Latency: 29285060 -> 29224414 (-0.21%); split: -0.22%, +0.01% InvThroughput: 5529700 -> 5526893 (-0.05%); split: -0.05%, +0.00% VClause: 78093 -> 78240 (+0.19%); split: -0.23%, +0.41% SClause: 135495 -> 131027 (-3.30%); split: -3.30%, +0.00% Copies: 330856 -> 324552 (-1.91%); split: -2.37%, +0.46% PreSGPRs: 226031 -> 224778 (-0.55%); split: -0.61%, +0.05% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/.clang-format | 3 +- src/amd/common/ac_shader_args.h | 2 +- .../compiler/aco_instruction_selection.cpp | 13 ++-- src/amd/llvm/ac_nir_to_llvm.c | 22 +++---- src/amd/vulkan/radv_cmd_buffer.c | 21 ++++++- src/amd/vulkan/radv_pipeline.c | 6 ++ src/amd/vulkan/radv_shader.c | 1 + src/amd/vulkan/radv_shader.h | 6 +- src/amd/vulkan/radv_shader_args.c | 59 ++++++++----------- src/amd/vulkan/radv_shader_info.c | 26 ++++---- 10 files changed, 81 insertions(+), 78 deletions(-) diff --git a/src/amd/.clang-format b/src/amd/.clang-format index 6cf07d558fc..db9ecef76ce 100644 --- a/src/amd/.clang-format +++ b/src/amd/.clang-format @@ -52,7 +52,8 @@ ForEachMacros: - nir_foreach_variable_in_list - nir_foreach_src - foreach_two_lists - - foreach_bit + - u_foreach_bit + - u_foreach_bit64 - foreach_sched_node - foreach_src - foreach_src_n diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h index e0430efe015..e8d743cd961 100644 --- a/src/amd/common/ac_shader_args.h +++ b/src/amd/common/ac_shader_args.h @@ -139,7 +139,7 @@ struct ac_shader_args { /* Vulkan only */ struct ac_arg push_constants; struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; - unsigned base_inline_push_consts; + uint64_t inline_push_const_mask; struct ac_arg view_index; struct ac_arg sbt_descriptors; struct ac_arg ray_launch_size; diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index cc30bfbdcc6..e29817b8450 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5501,18 +5501,17 @@ visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr) nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]); if (index_cv && instr->dest.ssa.bit_size == 32) { - const struct radv_userdata_info *loc = - &ctx->program->info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS]; unsigned start = (offset + index_cv->u32) / 4u; - unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0; - - start -= ctx->program->info->min_push_constant_used / 4; - if (start + count <= num_inline_push_consts) { + uint64_t mask = BITFIELD64_MASK(count) << start; + if ((ctx->args->ac.inline_push_const_mask | mask) == ctx->args->ac.inline_push_const_mask && + start + count <= (sizeof(ctx->args->ac.inline_push_const_mask) * 8u)) { std::array elems; aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; + unsigned arg_index = + util_bitcount64(ctx->args->ac.inline_push_const_mask & BITFIELD64_MASK(start)); for (unsigned i = 0; i < count; ++i) { - elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]); + elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[arg_index++]); vec->operands[i] = Operand{elems[i]}; } vec->definitions[0] = Definition(dst); diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 1789073b172..1b3caabb6ad 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -1662,19 +1662,15 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_int offset += LLVMConstIntGetZExtValue(src0); offset /= 4; - offset -= ctx->args->base_inline_push_consts; - - unsigned num_inline_push_consts = 0; - for (unsigned i = 0; i < ARRAY_SIZE(ctx->args->inline_push_consts); i++) { - if (ctx->args->inline_push_consts[i].used) - num_inline_push_consts++; - } - - if (offset + count <= num_inline_push_consts) { - LLVMValueRef *const push_constants = alloca(num_inline_push_consts * sizeof(LLVMValueRef)); - for (unsigned i = 0; i < num_inline_push_consts; i++) - push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[i]); - return ac_build_gather_values(&ctx->ac, push_constants + offset, count); + uint64_t mask = BITFIELD64_MASK(count) << offset; + if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask && + offset + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) { + LLVMValueRef *const push_constants = alloca(count * sizeof(LLVMValueRef)); + unsigned arg_index = + util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(offset)); + for (unsigned i = 0; i < count; i++) + push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[arg_index++]); + return ac_build_gather_values(&ctx->ac, push_constants, count); } } diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index bedc13ce164..8e0f1493fc7 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -38,6 +38,7 @@ #include "vk_common_entrypoints.h" #include "ac_debug.h" +#include "ac_shader_args.h" #include "util/fast_idiv_by_const.h" @@ -3277,10 +3278,24 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag need_push_constants |= radv_shader_loads_push_constants(pipeline, stage); - uint8_t base = shader->info.min_push_constant_used / 4; + uint64_t mask = shader->info.inline_push_constant_mask; + if (!mask) + continue; - radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, - (uint32_t *)cmd_buffer->push_constants + base); + uint8_t base = ffs(mask) - 1; + if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) { + /* consecutive inline push constants */ + radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, + (uint32_t *)cmd_buffer->push_constants + base); + } else { + /* sparse inline push constants */ + uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS]; + unsigned num_consts = 0; + u_foreach_bit64 (idx, mask) + consts[num_consts++] = ((uint32_t *)cmd_buffer->push_constants)[idx]; + radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, + consts); + } } if (need_push_constants) { diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 69efd890996..c93123d7ea2 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3408,6 +3408,8 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag MESA_SHADER_TESS_CTRL, true, MESA_SHADER_VERTEX, &stages[MESA_SHADER_TESS_CTRL].args); stages[MESA_SHADER_TESS_CTRL].info.user_sgprs_locs = stages[MESA_SHADER_TESS_CTRL].args.user_sgprs_locs; + stages[MESA_SHADER_TESS_CTRL].info.inline_push_constant_mask = + stages[MESA_SHADER_TESS_CTRL].args.ac.inline_push_const_mask; stages[MESA_SHADER_VERTEX].args = stages[MESA_SHADER_TESS_CTRL].args; active_stages &= ~(1 << MESA_SHADER_VERTEX); @@ -3420,6 +3422,8 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag radv_declare_shader_args(chip_class, pipeline_key, &stages[MESA_SHADER_GEOMETRY].info, MESA_SHADER_GEOMETRY, true, pre_stage, &stages[MESA_SHADER_GEOMETRY].args); stages[MESA_SHADER_GEOMETRY].info.user_sgprs_locs = stages[MESA_SHADER_GEOMETRY].args.user_sgprs_locs; + stages[MESA_SHADER_GEOMETRY].info.inline_push_constant_mask = + stages[MESA_SHADER_GEOMETRY].args.ac.inline_push_const_mask; stages[pre_stage].args = stages[MESA_SHADER_GEOMETRY].args; active_stages &= ~(1 << pre_stage); @@ -3430,6 +3434,7 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag radv_declare_shader_args(chip_class, pipeline_key, &stages[i].info, i, false, MESA_SHADER_VERTEX, &stages[i].args); stages[i].info.user_sgprs_locs = stages[i].args.user_sgprs_locs; + stages[i].info.inline_push_constant_mask = stages[i].args.ac.inline_push_const_mask; } } @@ -4474,6 +4479,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout radv_declare_shader_args(device->physical_device->rad_info.chip_class, pipeline_key, &info, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX, &gs_copy_args); info.user_sgprs_locs = gs_copy_args.user_sgprs_locs; + info.inline_push_constant_mask = gs_copy_args.ac.inline_push_const_mask; pipeline->gs_copy_shader = radv_create_gs_copy_shader( device, stages[MESA_SHADER_GEOMETRY].nir, &info, &gs_copy_args, &gs_copy_binary, diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index c7e2c1bde08..b1cdbb3d194 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2167,6 +2167,7 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke key->next_stage != MESA_SHADER_VERTEX, MESA_SHADER_VERTEX, &args); info.user_sgprs_locs = args.user_sgprs_locs; + info.inline_push_constant_mask = args.ac.inline_push_const_mask; #ifdef LLVM_AVAILABLE if (options.dump_shader || options.record_ir) diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 6ceff1e0757..6cccc10e34d 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -229,12 +229,10 @@ struct gfx10_ngg_info { }; struct radv_shader_info { + uint64_t inline_push_constant_mask; + bool can_inline_all_push_constants; bool loads_push_constants; bool loads_dynamic_offsets; - uint16_t min_push_constant_used; - uint16_t max_push_constant_used; - bool has_only_32bit_push_constants; - bool has_indirect_push_constants; uint32_t desc_set_used_mask; bool uses_view_index; bool uses_invocation_id; diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c index e9b3d8e9a0c..6e3c56558c5 100644 --- a/src/amd/vulkan/radv_shader_args.c +++ b/src/amd/vulkan/radv_shader_args.c @@ -68,10 +68,10 @@ set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) } struct user_sgpr_info { + uint64_t inline_push_constant_mask; + bool inlined_all_push_consts; bool indirect_all_descriptor_sets; uint8_t remaining_sgprs; - unsigned num_inline_push_consts; - bool inlined_all_push_consts; }; static uint8_t @@ -119,32 +119,29 @@ allocate_inline_push_consts(const struct radv_shader_info *info, { uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; - /* Only supported if shaders use push constants. */ - if (info->min_push_constant_used == UINT16_MAX) + if (!info->inline_push_constant_mask) return; - uint8_t num_push_consts = - (info->max_push_constant_used - info->min_push_constant_used) / 4; + uint64_t mask = info->inline_push_constant_mask; + uint8_t num_push_consts = util_bitcount64(mask); - /* Check if the number of user SGPRs is large enough. */ - if (num_push_consts < remaining_sgprs) { - user_sgpr_info->num_inline_push_consts = num_push_consts; - } else { - user_sgpr_info->num_inline_push_consts = remaining_sgprs; - } - - /* Clamp to the maximum number of allowed inlined push constants. */ - if (user_sgpr_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) - user_sgpr_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; - - if (user_sgpr_info->num_inline_push_consts == num_push_consts && - info->has_only_32bit_push_constants && !info->has_indirect_push_constants && - !info->loads_dynamic_offsets) { - /* Disable the default push constants path if all constants are - * inlined and if shaders don't use dynamic descriptors. - */ + /* Disable the default push constants path if all constants can be inlined and if shaders don't + * use dynamic descriptors. + */ + if (num_push_consts <= MIN2(remaining_sgprs + 1, AC_MAX_INLINE_PUSH_CONSTS) && + info->can_inline_all_push_constants && !info->loads_dynamic_offsets) { user_sgpr_info->inlined_all_push_consts = true; + remaining_sgprs++; + } else { + /* Clamp to the maximum number of allowed inlined push constants. */ + while (num_push_consts > MIN2(remaining_sgprs, AC_MAX_INLINE_PUSH_CONSTS)) { + num_push_consts--; + mask &= ~BITFIELD64_BIT(util_last_bit64(mask) - 1); + } } + + user_sgpr_info->remaining_sgprs = remaining_sgprs - util_bitcount64(mask); + user_sgpr_info->inline_push_constant_mask = mask; } static void @@ -252,10 +249,10 @@ declare_global_input_sgprs(const struct radv_shader_info *info, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants); } - for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++) { + for (unsigned i = 0; i < util_bitcount64(user_sgpr_info->inline_push_constant_mask); i++) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.inline_push_consts[i]); } - args->ac.base_inline_push_consts = info->min_push_constant_used / 4; + args->ac.inline_push_const_mask = user_sgpr_info->inline_push_constant_mask; if (info->so.num_outputs) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->streamout_buffers); @@ -451,8 +448,6 @@ static void set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info, uint8_t *user_sgpr_idx) { - unsigned num_inline_push_consts = 0; - if (!user_sgpr_info->indirect_all_descriptor_sets) { for (unsigned i = 0; i < ARRAY_SIZE(args->descriptor_sets); i++) { if (args->descriptor_sets[i].used) @@ -466,13 +461,9 @@ set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info set_loc_shader_ptr(args, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); } - for (unsigned i = 0; i < ARRAY_SIZE(args->ac.inline_push_consts); i++) { - if (args->ac.inline_push_consts[i].used) - num_inline_push_consts++; - } - - if (num_inline_push_consts) { - set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, num_inline_push_consts); + if (user_sgpr_info->inline_push_constant_mask) { + set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, + util_bitcount64(user_sgpr_info->inline_push_constant_mask)); } if (args->streamout_buffers.used) { diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 4196d97fa37..baef1dc6f4a 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -96,22 +96,19 @@ static void gather_push_constant_info(const nir_shader *nir, const nir_intrinsic_instr *instr, struct radv_shader_info *info) { - int base = nir_intrinsic_base(instr); + info->loads_push_constants = true; - if (!nir_src_is_const(instr->src[0])) { - info->has_indirect_push_constants = true; - } else { - uint32_t min = base + nir_src_as_uint(instr->src[0]); - uint32_t max = min + instr->num_components * 4; + if (nir_src_is_const(instr->src[0]) && instr->dest.ssa.bit_size == 32) { + uint32_t start = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])) / 4u; + uint32_t size = instr->num_components * (instr->dest.ssa.bit_size / 32u); - info->max_push_constant_used = MAX2(max, info->max_push_constant_used); - info->min_push_constant_used = MIN2(min, info->min_push_constant_used); + if (start + size <= (MAX_PUSH_CONSTANTS_SIZE / 4u)) { + info->inline_push_constant_mask |= u_bit_consecutive64(start, size); + return; + } } - if (instr->dest.ssa.bit_size != 32) - info->has_only_32bit_push_constants = false; - - info->loads_push_constants = true; + info->can_inline_all_push_constants = false; } static void @@ -621,9 +618,8 @@ assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask, void radv_nir_shader_info_init(struct radv_shader_info *info) { - /* Assume that shaders only have 32-bit push constants by default. */ - info->min_push_constant_used = UINT16_MAX; - info->has_only_32bit_push_constants = true; + /* Assume that shaders can inline all push constants by default. */ + info->can_inline_all_push_constants = true; } void