radv: allow holes in inline push constants
Use a dword mask instead of a range to track which push constants to inline.

fossil-db (Sienna Cichlid):
Totals from 5724 (4.25% of 134621) affected shaders:
CodeSize: 20894044 -> 20815748 (-0.37%); split: -0.39%, +0.02%
Instrs: 4002568 -> 3988385 (-0.35%); split: -0.38%, +0.02%
Latency: 29285060 -> 29224414 (-0.21%); split: -0.22%, +0.01%
InvThroughput: 5529700 -> 5526893 (-0.05%); split: -0.05%, +0.00%
VClause: 78093 -> 78240 (+0.19%); split: -0.23%, +0.41%
SClause: 135495 -> 131027 (-3.30%); split: -3.30%, +0.00%
Copies: 330856 -> 324552 (-1.91%); split: -2.37%, +0.46%
PreSGPRs: 226031 -> 224778 (-0.55%); split: -0.61%, +0.05%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12145>
This commit is contained in:
parent
72cf6cca91
commit
7f6262bb85
|
@ -52,7 +52,8 @@ ForEachMacros:
|
|||
- nir_foreach_variable_in_list
|
||||
- nir_foreach_src
|
||||
- foreach_two_lists
|
||||
- foreach_bit
|
||||
- u_foreach_bit
|
||||
- u_foreach_bit64
|
||||
- foreach_sched_node
|
||||
- foreach_src
|
||||
- foreach_src_n
|
||||
|
|
|
@ -139,7 +139,7 @@ struct ac_shader_args {
|
|||
/* Vulkan only */
|
||||
struct ac_arg push_constants;
|
||||
struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
|
||||
unsigned base_inline_push_consts;
|
||||
uint64_t inline_push_const_mask;
|
||||
struct ac_arg view_index;
|
||||
struct ac_arg sbt_descriptors;
|
||||
struct ac_arg ray_launch_size;
|
||||
|
|
|
@ -5501,18 +5501,17 @@ visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
|
||||
|
||||
if (index_cv && instr->dest.ssa.bit_size == 32) {
|
||||
const struct radv_userdata_info *loc =
|
||||
&ctx->program->info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS];
|
||||
unsigned start = (offset + index_cv->u32) / 4u;
|
||||
unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0;
|
||||
|
||||
start -= ctx->program->info->min_push_constant_used / 4;
|
||||
if (start + count <= num_inline_push_consts) {
|
||||
uint64_t mask = BITFIELD64_MASK(count) << start;
|
||||
if ((ctx->args->ac.inline_push_const_mask | mask) == ctx->args->ac.inline_push_const_mask &&
|
||||
start + count <= (sizeof(ctx->args->ac.inline_push_const_mask) * 8u)) {
|
||||
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
|
||||
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
|
||||
unsigned arg_index =
|
||||
util_bitcount64(ctx->args->ac.inline_push_const_mask & BITFIELD64_MASK(start));
|
||||
for (unsigned i = 0; i < count; ++i) {
|
||||
elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
|
||||
elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[arg_index++]);
|
||||
vec->operands[i] = Operand{elems[i]};
|
||||
}
|
||||
vec->definitions[0] = Definition(dst);
|
||||
|
|
|
@ -1662,19 +1662,15 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_int
|
|||
offset += LLVMConstIntGetZExtValue(src0);
|
||||
offset /= 4;
|
||||
|
||||
offset -= ctx->args->base_inline_push_consts;
|
||||
|
||||
unsigned num_inline_push_consts = 0;
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(ctx->args->inline_push_consts); i++) {
|
||||
if (ctx->args->inline_push_consts[i].used)
|
||||
num_inline_push_consts++;
|
||||
}
|
||||
|
||||
if (offset + count <= num_inline_push_consts) {
|
||||
LLVMValueRef *const push_constants = alloca(num_inline_push_consts * sizeof(LLVMValueRef));
|
||||
for (unsigned i = 0; i < num_inline_push_consts; i++)
|
||||
push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[i]);
|
||||
return ac_build_gather_values(&ctx->ac, push_constants + offset, count);
|
||||
uint64_t mask = BITFIELD64_MASK(count) << offset;
|
||||
if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
|
||||
offset + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
|
||||
LLVMValueRef *const push_constants = alloca(count * sizeof(LLVMValueRef));
|
||||
unsigned arg_index =
|
||||
util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(offset));
|
||||
for (unsigned i = 0; i < count; i++)
|
||||
push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[arg_index++]);
|
||||
return ac_build_gather_values(&ctx->ac, push_constants, count);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@
|
|||
#include "vk_common_entrypoints.h"
|
||||
|
||||
#include "ac_debug.h"
|
||||
#include "ac_shader_args.h"
|
||||
|
||||
#include "util/fast_idiv_by_const.h"
|
||||
|
||||
|
@ -3277,10 +3278,24 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
|
|||
|
||||
need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
|
||||
|
||||
uint8_t base = shader->info.min_push_constant_used / 4;
|
||||
uint64_t mask = shader->info.inline_push_constant_mask;
|
||||
if (!mask)
|
||||
continue;
|
||||
|
||||
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
|
||||
(uint32_t *)cmd_buffer->push_constants + base);
|
||||
uint8_t base = ffs(mask) - 1;
|
||||
if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
|
||||
/* consecutive inline push constants */
|
||||
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
|
||||
(uint32_t *)cmd_buffer->push_constants + base);
|
||||
} else {
|
||||
/* sparse inline push constants */
|
||||
uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
|
||||
unsigned num_consts = 0;
|
||||
u_foreach_bit64 (idx, mask)
|
||||
consts[num_consts++] = ((uint32_t *)cmd_buffer->push_constants)[idx];
|
||||
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
|
||||
consts);
|
||||
}
|
||||
}
|
||||
|
||||
if (need_push_constants) {
|
||||
|
|
|
@ -3408,6 +3408,8 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag
|
|||
MESA_SHADER_TESS_CTRL, true, MESA_SHADER_VERTEX,
|
||||
&stages[MESA_SHADER_TESS_CTRL].args);
|
||||
stages[MESA_SHADER_TESS_CTRL].info.user_sgprs_locs = stages[MESA_SHADER_TESS_CTRL].args.user_sgprs_locs;
|
||||
stages[MESA_SHADER_TESS_CTRL].info.inline_push_constant_mask =
|
||||
stages[MESA_SHADER_TESS_CTRL].args.ac.inline_push_const_mask;
|
||||
|
||||
stages[MESA_SHADER_VERTEX].args = stages[MESA_SHADER_TESS_CTRL].args;
|
||||
active_stages &= ~(1 << MESA_SHADER_VERTEX);
|
||||
|
@ -3420,6 +3422,8 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag
|
|||
radv_declare_shader_args(chip_class, pipeline_key, &stages[MESA_SHADER_GEOMETRY].info,
|
||||
MESA_SHADER_GEOMETRY, true, pre_stage, &stages[MESA_SHADER_GEOMETRY].args);
|
||||
stages[MESA_SHADER_GEOMETRY].info.user_sgprs_locs = stages[MESA_SHADER_GEOMETRY].args.user_sgprs_locs;
|
||||
stages[MESA_SHADER_GEOMETRY].info.inline_push_constant_mask =
|
||||
stages[MESA_SHADER_GEOMETRY].args.ac.inline_push_const_mask;
|
||||
|
||||
stages[pre_stage].args = stages[MESA_SHADER_GEOMETRY].args;
|
||||
active_stages &= ~(1 << pre_stage);
|
||||
|
@ -3430,6 +3434,7 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag
|
|||
radv_declare_shader_args(chip_class, pipeline_key, &stages[i].info, i, false, MESA_SHADER_VERTEX,
|
||||
&stages[i].args);
|
||||
stages[i].info.user_sgprs_locs = stages[i].args.user_sgprs_locs;
|
||||
stages[i].info.inline_push_constant_mask = stages[i].args.ac.inline_push_const_mask;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4474,6 +4479,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
|
|||
radv_declare_shader_args(device->physical_device->rad_info.chip_class, pipeline_key, &info,
|
||||
MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX, &gs_copy_args);
|
||||
info.user_sgprs_locs = gs_copy_args.user_sgprs_locs;
|
||||
info.inline_push_constant_mask = gs_copy_args.ac.inline_push_const_mask;
|
||||
|
||||
pipeline->gs_copy_shader = radv_create_gs_copy_shader(
|
||||
device, stages[MESA_SHADER_GEOMETRY].nir, &info, &gs_copy_args, &gs_copy_binary,
|
||||
|
|
|
@ -2167,6 +2167,7 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke
|
|||
key->next_stage != MESA_SHADER_VERTEX, MESA_SHADER_VERTEX, &args);
|
||||
|
||||
info.user_sgprs_locs = args.user_sgprs_locs;
|
||||
info.inline_push_constant_mask = args.ac.inline_push_const_mask;
|
||||
|
||||
#ifdef LLVM_AVAILABLE
|
||||
if (options.dump_shader || options.record_ir)
|
||||
|
|
|
@ -229,12 +229,10 @@ struct gfx10_ngg_info {
|
|||
};
|
||||
|
||||
struct radv_shader_info {
|
||||
uint64_t inline_push_constant_mask;
|
||||
bool can_inline_all_push_constants;
|
||||
bool loads_push_constants;
|
||||
bool loads_dynamic_offsets;
|
||||
uint16_t min_push_constant_used;
|
||||
uint16_t max_push_constant_used;
|
||||
bool has_only_32bit_push_constants;
|
||||
bool has_indirect_push_constants;
|
||||
uint32_t desc_set_used_mask;
|
||||
bool uses_view_index;
|
||||
bool uses_invocation_id;
|
||||
|
|
|
@ -68,10 +68,10 @@ set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
|
|||
}
|
||||
|
||||
struct user_sgpr_info {
|
||||
uint64_t inline_push_constant_mask;
|
||||
bool inlined_all_push_consts;
|
||||
bool indirect_all_descriptor_sets;
|
||||
uint8_t remaining_sgprs;
|
||||
unsigned num_inline_push_consts;
|
||||
bool inlined_all_push_consts;
|
||||
};
|
||||
|
||||
static uint8_t
|
||||
|
@ -119,32 +119,29 @@ allocate_inline_push_consts(const struct radv_shader_info *info,
|
|||
{
|
||||
uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
|
||||
|
||||
/* Only supported if shaders use push constants. */
|
||||
if (info->min_push_constant_used == UINT16_MAX)
|
||||
if (!info->inline_push_constant_mask)
|
||||
return;
|
||||
|
||||
uint8_t num_push_consts =
|
||||
(info->max_push_constant_used - info->min_push_constant_used) / 4;
|
||||
uint64_t mask = info->inline_push_constant_mask;
|
||||
uint8_t num_push_consts = util_bitcount64(mask);
|
||||
|
||||
/* Check if the number of user SGPRs is large enough. */
|
||||
if (num_push_consts < remaining_sgprs) {
|
||||
user_sgpr_info->num_inline_push_consts = num_push_consts;
|
||||
} else {
|
||||
user_sgpr_info->num_inline_push_consts = remaining_sgprs;
|
||||
}
|
||||
|
||||
/* Clamp to the maximum number of allowed inlined push constants. */
|
||||
if (user_sgpr_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
|
||||
user_sgpr_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;
|
||||
|
||||
if (user_sgpr_info->num_inline_push_consts == num_push_consts &&
|
||||
info->has_only_32bit_push_constants && !info->has_indirect_push_constants &&
|
||||
!info->loads_dynamic_offsets) {
|
||||
/* Disable the default push constants path if all constants are
|
||||
* inlined and if shaders don't use dynamic descriptors.
|
||||
*/
|
||||
/* Disable the default push constants path if all constants can be inlined and if shaders don't
|
||||
* use dynamic descriptors.
|
||||
*/
|
||||
if (num_push_consts <= MIN2(remaining_sgprs + 1, AC_MAX_INLINE_PUSH_CONSTS) &&
|
||||
info->can_inline_all_push_constants && !info->loads_dynamic_offsets) {
|
||||
user_sgpr_info->inlined_all_push_consts = true;
|
||||
remaining_sgprs++;
|
||||
} else {
|
||||
/* Clamp to the maximum number of allowed inlined push constants. */
|
||||
while (num_push_consts > MIN2(remaining_sgprs, AC_MAX_INLINE_PUSH_CONSTS)) {
|
||||
num_push_consts--;
|
||||
mask &= ~BITFIELD64_BIT(util_last_bit64(mask) - 1);
|
||||
}
|
||||
}
|
||||
|
||||
user_sgpr_info->remaining_sgprs = remaining_sgprs - util_bitcount64(mask);
|
||||
user_sgpr_info->inline_push_constant_mask = mask;
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -252,10 +249,10 @@ declare_global_input_sgprs(const struct radv_shader_info *info,
|
|||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++) {
|
||||
for (unsigned i = 0; i < util_bitcount64(user_sgpr_info->inline_push_constant_mask); i++) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.inline_push_consts[i]);
|
||||
}
|
||||
args->ac.base_inline_push_consts = info->min_push_constant_used / 4;
|
||||
args->ac.inline_push_const_mask = user_sgpr_info->inline_push_constant_mask;
|
||||
|
||||
if (info->so.num_outputs) {
|
||||
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->streamout_buffers);
|
||||
|
@ -451,8 +448,6 @@ static void
|
|||
set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info,
|
||||
uint8_t *user_sgpr_idx)
|
||||
{
|
||||
unsigned num_inline_push_consts = 0;
|
||||
|
||||
if (!user_sgpr_info->indirect_all_descriptor_sets) {
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(args->descriptor_sets); i++) {
|
||||
if (args->descriptor_sets[i].used)
|
||||
|
@ -466,13 +461,9 @@ set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info
|
|||
set_loc_shader_ptr(args, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(args->ac.inline_push_consts); i++) {
|
||||
if (args->ac.inline_push_consts[i].used)
|
||||
num_inline_push_consts++;
|
||||
}
|
||||
|
||||
if (num_inline_push_consts) {
|
||||
set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, num_inline_push_consts);
|
||||
if (user_sgpr_info->inline_push_constant_mask) {
|
||||
set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
|
||||
util_bitcount64(user_sgpr_info->inline_push_constant_mask));
|
||||
}
|
||||
|
||||
if (args->streamout_buffers.used) {
|
||||
|
|
|
@ -96,22 +96,19 @@ static void
|
|||
gather_push_constant_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
|
||||
struct radv_shader_info *info)
|
||||
{
|
||||
int base = nir_intrinsic_base(instr);
|
||||
info->loads_push_constants = true;
|
||||
|
||||
if (!nir_src_is_const(instr->src[0])) {
|
||||
info->has_indirect_push_constants = true;
|
||||
} else {
|
||||
uint32_t min = base + nir_src_as_uint(instr->src[0]);
|
||||
uint32_t max = min + instr->num_components * 4;
|
||||
if (nir_src_is_const(instr->src[0]) && instr->dest.ssa.bit_size == 32) {
|
||||
uint32_t start = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])) / 4u;
|
||||
uint32_t size = instr->num_components * (instr->dest.ssa.bit_size / 32u);
|
||||
|
||||
info->max_push_constant_used = MAX2(max, info->max_push_constant_used);
|
||||
info->min_push_constant_used = MIN2(min, info->min_push_constant_used);
|
||||
if (start + size <= (MAX_PUSH_CONSTANTS_SIZE / 4u)) {
|
||||
info->inline_push_constant_mask |= u_bit_consecutive64(start, size);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->dest.ssa.bit_size != 32)
|
||||
info->has_only_32bit_push_constants = false;
|
||||
|
||||
info->loads_push_constants = true;
|
||||
info->can_inline_all_push_constants = false;
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -621,9 +618,8 @@ assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask,
|
|||
void
|
||||
radv_nir_shader_info_init(struct radv_shader_info *info)
|
||||
{
|
||||
/* Assume that shaders only have 32-bit push constants by default. */
|
||||
info->min_push_constant_used = UINT16_MAX;
|
||||
info->has_only_32bit_push_constants = true;
|
||||
/* Assume that shaders can inline all push constants by default. */
|
||||
info->can_inline_all_push_constants = true;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
Loading…
Reference in New Issue