radv: allow holes in inline push constants

Use a per-dword bitmask instead of a contiguous range to track which push
constants to inline, so that unused dwords lying between the used ones no
longer have to be inlined.

fossil-db (Sienna Cichlid):
Totals from 5724 (4.25% of 134621) affected shaders:
CodeSize: 20894044 -> 20815748 (-0.37%); split: -0.39%, +0.02%
Instrs: 4002568 -> 3988385 (-0.35%); split: -0.38%, +0.02%
Latency: 29285060 -> 29224414 (-0.21%); split: -0.22%, +0.01%
InvThroughput: 5529700 -> 5526893 (-0.05%); split: -0.05%, +0.00%
VClause: 78093 -> 78240 (+0.19%); split: -0.23%, +0.41%
SClause: 135495 -> 131027 (-3.30%); split: -3.30%, +0.00%
Copies: 330856 -> 324552 (-1.91%); split: -2.37%, +0.46%
PreSGPRs: 226031 -> 224778 (-0.55%); split: -0.61%, +0.05%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12145>
This commit is contained in:
Rhys Perry 2021-07-29 16:47:44 +01:00 committed by Marge Bot
parent 72cf6cca91
commit 7f6262bb85
10 changed files with 81 additions and 78 deletions

View File

@ -52,7 +52,8 @@ ForEachMacros:
- nir_foreach_variable_in_list
- nir_foreach_src
- foreach_two_lists
- foreach_bit
- u_foreach_bit
- u_foreach_bit64
- foreach_sched_node
- foreach_src
- foreach_src_n

View File

@ -139,7 +139,7 @@ struct ac_shader_args {
/* Vulkan only */
struct ac_arg push_constants;
struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
unsigned base_inline_push_consts;
uint64_t inline_push_const_mask;
struct ac_arg view_index;
struct ac_arg sbt_descriptors;
struct ac_arg ray_launch_size;

View File

@ -5501,18 +5501,17 @@ visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
if (index_cv && instr->dest.ssa.bit_size == 32) {
const struct radv_userdata_info *loc =
&ctx->program->info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS];
unsigned start = (offset + index_cv->u32) / 4u;
unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0;
start -= ctx->program->info->min_push_constant_used / 4;
if (start + count <= num_inline_push_consts) {
uint64_t mask = BITFIELD64_MASK(count) << start;
if ((ctx->args->ac.inline_push_const_mask | mask) == ctx->args->ac.inline_push_const_mask &&
start + count <= (sizeof(ctx->args->ac.inline_push_const_mask) * 8u)) {
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
unsigned arg_index =
util_bitcount64(ctx->args->ac.inline_push_const_mask & BITFIELD64_MASK(start));
for (unsigned i = 0; i < count; ++i) {
elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[arg_index++]);
vec->operands[i] = Operand{elems[i]};
}
vec->definitions[0] = Definition(dst);

View File

@ -1662,19 +1662,15 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_int
offset += LLVMConstIntGetZExtValue(src0);
offset /= 4;
offset -= ctx->args->base_inline_push_consts;
unsigned num_inline_push_consts = 0;
for (unsigned i = 0; i < ARRAY_SIZE(ctx->args->inline_push_consts); i++) {
if (ctx->args->inline_push_consts[i].used)
num_inline_push_consts++;
}
if (offset + count <= num_inline_push_consts) {
LLVMValueRef *const push_constants = alloca(num_inline_push_consts * sizeof(LLVMValueRef));
for (unsigned i = 0; i < num_inline_push_consts; i++)
push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[i]);
return ac_build_gather_values(&ctx->ac, push_constants + offset, count);
uint64_t mask = BITFIELD64_MASK(count) << offset;
if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
offset + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
LLVMValueRef *const push_constants = alloca(count * sizeof(LLVMValueRef));
unsigned arg_index =
util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(offset));
for (unsigned i = 0; i < count; i++)
push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[arg_index++]);
return ac_build_gather_values(&ctx->ac, push_constants, count);
}
}

View File

@ -38,6 +38,7 @@
#include "vk_common_entrypoints.h"
#include "ac_debug.h"
#include "ac_shader_args.h"
#include "util/fast_idiv_by_const.h"
@ -3277,10 +3278,24 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
uint8_t base = shader->info.min_push_constant_used / 4;
uint64_t mask = shader->info.inline_push_constant_mask;
if (!mask)
continue;
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
(uint32_t *)cmd_buffer->push_constants + base);
uint8_t base = ffs(mask) - 1;
if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
/* consecutive inline push constants */
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
(uint32_t *)cmd_buffer->push_constants + base);
} else {
/* sparse inline push constants */
uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
unsigned num_consts = 0;
u_foreach_bit64 (idx, mask)
consts[num_consts++] = ((uint32_t *)cmd_buffer->push_constants)[idx];
radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
consts);
}
}
if (need_push_constants) {

View File

@ -3408,6 +3408,8 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag
MESA_SHADER_TESS_CTRL, true, MESA_SHADER_VERTEX,
&stages[MESA_SHADER_TESS_CTRL].args);
stages[MESA_SHADER_TESS_CTRL].info.user_sgprs_locs = stages[MESA_SHADER_TESS_CTRL].args.user_sgprs_locs;
stages[MESA_SHADER_TESS_CTRL].info.inline_push_constant_mask =
stages[MESA_SHADER_TESS_CTRL].args.ac.inline_push_const_mask;
stages[MESA_SHADER_VERTEX].args = stages[MESA_SHADER_TESS_CTRL].args;
active_stages &= ~(1 << MESA_SHADER_VERTEX);
@ -3420,6 +3422,8 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag
radv_declare_shader_args(chip_class, pipeline_key, &stages[MESA_SHADER_GEOMETRY].info,
MESA_SHADER_GEOMETRY, true, pre_stage, &stages[MESA_SHADER_GEOMETRY].args);
stages[MESA_SHADER_GEOMETRY].info.user_sgprs_locs = stages[MESA_SHADER_GEOMETRY].args.user_sgprs_locs;
stages[MESA_SHADER_GEOMETRY].info.inline_push_constant_mask =
stages[MESA_SHADER_GEOMETRY].args.ac.inline_push_const_mask;
stages[pre_stage].args = stages[MESA_SHADER_GEOMETRY].args;
active_stages &= ~(1 << pre_stage);
@ -3430,6 +3434,7 @@ radv_declare_pipeline_args(struct radv_device *device, struct radv_pipeline_stag
radv_declare_shader_args(chip_class, pipeline_key, &stages[i].info, i, false, MESA_SHADER_VERTEX,
&stages[i].args);
stages[i].info.user_sgprs_locs = stages[i].args.user_sgprs_locs;
stages[i].info.inline_push_constant_mask = stages[i].args.ac.inline_push_const_mask;
}
}
@ -4474,6 +4479,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
radv_declare_shader_args(device->physical_device->rad_info.chip_class, pipeline_key, &info,
MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX, &gs_copy_args);
info.user_sgprs_locs = gs_copy_args.user_sgprs_locs;
info.inline_push_constant_mask = gs_copy_args.ac.inline_push_const_mask;
pipeline->gs_copy_shader = radv_create_gs_copy_shader(
device, stages[MESA_SHADER_GEOMETRY].nir, &info, &gs_copy_args, &gs_copy_binary,

View File

@ -2167,6 +2167,7 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke
key->next_stage != MESA_SHADER_VERTEX, MESA_SHADER_VERTEX, &args);
info.user_sgprs_locs = args.user_sgprs_locs;
info.inline_push_constant_mask = args.ac.inline_push_const_mask;
#ifdef LLVM_AVAILABLE
if (options.dump_shader || options.record_ir)

View File

@ -229,12 +229,10 @@ struct gfx10_ngg_info {
};
struct radv_shader_info {
uint64_t inline_push_constant_mask;
bool can_inline_all_push_constants;
bool loads_push_constants;
bool loads_dynamic_offsets;
uint16_t min_push_constant_used;
uint16_t max_push_constant_used;
bool has_only_32bit_push_constants;
bool has_indirect_push_constants;
uint32_t desc_set_used_mask;
bool uses_view_index;
bool uses_invocation_id;

View File

@ -68,10 +68,10 @@ set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx)
}
struct user_sgpr_info {
uint64_t inline_push_constant_mask;
bool inlined_all_push_consts;
bool indirect_all_descriptor_sets;
uint8_t remaining_sgprs;
unsigned num_inline_push_consts;
bool inlined_all_push_consts;
};
static uint8_t
@ -119,32 +119,29 @@ allocate_inline_push_consts(const struct radv_shader_info *info,
{
uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
/* Only supported if shaders use push constants. */
if (info->min_push_constant_used == UINT16_MAX)
if (!info->inline_push_constant_mask)
return;
uint8_t num_push_consts =
(info->max_push_constant_used - info->min_push_constant_used) / 4;
uint64_t mask = info->inline_push_constant_mask;
uint8_t num_push_consts = util_bitcount64(mask);
/* Check if the number of user SGPRs is large enough. */
if (num_push_consts < remaining_sgprs) {
user_sgpr_info->num_inline_push_consts = num_push_consts;
} else {
user_sgpr_info->num_inline_push_consts = remaining_sgprs;
}
/* Clamp to the maximum number of allowed inlined push constants. */
if (user_sgpr_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
user_sgpr_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;
if (user_sgpr_info->num_inline_push_consts == num_push_consts &&
info->has_only_32bit_push_constants && !info->has_indirect_push_constants &&
!info->loads_dynamic_offsets) {
/* Disable the default push constants path if all constants are
* inlined and if shaders don't use dynamic descriptors.
*/
/* Disable the default push constants path if all constants can be inlined and if shaders don't
* use dynamic descriptors.
*/
if (num_push_consts <= MIN2(remaining_sgprs + 1, AC_MAX_INLINE_PUSH_CONSTS) &&
info->can_inline_all_push_constants && !info->loads_dynamic_offsets) {
user_sgpr_info->inlined_all_push_consts = true;
remaining_sgprs++;
} else {
/* Clamp to the maximum number of allowed inlined push constants. */
while (num_push_consts > MIN2(remaining_sgprs, AC_MAX_INLINE_PUSH_CONSTS)) {
num_push_consts--;
mask &= ~BITFIELD64_BIT(util_last_bit64(mask) - 1);
}
}
user_sgpr_info->remaining_sgprs = remaining_sgprs - util_bitcount64(mask);
user_sgpr_info->inline_push_constant_mask = mask;
}
static void
@ -252,10 +249,10 @@ declare_global_input_sgprs(const struct radv_shader_info *info,
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants);
}
for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++) {
for (unsigned i = 0; i < util_bitcount64(user_sgpr_info->inline_push_constant_mask); i++) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.inline_push_consts[i]);
}
args->ac.base_inline_push_consts = info->min_push_constant_used / 4;
args->ac.inline_push_const_mask = user_sgpr_info->inline_push_constant_mask;
if (info->so.num_outputs) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->streamout_buffers);
@ -451,8 +448,6 @@ static void
set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info,
uint8_t *user_sgpr_idx)
{
unsigned num_inline_push_consts = 0;
if (!user_sgpr_info->indirect_all_descriptor_sets) {
for (unsigned i = 0; i < ARRAY_SIZE(args->descriptor_sets); i++) {
if (args->descriptor_sets[i].used)
@ -466,13 +461,9 @@ set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info
set_loc_shader_ptr(args, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
}
for (unsigned i = 0; i < ARRAY_SIZE(args->ac.inline_push_consts); i++) {
if (args->ac.inline_push_consts[i].used)
num_inline_push_consts++;
}
if (num_inline_push_consts) {
set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, num_inline_push_consts);
if (user_sgpr_info->inline_push_constant_mask) {
set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
util_bitcount64(user_sgpr_info->inline_push_constant_mask));
}
if (args->streamout_buffers.used) {

View File

@ -96,22 +96,19 @@ static void
gather_push_constant_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
struct radv_shader_info *info)
{
int base = nir_intrinsic_base(instr);
info->loads_push_constants = true;
if (!nir_src_is_const(instr->src[0])) {
info->has_indirect_push_constants = true;
} else {
uint32_t min = base + nir_src_as_uint(instr->src[0]);
uint32_t max = min + instr->num_components * 4;
if (nir_src_is_const(instr->src[0]) && instr->dest.ssa.bit_size == 32) {
uint32_t start = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])) / 4u;
uint32_t size = instr->num_components * (instr->dest.ssa.bit_size / 32u);
info->max_push_constant_used = MAX2(max, info->max_push_constant_used);
info->min_push_constant_used = MIN2(min, info->min_push_constant_used);
if (start + size <= (MAX_PUSH_CONSTANTS_SIZE / 4u)) {
info->inline_push_constant_mask |= u_bit_consecutive64(start, size);
return;
}
}
if (instr->dest.ssa.bit_size != 32)
info->has_only_32bit_push_constants = false;
info->loads_push_constants = true;
info->can_inline_all_push_constants = false;
}
static void
@ -621,9 +618,8 @@ assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask,
void
radv_nir_shader_info_init(struct radv_shader_info *info)
{
/* Assume that shaders only have 32-bit push constants by default. */
info->min_push_constant_used = UINT16_MAX;
info->has_only_32bit_push_constants = true;
/* Assume that shaders can inline all push constants by default. */
info->can_inline_all_push_constants = true;
}
void