ir3: Add preamble optimization pass

Now that everything is plumbed through, we can tie it together.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13148>
Connor Abbott 2021-09-24 19:04:04 +02:00 committed by Marge Bot
parent 986f7adfee
commit fccc35c2de
8 changed files with 455 additions and 2 deletions

src/freedreno/ir3/ir3_compiler.c

@@ -45,6 +45,7 @@ static const struct debug_named_value shader_debug_options[] = {
{"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"},
{"nocache", IR3_DBG_NOCACHE, "Disable shader cache"},
{"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"},
{"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
#ifdef DEBUG
/* DEBUG-only options: */
{"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"},
@@ -245,6 +246,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
/* TODO: implement private memory on earlier gen's */
compiler->has_pvtmem = true;
compiler->has_preamble = true;
compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;
compiler->storage_16bit = dev_info->a6xx.storage_16bit;
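
The nopreamble flag added above lives in the shader_debug_options table, which is parsed from the IR3_SHADER_DEBUG environment variable, so IR3_SHADER_DEBUG=nopreamble should disable the new pass at runtime.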

src/freedreno/ir3/ir3_compiler.h

@@ -182,6 +182,9 @@ struct ir3_compiler {
* constbuf. a5xx+ has the shared regfile.
*/
bool has_shared_regfile;
/* True if preamble instructions (shps, shpe, etc.) are supported */
bool has_preamble;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);
@@ -224,6 +227,7 @@ enum ir3_shader_debug {
IR3_DBG_NOFP16 = BITFIELD_BIT(10),
IR3_DBG_NOCACHE = BITFIELD_BIT(11),
IR3_DBG_SPILLALL = BITFIELD_BIT(12),
IR3_DBG_NOPREAMBLE = BITFIELD_BIT(13),
/* DEBUG-only options: */
IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),

src/freedreno/ir3/ir3_nir.c

@@ -640,11 +640,28 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
progress |= OPT(s, ir3_nir_lower_64b_undef);
progress |= OPT(s, nir_lower_int64);
/* Cleanup code leftover from lowering passes before opt_preamble */
if (progress) {
progress |= OPT(s, nir_opt_constant_folding);
}
/* Do the preamble before analysing UBO ranges, because it's usually
* higher-value and because it can result in eliminating some indirect UBO
* accesses where otherwise we'd have to push the whole range. However we
* have to lower the preamble after UBO lowering so that UBO lowering can
* insert instructions in the preamble to push UBOs.
*/
if (so->shader->compiler->has_preamble &&
!(ir3_shader_debug & IR3_DBG_NOPREAMBLE))
progress |= OPT(s, ir3_nir_opt_preamble, so);
if (!so->binning_pass)
OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
progress |= OPT(s, ir3_nir_lower_preamble, so);
OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
/* UBO offset lowering has to come after we've decided what will
@@ -826,7 +843,8 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
debug_assert((const_state->ubo_state.size % 16) == 0);
unsigned constoff = v->shader->num_reserved_user_consts +
-   const_state->ubo_state.size / 16;
+   const_state->ubo_state.size / 16 +
+   const_state->preamble_size;
unsigned ptrsz = ir3_pointer_size(compiler);
if (const_state->num_ubos > 0) {
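
To make the units concrete (illustrative numbers, not taken from the patch): num_reserved_user_consts and preamble_size are counted in vec4 slots, ubo_state.size is in bytes, and the const offsets built by ir3_nir_lower_preamble further down are in dwords. With 4 reserved user vec4s, a 64-byte pushed-UBO region, and nir_opt_preamble reporting 6 dwords of preamble storage:

preamble_size = DIV_ROUND_UP(6, 4) = 2 vec4s    (ir3_nir_opt_preamble)
constoff      = 4 + 64/16 + 2      = 10 vec4s   (ir3_setup_const_state above)
preamble_base = 4*4 + 64/4         = 32 dwords  (ir3_nir_lower_preamble below)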

src/freedreno/ir3/ir3_nir.h

@@ -73,6 +73,8 @@ bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
bool ir3_nir_fixup_load_uniform(nir_shader *nir);
bool ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v);
bool ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v);
nir_ssa_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
nir_ssa_def *offset,

src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c

@@ -369,7 +369,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
* allocation of the driver params' const space, because UBO pointers can
* be driver params but this pass usually eliminates them.
*/
-struct ir3_const_state worst_case_const_state = {};
+struct ir3_const_state worst_case_const_state = {
+   .preamble_size = const_state->preamble_size,
+};
ir3_setup_const_state(nir, v, &worst_case_const_state);
const uint32_t max_upload =
(ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 16;

src/freedreno/ir3/ir3_nir_opt_preamble.c (new file)

@@ -0,0 +1,420 @@
/*
* Copyright © 2021 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "ir3_compiler.h"
#include "ir3_nir.h"
/* Preamble optimization happens in two parts: first we generate the preamble
* using the generic NIR pass, then we setup the preamble sequence and inline
* the preamble into the main shader if there was a preamble. The first part
* should happen before UBO lowering, because we want to prefer more complex
* expressions over UBO loads, but the second part has to happen after UBO
* lowering because it may add copy instructions to the preamble.
*/
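/* Illustrative example (hand-written, not generated output): a uniform-only
* chain in the main shader such as
*
*    ssa_2 = load_ubo(ubo_index, const_offset)
*    ssa_3 = frcp(ssa_2)
*
* is hoisted into the preamble function by nir_opt_preamble and replaced by a
* store_preamble there plus a load_preamble at the old use site; the lowering
* below then turns those into store_uniform_ir3/load_uniform accesses to a
* reserved range of the const file.
*/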
static void
def_size(nir_ssa_def *def, unsigned *size, unsigned *align)
{
unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
/* Due to the implicit const file promotion we want to expand 16-bit values
* to 32-bit so that the truncation in the main shader can hopefully be
* folded into the use.
*/
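/* e.g. a 16-bit vec3 counts as 3 dwords and a 1-bit (boolean) vec2 as 2
* dwords, matching the 32-bit expansion done in ir3_nir_lower_preamble.
*/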
*size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
*align = 1;
}
static bool
all_uses_float(nir_ssa_def *def, bool allow_src2)
{
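/* A float ALU source can usually absorb (neg)/(abs) as an ir3 source
* modifier (cf. ir3_cat2_absneg()), so fneg/fabs whose uses are all such
* sources are free. If-condition uses can't take a modifier, and fabs
* additionally excludes the third source (src_index 2), where an abs
* modifier evidently isn't available.
*/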
nir_foreach_if_use (use, def) {
return false;
}
nir_foreach_use (use, def) {
nir_instr *use_instr = use->parent_instr;
if (use_instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
unsigned src_index = ~0;
for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
if (&use_alu->src[i].src == use) {
src_index = i;
break;
}
}
assert(src_index != ~0);
nir_alu_type src_type =
nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);
if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
return false;
}
return true;
}
static bool
all_uses_bit(nir_ssa_def *def)
{
nir_foreach_if_use (use, def) {
return false;
}
nir_foreach_use (use, def) {
nir_instr *use_instr = use->parent_instr;
if (use_instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
/* See ir3_cat2_absneg() */
switch (use_alu->op) {
case nir_op_iand:
case nir_op_ior:
case nir_op_inot:
case nir_op_ixor:
case nir_op_bitfield_reverse:
case nir_op_ufind_msb:
case nir_op_ifind_msb:
case nir_op_find_lsb:
case nir_op_ishl:
case nir_op_ushr:
case nir_op_ishr:
case nir_op_bit_count:
continue;
default:
return false;
}
}
return true;
}
static float
instr_cost(nir_instr *instr, const void *data)
{
/* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
* take 1 (normalized) cycle.
*
* See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
*
* TODO: assume wave128 on fragment/compute shaders?
*/
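/* Illustrative numbers with this model: a vec4 frcp costs 16, any texture
* fetch 8, a constant-index/constant-offset UBO load 0 (left to UBO
* lowering), and plain moves or foldable modifiers 0.
*/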
switch (instr->type) {
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
unsigned components = alu->dest.dest.ssa.num_components;
switch (alu->op) {
/* cat4 */
case nir_op_frcp:
case nir_op_fsqrt:
case nir_op_frsq:
case nir_op_flog2:
case nir_op_fexp2:
case nir_op_fsin:
case nir_op_fcos:
return 4 * components;
/* Instructions that become src modifiers. Note for conversions this is
* really an approximation.
*
* This prevents silly things like lifting a negate that would become a
* modifier.
*/
case nir_op_f2f32:
case nir_op_f2f16:
case nir_op_f2fmp:
case nir_op_fneg:
return all_uses_float(&alu->dest.dest.ssa, true) ? 0 : 1 * components;
case nir_op_fabs:
return all_uses_float(&alu->dest.dest.ssa, false) ? 0 : 1 * components;
case nir_op_inot:
return all_uses_bit(&alu->dest.dest.ssa) ? 0 : 1 * components;
/* Instructions that become vector split/collect */
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_mov:
return 0;
/* cat1-cat3 */
default:
return 1 * components;
}
break;
}
case nir_instr_type_tex:
/* cat5 */
return 8;
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo: {
/* If the UBO and offset are constant, then UBO lowering should do a
* better job trying to lower this, and opt_preamble shouldn't try to
* duplicate it. However if it has a non-constant offset then we can
* avoid setting up a0.x etc. in the main shader and potentially have
* to push less.
*/
bool const_ubo = nir_src_is_const(intrin->src[0]);
if (!const_ubo) {
nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
if (rsrc)
const_ubo = nir_src_is_const(rsrc->src[0]);
}
if (const_ubo && nir_src_is_const(intrin->src[1]))
return 0;
/* TODO: get actual numbers for ldc */
return 8;
}
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_ssbo_ir3:
case nir_intrinsic_get_ssbo_size:
case nir_intrinsic_image_load:
case nir_intrinsic_bindless_image_load:
/* cat5/isam */
return 8;
/* By default assume it's a sysval or something */
default:
return 0;
}
}
default:
return 0;
}
}
static float
rewrite_cost(nir_ssa_def *def, const void *data)
{
/* We always have to expand booleans */
if (def->bit_size == 1)
return def->num_components;
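/* Otherwise a value that is only read as a plain ALU source costs nothing
* to rewrite, since ir3 ALU instructions can read the const file directly;
* anything feeding a vecN/mov or a non-ALU instruction needs a mov per
* component.
*/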
bool mov_needed = false;
nir_foreach_use (use, def) {
nir_instr *parent_instr = use->parent_instr;
if (parent_instr->type != nir_instr_type_alu) {
mov_needed = true;
break;
} else {
nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
if (alu->op == nir_op_vec2 ||
alu->op == nir_op_vec3 ||
alu->op == nir_op_vec4 ||
alu->op == nir_op_mov) {
mov_needed = true;
break;
} else {
/* Assume for non-moves that the const is folded into the src */
}
}
}
return mov_needed ? def->num_components : 0;
}
static bool
avoid_instr(const nir_instr *instr, const void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}
bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
struct ir3_const_state *const_state = ir3_const_state(v);
unsigned max_size;
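/* The binning variant shares its const_state with the main variant (via
* ir3_const_state()), so it reuses the preamble size already decided there
* (in vec4s, hence * 4 dwords) as its budget; the main variant may use
* everything below the worst-case immediates offset.
*/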
if (v->binning_pass) {
max_size = const_state->preamble_size * 4;
} else {
struct ir3_const_state worst_case_const_state = {};
ir3_setup_const_state(nir, v, &worst_case_const_state);
max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4;
}
if (max_size == 0)
return false;
nir_opt_preamble_options options = {
.drawid_uniform = true,
.subgroup_size_uniform = true,
.def_size = def_size,
.preamble_storage_size = max_size,
.instr_cost_cb = instr_cost,
.avoid_instr_cb = avoid_instr,
.rewrite_cost_cb = rewrite_cost,
};
unsigned size;
bool progress = nir_opt_preamble(nir, &options, &size);
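/* size is in dwords (see def_size); preamble_size is tracked in vec4s. */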
if (!v->binning_pass)
const_state->preamble_size = DIV_ROUND_UP(size, 4);
return progress;
}
bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
nir_function_impl *main = nir_shader_get_entrypoint(nir);
if (!main->preamble)
return false;
nir_function_impl *preamble = main->preamble->impl;
/* First, lower load/store_preamble. */
const struct ir3_const_state *const_state = ir3_const_state(v);
unsigned preamble_base = v->shader->num_reserved_user_consts * 4 +
const_state->ubo_state.size / 4;
unsigned preamble_size = const_state->preamble_size * 4;
BITSET_DECLARE(promoted_to_float, preamble_size);
memset(promoted_to_float, 0, sizeof(promoted_to_float));
nir_builder _b;
nir_builder *b = &_b;
nir_builder_init(b, main);
nir_foreach_block (block, main) {
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_preamble)
continue;
nir_ssa_def *dest = &intrin->dest.ssa;
unsigned offset = preamble_base + nir_intrinsic_base(intrin);
b->cursor = nir_before_instr(instr);
nir_ssa_def *new_dest =
nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0),
.base = offset);
if (dest->bit_size == 1) {
new_dest = nir_i2b1(b, new_dest);
} else if (dest->bit_size != 32) {
assert(dest->bit_size == 16);
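/* 16-bit values live widened to 32 bits in the const file. Narrow with
* f2f16 when every use is a float source, so the conversion can fold into
* the use, and remember the slot so the matching store below widens with
* f2f32 instead of u2u32.
*/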
if (all_uses_float(dest, true)) {
new_dest = nir_f2f16(b, new_dest);
BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
} else {
new_dest = nir_u2u16(b, new_dest);
}
}
nir_ssa_def_rewrite_uses(dest, new_dest);
nir_instr_remove(instr);
nir_instr_free(instr);
}
}
nir_builder_init(b, preamble);
nir_foreach_block (block, preamble) {
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_store_preamble)
continue;
nir_ssa_def *src = intrin->src[0].ssa;
unsigned offset = preamble_base + nir_intrinsic_base(intrin);
b->cursor = nir_before_instr(instr);
if (src->bit_size == 1)
src = nir_b2i32(b, src);
if (src->bit_size != 32) {
assert(src->bit_size == 16);
if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
src = nir_f2f32(b, src);
} else {
src = nir_u2u32(b, src);
}
}
nir_store_uniform_ir3(b, src, .base = offset);
nir_instr_remove(instr);
nir_instr_free(instr);
}
}
/* Now, create the preamble sequence and move the preamble into the main
* shader:
*
* if (preamble_start_ir3()) {
* if (subgroupElect()) {
* preamble();
* preamble_end_ir3();
* }
* }
* ...
*/
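/* (Presumably the backend turns preamble_start_ir3/preamble_end_ir3 into
* the shps/shpe instructions mentioned in ir3_compiler.h, while the
* subgroup elect ensures only one invocation executes the const stores.)
*/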
b->cursor = nir_before_cf_list(&main->body);
nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
{
nir_if *inner_if = nir_push_if(b, nir_elect(b, 1));
{
nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
nir_builder_instr_insert(b, &call->instr);
nir_preamble_end_ir3(b);
}
nir_pop_if(b, inner_if);
}
nir_pop_if(b, outer_if);
nir_inline_functions(nir);
exec_node_remove(&main->preamble->node);
main->preamble = NULL;
nir_metadata_preserve(main, nir_metadata_none);
return true;
}

src/freedreno/ir3/ir3_shader.h

@@ -157,6 +157,7 @@ struct ir3_ubo_analysis_state {
* that pointer size (ubo, etc) changes depending on generation.
*
* user consts
* preamble consts
* UBO addresses
* SSBO sizes
* image dimensions
@@ -209,6 +210,8 @@ struct ir3_const_state {
unsigned immediates_size;
uint32_t *immediates;
unsigned preamble_size;
/* State of ubo access lowered to push consts: */
struct ir3_ubo_analysis_state ubo_state;
};

src/freedreno/ir3/meson.build

@@ -102,6 +102,7 @@ libfreedreno_ir3_files = files(
'ir3_nir_lower_tex_prefetch.c',
'ir3_nir_lower_wide_load_store.c',
'ir3_nir_move_varying_inputs.c',
'ir3_nir_opt_preamble.c',
'ir3_postsched.c',
'ir3_print.c',
'ir3_ra.c',