/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_nir.h"

/* Preamble optimization happens in two parts: first we generate the preamble
 * using the generic NIR pass, then we set up the preamble sequence and inline
 * the preamble into the main shader if there was a preamble. The first part
 * should happen before UBO lowering, because we want to prefer more complex
 * expressions over UBO loads, but the second part has to happen after UBO
 * lowering because it may add copy instructions to the preamble.
 */

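/* Size/alignment callback for nir_opt_preamble: report how much preamble
 * storage a def needs, in units of 32-bit components. Booleans and 16-bit
 * values are counted as full 32-bit slots; see the comment below on the
 * implicit const file promotion.
 */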
static void
def_size(nir_ssa_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
   /* Due to the implicit const file promotion we want to expand 16-bit values
    * to 32-bit so that the truncation in the main shader can hopefully be
    * folded into the use.
    */
   *size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
   *align = 1;
}

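/* Returns true if all uses of def are float-typed ALU sources, i.e. places
 * where an fneg/fabs could be folded in as a source modifier. allow_src2
 * controls whether a use as the third source counts: fabs passes false,
 * presumably because abs (unlike neg) can't be folded into the third source
 * of some instructions.
 */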
static bool
all_uses_float(nir_ssa_def *def, bool allow_src2)
{
   nir_foreach_if_use (use, def) {
      return false;
   }

   nir_foreach_use (use, def) {
      nir_instr *use_instr = use->parent_instr;
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type =
         nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
         return false;
   }

   return true;
}

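/* Returns true if all uses of def are bitwise ALU ops that can absorb a
 * bitwise-not as a source modifier (see ir3_cat2_absneg()), making an inot
 * that only feeds such uses free.
 */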
static bool
all_uses_bit(nir_ssa_def *def)
{
   nir_foreach_if_use (use, def) {
      return false;
   }

   nir_foreach_use (use, def) {
      nir_instr *use_instr = use->parent_instr;
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);

      /* See ir3_cat2_absneg() */
      switch (use_alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_inot:
      case nir_op_ixor:
      case nir_op_bitfield_reverse:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
      case nir_op_ishl:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_bit_count:
         continue;
      default:
         return false;
      }
   }

   return true;
}

static float
instr_cost(nir_instr *instr, const void *data)
{
   /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
    * take 1 (normalized) cycle.
    *
    * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
    *
    * TODO: assume wave128 on fragment/compute shaders?
    */

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned components = alu->dest.dest.ssa.num_components;
      switch (alu->op) {
      /* cat4 */
      case nir_op_frcp:
      case nir_op_fsqrt:
      case nir_op_frsq:
      case nir_op_flog2:
      case nir_op_fexp2:
      case nir_op_fsin:
      case nir_op_fcos:
         return 4 * components;

      /* Instructions that become src modifiers. Note for conversions this is
       * really an approximation.
       *
       * This prevents silly things like lifting a negate that would become a
       * modifier.
       */
      case nir_op_f2f32:
      case nir_op_f2f16:
      case nir_op_f2fmp:
      case nir_op_fneg:
         return all_uses_float(&alu->dest.dest.ssa, true) ? 0 : 1 * components;

      case nir_op_fabs:
         return all_uses_float(&alu->dest.dest.ssa, false) ? 0 : 1 * components;

      case nir_op_inot:
         return all_uses_bit(&alu->dest.dest.ssa) ? 0 : 1 * components;

      /* Instructions that become vector split/collect */
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_mov:
         return 0;

      /* cat1-cat3 */
      default:
         return 1 * components;
      }
      break;
   }

   case nir_instr_type_tex:
      /* cat5 */
      return 8;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo: {
         /* If the UBO and offset are constant, then UBO lowering should do a
          * better job trying to lower this, and opt_preamble shouldn't try to
          * duplicate it. However if it has a non-constant offset then we can
          * avoid setting up a0.x etc. in the main shader and potentially have
          * to push less.
          */
         bool const_ubo = nir_src_is_const(intrin->src[0]);
         if (!const_ubo) {
            nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
            if (rsrc)
               const_ubo = nir_src_is_const(rsrc->src[0]);
         }

         if (const_ubo && nir_src_is_const(intrin->src[1]))
            return 0;

         /* TODO: get actual numbers for ldc */
         return 8;
      }

      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ssbo_ir3:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
         /* cat5/isam */
         return 8;

      /* By default assume it's a sysval or something */
      default:
         return 0;
      }
   }

   default:
      return 0;
   }
}

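/* Cost of rewriting all uses of def to read the value back from the const
 * file: booleans always need to be re-expanded, and a mov is needed whenever
 * some use can't simply fold the const file access in as a source.
 */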
static float
rewrite_cost(nir_ssa_def *def, const void *data)
{
   /* We always have to expand booleans */
   if (def->bit_size == 1)
      return def->num_components;

   bool mov_needed = false;
   nir_foreach_use (use, def) {
      nir_instr *parent_instr = use->parent_instr;
      if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 ||
             alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4 ||
             alu->op == nir_op_mov) {
            mov_needed = true;
            break;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? def->num_components : 0;
}

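/* Never hoist a bindless_resource_ir3 intrinsic by itself: the backend
 * expects to pattern-match it directly from its consumers (see the
 * ir3_bindless_resource() usage in instr_cost()), so it has to stay next to
 * its uses.
 */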
static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}

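/* Part one: run the generic nir_opt_preamble pass with the ir3 cost model
 * above, budgeting it whatever const file space would remain after a
 * worst-case layout of the other const state.
 */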
bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   struct ir3_const_state *const_state = ir3_const_state(v);

   unsigned max_size;
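   /* The binning variant shares its const layout with the full shader
    * variant, so reuse the preamble size the full variant already settled on
    * rather than computing a fresh budget.
    */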
   if (v->binning_pass) {
      max_size = const_state->preamble_size * 4;
   } else {
      struct ir3_const_state worst_case_const_state = {};
      ir3_setup_const_state(nir, v, &worst_case_const_state);
      max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4;
   }

   if (max_size == 0)
      return false;

   nir_opt_preamble_options options = {
      .drawid_uniform = true,
      .subgroup_size_uniform = true,
      .def_size = def_size,
      .preamble_storage_size = max_size,
      .instr_cost_cb = instr_cost,
      .avoid_instr_cb = avoid_instr,
      .rewrite_cost_cb = rewrite_cost,
   };

   unsigned size;
   bool progress = nir_opt_preamble(nir, &options, &size);

   if (!v->binning_pass)
      const_state->preamble_size = DIV_ROUND_UP(size, 4);

   return progress;
}

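/* Part two: lower load/store_preamble into const file accesses, wrap the
 * preamble in the preamble_start/end_ir3 sequence, and inline it back into
 * the main shader. This runs after UBO lowering (see the comment at the top
 * of the file).
 */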
bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   nir_function_impl *main = nir_shader_get_entrypoint(nir);

   if (!main->preamble)
      return false;

   nir_function_impl *preamble = main->preamble->impl;

   /* First, lower load/store_preamble. */
   const struct ir3_const_state *const_state = ir3_const_state(v);
   unsigned preamble_base = v->num_reserved_user_consts * 4 +
                            const_state->ubo_state.size / 4;
   unsigned preamble_size = const_state->preamble_size * 4;

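   /* Loads in the main shader decide whether a 16-bit slot is re-narrowed
    * with a float or an integer conversion; record that choice per slot so
    * the matching store in the preamble widens with the same type.
    */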
   BITSET_DECLARE(promoted_to_float, preamble_size);
   memset(promoted_to_float, 0, sizeof(promoted_to_float));

   nir_builder _b;
   nir_builder *b = &_b;
   nir_builder_init(b, main);

   nir_foreach_block (block, main) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_preamble)
            continue;

         nir_ssa_def *dest = &intrin->dest.ssa;

         unsigned offset = preamble_base + nir_intrinsic_base(intrin);
         b->cursor = nir_before_instr(instr);

         nir_ssa_def *new_dest =
            nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0),
                             .base = offset);

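         /* The value lives in the const file as 32 bits (see def_size()), so
          * narrow it back to the original bit size, using a float conversion
          * when all uses are float so the truncation can hopefully fold into
          * the use as a modifier.
          */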
         if (dest->bit_size == 1) {
            new_dest = nir_i2b1(b, new_dest);
         } else if (dest->bit_size != 32) {
            assert(dest->bit_size == 16);
            if (all_uses_float(dest, true)) {
               new_dest = nir_f2f16(b, new_dest);
               BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
            } else {
               new_dest = nir_u2u16(b, new_dest);
            }
         }

         nir_ssa_def_rewrite_uses(dest, new_dest);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

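   /* Then lower store_preamble in the preamble function itself into const
    * file writes, widening 16-bit values to match how the corresponding
    * loads were lowered above.
    */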
   nir_builder_init(b, preamble);

   nir_foreach_block (block, preamble) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;

         nir_ssa_def *src = intrin->src[0].ssa;
         unsigned offset = preamble_base + nir_intrinsic_base(intrin);

         b->cursor = nir_before_instr(instr);

         if (src->bit_size == 1)
            src = nir_b2i32(b, src);
         if (src->bit_size != 32) {
            assert(src->bit_size == 16);
            if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
               src = nir_f2f32(b, src);
            } else {
               src = nir_u2u32(b, src);
            }
         }

         nir_store_uniform_ir3(b, src, .base = offset);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   /* Now, create the preamble sequence and move the preamble into the main
    * shader:
    *
    * if (preamble_start_ir3()) {
    *    if (subgroupElect()) {
    *       preamble();
    *       preamble_end_ir3();
    *    }
    * }
    * ...
    */

   b->cursor = nir_before_cf_list(&main->body);

   nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
   {
      nir_if *inner_if = nir_push_if(b, nir_elect(b, 1));
      {
         nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
         nir_builder_instr_insert(b, &call->instr);
         nir_preamble_end_ir3(b);
      }
      nir_pop_if(b, inner_if);
   }
   nir_pop_if(b, outer_if);

   nir_inline_functions(nir);
   exec_node_remove(&main->preamble->node);
   main->preamble = NULL;

   nir_metadata_preserve(main, nir_metadata_none);
   return true;
}