mesa/src/compiler/nir/nir_lower_shader_calls.c

/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "nir.h"
#include "nir_builder.h"
#include "nir_phi_builder.h"
#include "util/u_math.h"
static bool
move_system_values_to_top(nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
bool progress = false;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
/* These intrinsics not only can't be re-materialized but aren't
* preserved when moving to the continuation shader. We have to move
* them to the top to ensure they get spilled as needed.
*/
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_shader_record_ptr:
case nir_intrinsic_load_btd_local_arg_addr_intel:
nir_instr_remove(instr);
nir_instr_insert(nir_before_cf_list(&impl->body), instr);
progress = true;
break;
default:
break;
}
}
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
} else {
nir_metadata_preserve(impl, nir_metadata_all);
}
return progress;
}
static bool
instr_is_shader_call(nir_instr *instr)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
return intrin->intrinsic == nir_intrinsic_trace_ray ||
intrin->intrinsic == nir_intrinsic_report_ray_intersection ||
intrin->intrinsic == nir_intrinsic_execute_callable;
}
/* Previously named bitset, it had to be renamed because FreeBSD defines a
* struct named bitset in sys/_bitset.h, which is required by pthread_np.h,
* which is included from src/util/u_thread.h, which is in turn indirectly
* included by this file.
*/
struct brw_bitset {
BITSET_WORD *set;
unsigned size;
};
static struct brw_bitset
bitset_create(void *mem_ctx, unsigned size)
{
return (struct brw_bitset) {
.set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)),
.size = size,
};
}
static bool
src_is_in_bitset(nir_src *src, void *_set)
{
struct brw_bitset *set = _set;
assert(src->is_ssa);
/* Any SSA values which were added after we generated liveness information
* are things generated by this pass and, while most of it is arithmetic
* which we could re-materialize, we don't need to because it's only used
* for a single load/store and so shouldn't cross any shader calls.
*/
if (src->ssa->index >= set->size)
return false;
return BITSET_TEST(set->set, src->ssa->index);
}
static void
add_ssa_def_to_bitset(nir_ssa_def *def, struct brw_bitset *set)
{
if (def->index >= set->size)
return;
BITSET_SET(set->set, def->index);
}
static bool
can_remat_instr(nir_instr *instr, struct brw_bitset *remat)
{
/* Set of all values which are trivially re-materializable and we shouldn't
* ever spill them. This includes:
*
* - Undef values
* - Constants
* - Uniforms (UBO or push constant)
* - ALU combinations of any of the above
* - Derefs which are either complete or casts of any of the above
*
* Because this pass rewrites things in-order and phis are always turned
* into register writes, we can use "is it SSA?" to answer the question
* "can my source be re-materialized?".
*/
switch (instr->type) {
case nir_instr_type_alu:
if (!nir_instr_as_alu(instr)->dest.dest.is_ssa)
return false;
return nir_foreach_src(instr, src_is_in_bitset, remat);
case nir_instr_type_deref:
return nir_foreach_src(instr, src_is_in_bitset, remat);
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_vulkan_resource_index:
case nir_intrinsic_vulkan_resource_reindex:
case nir_intrinsic_load_vulkan_descriptor:
case nir_intrinsic_load_push_constant:
/* These intrinsics don't need to be spilled as long as they don't
* depend on any spilled values.
*/
return nir_foreach_src(instr, src_is_in_bitset, remat);
case nir_intrinsic_load_scratch_base_ptr:
case nir_intrinsic_load_ray_launch_id:
case nir_intrinsic_load_topology_id_intel:
case nir_intrinsic_load_btd_global_arg_addr_intel:
case nir_intrinsic_load_btd_resume_sbt_addr_intel:
case nir_intrinsic_load_ray_base_mem_addr_intel:
case nir_intrinsic_load_ray_hw_stack_size_intel:
case nir_intrinsic_load_ray_sw_stack_size_intel:
case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
case nir_intrinsic_load_ray_hit_sbt_addr_intel:
case nir_intrinsic_load_ray_hit_sbt_stride_intel:
case nir_intrinsic_load_ray_miss_sbt_addr_intel:
case nir_intrinsic_load_ray_miss_sbt_stride_intel:
case nir_intrinsic_load_callable_sbt_addr_intel:
case nir_intrinsic_load_callable_sbt_stride_intel:
case nir_intrinsic_load_reloc_const_intel:
case nir_intrinsic_load_ray_query_global_intel:
/* Notably missing from the above list is btd_local_arg_addr_intel.
* This is because the resume shader will have a different local
* argument pointer because it has a different BSR. Any access of
* the original shader's local arguments needs to be preserved so
* that pointer has to be saved on the stack.
*
* TODO: There may be some system values we want to avoid
* re-materializing as well but we have to be very careful
* to ensure that it's a system value which cannot change
* across a shader call.
*/
return true;
default:
return false;
}
}
case nir_instr_type_ssa_undef:
case nir_instr_type_load_const:
return true;
default:
return false;
}
}
static bool
can_remat_ssa_def(nir_ssa_def *def, struct brw_bitset *remat)
{
return can_remat_instr(def->parent_instr, remat);
}
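/* Re-materialize @def at the builder's cursor by cloning its defining
* instruction.
*/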
static nir_ssa_def *
remat_ssa_def(nir_builder *b, nir_ssa_def *def)
{
nir_instr *clone = nir_instr_clone(b->shader, def->parent_instr);
nir_builder_instr_insert(b, clone);
return nir_instr_ssa_def(clone);
}
struct pbv_array {
struct nir_phi_builder_value **arr;
unsigned len;
};
static struct nir_phi_builder_value *
get_phi_builder_value_for_def(nir_ssa_def *def,
struct pbv_array *pbv_arr)
{
if (def->index >= pbv_arr->len)
return NULL;
return pbv_arr->arr[def->index];
}
static nir_ssa_def *
get_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr,
nir_block *block)
{
assert(src->is_ssa);
struct nir_phi_builder_value *pbv =
get_phi_builder_value_for_def(src->ssa, pbv_arr);
if (pbv == NULL)
return NULL;
return nir_phi_builder_value_get_block_def(pbv, block);
}
static bool
rewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr)
{
nir_block *block;
if (src->parent_instr->type == nir_instr_type_phi) {
nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src);
block = phi_src->pred;
} else {
block = src->parent_instr->block;
}
nir_ssa_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block);
if (new_def != NULL)
nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new_def));
return true;
}
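/* Spill @def to the stack at @offset using the "before" builder (just ahead
* of the call) and reload it using the "after" builder (just after the
* call), returning the reloaded value.
*/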
static nir_ssa_def *
spill_fill(nir_builder *before, nir_builder *after, nir_ssa_def *def, unsigned offset,
nir_address_format address_format, unsigned stack_alignment)
{
const unsigned comp_size = def->bit_size / 8;
switch (address_format) {
case nir_address_format_32bit_offset:
nir_store_scratch(before, def, nir_imm_int(before, offset),
.align_mul = MIN2(comp_size, stack_alignment),
.write_mask = BITFIELD_MASK(def->num_components));
def = nir_load_scratch(after, def->num_components, def->bit_size,
nir_imm_int(after, offset), .align_mul = MIN2(comp_size, stack_alignment));
break;
case nir_address_format_64bit_global: {
nir_ssa_def *addr = nir_iadd_imm(before, nir_load_scratch_base_ptr(before, 1, 64, 1), offset);
nir_store_global(before, addr, MIN2(comp_size, stack_alignment), def, ~0);
addr = nir_iadd_imm(after, nir_load_scratch_base_ptr(after, 1, 64, 1), offset);
def = nir_load_global(after, addr, MIN2(comp_size, stack_alignment),
def->num_components, def->bit_size);
break;
}
default:
unreachable("Unimplemented address format");
}
return def;
}
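/* For each shader call site: spill every live SSA def that can't be
* re-materialized to the stack before the call, reload it afterwards, and
* replace the call with rt_trace_ray/rt_execute_callable followed by
* rt_resume. Uses of the spilled defs are then rewritten with the phi
* builder since the reloaded values don't dominate every original use.
*/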
static void
spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls,
nir_address_format address_format,
unsigned stack_alignment)
{
/* TODO: If a SSA def is filled more than once, we probably want to just
* spill it at the LCM of the fill sites so we avoid unnecessary
* extra spills
*
* TODO: If a SSA def is defined outside a loop but live through some call
* inside the loop, we probably want to spill outside the loop. We
* may also want to fill outside the loop if it's not used in the
* loop.
*
* TODO: Right now, we only re-materialize things if their immediate
* sources are things which we filled. We probably want to expand
* that to re-materialize things whose sources are things we can
* re-materialize from things we filled. We may want some DAG depth
* heuristic on this.
*/
/* This happens per-shader rather than per-impl because we mess with
* nir_shader::scratch_size.
*/
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_metadata_require(impl, nir_metadata_live_ssa_defs |
nir_metadata_dominance |
nir_metadata_block_index);
void *mem_ctx = ralloc_context(shader);
const unsigned num_ssa_defs = impl->ssa_alloc;
const unsigned live_words = BITSET_WORDS(num_ssa_defs);
struct brw_bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs);
/* Array of all live SSA defs which are spill candidates */
nir_ssa_def **spill_defs =
rzalloc_array(mem_ctx, nir_ssa_def *, num_ssa_defs);
/* For each spill candidate, an array of every time it's defined by a fill,
* indexed by call instruction index.
*/
nir_ssa_def ***fill_defs =
rzalloc_array(mem_ctx, nir_ssa_def **, num_ssa_defs);
/* For each call instruction, the liveness set at the call */
const BITSET_WORD **call_live =
rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls);
/* For each call instruction, the block index of the block it lives in */
uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls);
/* Walk the call instructions and fetch the liveness set and block index
* for each one. We need to do this before we start modifying the shader
* so that liveness doesn't complain that it's been invalidated. Don't
* worry, we'll be very careful with our live sets. :-)
*/
unsigned call_idx = 0;
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (!instr_is_shader_call(instr))
continue;
call_block_indices[call_idx] = block->index;
/* The objective here is to preserve values around shader call
* instructions. Therefore, we use the live set after the
* instruction as the set of things we want to preserve. Because
* none of our shader call intrinsics return anything, we don't have
* to worry about spilling over a return value.
*
* TODO: This isn't quite true for report_intersection.
*/
call_live[call_idx] =
nir_get_live_ssa_defs(nir_after_instr(instr), mem_ctx);
call_idx++;
}
}
nir_builder before, after;
nir_builder_init(&before, impl);
nir_builder_init(&after, impl);
call_idx = 0;
unsigned max_scratch_size = shader->scratch_size;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
nir_ssa_def *def = nir_instr_ssa_def(instr);
if (def != NULL) {
if (can_remat_ssa_def(def, &trivial_remat)) {
add_ssa_def_to_bitset(def, &trivial_remat);
} else {
spill_defs[def->index] = def;
}
}
if (!instr_is_shader_call(instr))
continue;
const BITSET_WORD *live = call_live[call_idx];
/* Make a copy of trivial_remat that we'll update as we crawl through
* the live SSA defs and unspill them.
*/
struct brw_bitset remat = bitset_create(mem_ctx, num_ssa_defs);
memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD));
/* Because the two builders are always separated by the call
* instruction, it won't break anything to have two of them.
*/
before.cursor = nir_before_instr(instr);
after.cursor = nir_after_instr(instr);
unsigned offset = shader->scratch_size;
for (unsigned w = 0; w < live_words; w++) {
BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w];
while (spill_mask) {
int i = u_bit_scan(&spill_mask);
assert(i >= 0);
unsigned index = w * BITSET_WORDBITS + i;
assert(index < num_ssa_defs);
nir_ssa_def *def = spill_defs[index];
if (can_remat_ssa_def(def, &remat)) {
/* If this SSA def is re-materializable or based on other
* things we've already spilled, re-materialize it rather
* than spilling and filling. Anything which is trivially
* re-materializable won't even get here because we take
* those into account in spill_mask above.
*/
def = remat_ssa_def(&after, def);
} else {
bool is_bool = def->bit_size == 1;
if (is_bool)
def = nir_b2b32(&before, def);
const unsigned comp_size = def->bit_size / 8;
offset = ALIGN(offset, comp_size);
def = spill_fill(&before, &after, def, offset,
address_format, stack_alignment);
if (is_bool)
def = nir_b2b1(&after, def);
offset += def->num_components * comp_size;
}
/* Mark this SSA def as available in the remat set so that, if
* some other SSA def we need is computed based on it, we can
* just re-compute instead of fetching from memory.
*/
BITSET_SET(remat.set, index);
/* For now, we just make a note of this new SSA def. We'll
* fix things up with the phi builder as a second pass.
*/
if (fill_defs[index] == NULL) {
fill_defs[index] =
rzalloc_array(mem_ctx, nir_ssa_def *, num_calls);
}
fill_defs[index][call_idx] = def;
}
}
nir_builder *b = &before;
offset = ALIGN(offset, stack_alignment);
max_scratch_size = MAX2(max_scratch_size, offset);
/* First thing on the called shader's stack is the resume address
* followed by a pointer to the payload.
*/
nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr);
/* Lower to generic intrinsics with information about the stack & resume shader. */
switch (call->intrinsic) {
case nir_intrinsic_trace_ray: {
nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa,
call->src[2].ssa, call->src[3].ssa,
call->src[4].ssa, call->src[5].ssa,
call->src[6].ssa, call->src[7].ssa,
call->src[8].ssa, call->src[9].ssa,
call->src[10].ssa,
.call_idx = call_idx, .stack_size = offset);
break;
}
case nir_intrinsic_report_ray_intersection:
unreachable("Any-hit shaders must be inlined");
case nir_intrinsic_execute_callable: {
nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa, .call_idx = call_idx, .stack_size = offset);
break;
}
default:
unreachable("Invalid shader call instruction");
}
nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset);
nir_instr_remove(&call->instr);
call_idx++;
}
}
assert(call_idx == num_calls);
shader->scratch_size = max_scratch_size;
struct nir_phi_builder *pb = nir_phi_builder_create(impl);
struct pbv_array pbv_arr = {
.arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *,
num_ssa_defs),
.len = num_ssa_defs,
};
const unsigned block_words = BITSET_WORDS(impl->num_blocks);
BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words);
/* Go through and set up phi builder values for each spillable value which
* we ever needed to spill at any point.
*/
for (unsigned index = 0; index < num_ssa_defs; index++) {
if (fill_defs[index] == NULL)
continue;
nir_ssa_def *def = spill_defs[index];
memset(def_blocks, 0, block_words * sizeof(BITSET_WORD));
BITSET_SET(def_blocks, def->parent_instr->block->index);
for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) {
if (fill_defs[index][call_idx] != NULL)
BITSET_SET(def_blocks, call_block_indices[call_idx]);
}
pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components,
def->bit_size, def_blocks);
}
/* Walk the shader one more time and rewrite SSA defs as needed using the
* phi builder.
*/
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
nir_ssa_def *def = nir_instr_ssa_def(instr);
if (def != NULL) {
struct nir_phi_builder_value *pbv =
get_phi_builder_value_for_def(def, &pbv_arr);
if (pbv != NULL)
nir_phi_builder_value_set_block_def(pbv, block, def);
}
if (instr->type == nir_instr_type_phi)
continue;
nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr);
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
if (resume->intrinsic != nir_intrinsic_rt_resume)
continue;
call_idx = nir_intrinsic_call_idx(resume);
/* Technically, this is the wrong place to add the fill defs to the
* phi builder values because we haven't seen any of the load_scratch
* instructions for this call yet. However, we know based on how we
* emitted them that no value ever gets used until after the load
* instruction has been emitted so this should be safe. If we ever
* fail validation due to this, it likely means a bug in our spilling
* code and not the phi re-construction code here.
*/
for (unsigned index = 0; index < num_ssa_defs; index++) {
if (fill_defs[index] && fill_defs[index][call_idx]) {
nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block,
fill_defs[index][call_idx]);
}
}
}
nir_if *following_if = nir_block_get_following_if(block);
if (following_if) {
nir_ssa_def *new_def =
get_phi_builder_def_for_src(&following_if->condition,
&pbv_arr, block);
if (new_def != NULL)
nir_if_rewrite_condition(following_if, nir_src_for_ssa(new_def));
}
/* Handle phi sources that source from this block. We have to do this
* as a separate pass because the phi builder assumes that uses and
* defs are processed in an order that respects dominance. When we have
* loops, a phi source may be a back-edge so we have to handle it as if
* it were one of the last instructions in the predecessor block.
*/
nir_foreach_phi_src_leaving_block(block,
rewrite_instr_src_from_phi_builder,
&pbv_arr);
}
nir_phi_builder_finish(pb);
ralloc_free(mem_ctx);
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
}
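/* Find the rt_resume intrinsic annotated with @call_idx. */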
static nir_instr *
find_resume_instr(nir_function_impl *impl, unsigned call_idx)
{
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
if (resume->intrinsic != nir_intrinsic_rt_resume)
continue;
if (nir_intrinsic_call_idx(resume) == call_idx)
return &resume->instr;
}
}
unreachable("Couldn't find resume instruction");
}
/* Walk the CF tree and duplicate the contents of every loop, one half runs on
* resume and the other half is for any post-resume loop iterations. We are
* careful in our duplication to ensure that resume_instr is in the resume
* half of the loop, though a copy of resume_instr will remain in the other
* half as well in case the same shader call happens twice.
*/
static bool
duplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr)
{
nir_register *resume_reg = NULL;
for (nir_cf_node *node = resume_instr->block->cf_node.parent;
node->type != nir_cf_node_function; node = node->parent) {
if (node->type != nir_cf_node_loop)
continue;
nir_loop *loop = nir_cf_node_as_loop(node);
if (resume_reg == NULL) {
/* We only create resume_reg if we encounter a loop. This way we can
* avoid re-validating the shader and calling ssa_to_regs in the case
* where it's just if-ladders.
*/
resume_reg = nir_local_reg_create(impl);
resume_reg->num_components = 1;
resume_reg->bit_size = 1;
nir_builder b;
nir_builder_init(&b, impl);
/* Initialize resume to true */
b.cursor = nir_before_cf_list(&impl->body);
nir_store_reg(&b, resume_reg, nir_imm_true(&b), 1);
/* Set resume to false right after the resume instruction */
b.cursor = nir_after_instr(resume_instr);
nir_store_reg(&b, resume_reg, nir_imm_false(&b), 1);
}
/* Before we go any further, make sure that everything which exits the
* loop or continues around to the top of the loop does so through
* registers. We're about to duplicate the loop body and we'll have
* serious trouble if we don't do this.
*/
nir_convert_loop_to_lcssa(loop);
nir_lower_phis_to_regs_block(nir_loop_first_block(loop));
nir_lower_phis_to_regs_block(
nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
nir_cf_list cf_list;
nir_cf_list_extract(&cf_list, &loop->body);
nir_if *_if = nir_if_create(impl->function->shader);
_if->condition = nir_src_for_reg(resume_reg);
nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node);
nir_cf_list clone;
nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL);
/* Insert the clone in the else and the original in the then so that
* the resume_instr remains valid even after the duplication.
*/
nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list));
nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list));
}
if (resume_reg != NULL)
nir_metadata_preserve(impl, nir_metadata_none);
return resume_reg != NULL;
}
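/* Returns true if @block is contained (at any depth) inside @node. */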
static bool
cf_node_contains_block(nir_cf_node *node, nir_block *block)
{
for (nir_cf_node *n = &block->cf_node; n != NULL; n = n->parent) {
if (n == node)
return true;
}
return false;
}
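/* Rewrite every phi at the top of @block to the value it takes when entered
* from @pred. Once we know control flow only reaches @block through @pred,
* each phi collapses to that single source.
*/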
static void
rewrite_phis_to_pred(nir_block *block, nir_block *pred)
{
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_phi)
break;
nir_phi_instr *phi = nir_instr_as_phi(instr);
ASSERTED bool found = false;
nir_foreach_phi_src(phi_src, phi) {
if (phi_src->pred == pred) {
found = true;
assert(phi_src->src.is_ssa);
nir_ssa_def_rewrite_uses(&phi->dest.ssa, phi_src->src.ssa);
break;
}
}
assert(found);
}
}
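/* Returns true if the cursor points at a position immediately following a
* jump (break/continue/halt/return).
*/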
static bool
cursor_is_after_jump(nir_cursor cursor)
{
switch (cursor.option) {
case nir_cursor_before_instr:
case nir_cursor_before_block:
return false;
case nir_cursor_after_instr:
return cursor.instr->type == nir_instr_type_jump;
case nir_cursor_after_block:
return nir_block_ends_in_jump(cursor.block);
}
unreachable("Invalid cursor option");
}
/** Flattens if ladders leading up to a resume
*
* Given a resume_instr, this function flattens any if ladders leading to the
* resume instruction and deletes any code that cannot be encountered on a
* direct path to the resume instruction. This way we get, for the most part,
* straight-line control-flow up to the resume instruction.
*
* While we do this flattening, we also move any code which is in the remat
* set up to the top of the function or to the top of the resume portion of
* the current loop. We don't worry about control-flow as we do this because
* phis will never be in the remat set (see can_remat_instr) and so nothing
* control-dependent will ever need to be re-materialized. It is possible
* that this algorithm will preserve too many instructions by moving them to
* the top but we leave that for DCE to clean up. Any code not in the remat
* set is deleted because it's either unused in the continuation or else
* unspilled from a previous continuation and the unspill code is after the
* resume instruction.
*
* If, for instance, we have something like this:
*
*    // block 0
*    if (cond1) {
*       // block 1
*    } else {
*       // block 2
*       if (cond2) {
*          // block 3
*          resume;
*          if (cond3) {
*             // block 4
*          }
*       } else {
*          // block 5
*       }
*    }
*
* then we know, because the resume instruction had to be encountered, that
* cond1 = false and cond2 = true, and we lower as follows:
*
*    // block 0
*    // block 2
*    // block 3
*    resume;
*    if (cond3) {
*       // block 4
*    }
*
* As you can see, the code in blocks 1 and 5 was removed because there is no
* path from the start of the shader to the resume instruction which executes
* blocks 1 or 5. Any remat code from blocks 0, 2, and 3 is preserved and
* moved to the top. If the resume instruction is inside a loop then we know
* a priori that it is of the form
*
*    loop {
*       if (resume) {
*          // Contents containing resume_instr
*       } else {
*          // Second copy of contents
*       }
*    }
*
* In this case, we only descend into the first half of the loop. The second
* half is left alone as that portion is only ever executed after the resume
* instruction.
*/
static bool
flatten_resume_if_ladder(nir_builder *b,
nir_cf_node *parent_node,
struct exec_list *child_list,
bool child_list_contains_cursor,
nir_instr *resume_instr,
struct brw_bitset *remat)
{
nir_cf_list cf_list;
/* If our child list contains the cursor instruction then we start out
* before the cursor instruction. We need to know this so that we can skip
* moving instructions which are already before the cursor.
*/
bool before_cursor = child_list_contains_cursor;
nir_cf_node *resume_node = NULL;
foreach_list_typed_safe(nir_cf_node, child, node, child_list) {
switch (child->type) {
case nir_cf_node_block: {
nir_block *block = nir_cf_node_as_block(child);
if (b->cursor.option == nir_cursor_before_block &&
b->cursor.block == block) {
assert(before_cursor);
before_cursor = false;
}
nir_foreach_instr_safe(instr, block) {
if ((b->cursor.option == nir_cursor_before_instr ||
b->cursor.option == nir_cursor_after_instr) &&
b->cursor.instr == instr) {
assert(nir_cf_node_is_first(&block->cf_node));
assert(before_cursor);
before_cursor = false;
continue;
}
if (instr == resume_instr)
goto found_resume;
if (!before_cursor && can_remat_instr(instr, remat)) {
nir_instr_remove(instr);
nir_instr_insert(b->cursor, instr);
b->cursor = nir_after_instr(instr);
nir_ssa_def *def = nir_instr_ssa_def(instr);
BITSET_SET(remat->set, def->index);
}
}
if (b->cursor.option == nir_cursor_after_block &&
b->cursor.block == block) {
assert(before_cursor);
before_cursor = false;
}
break;
}
case nir_cf_node_if: {
nir_if *_if = nir_cf_node_as_if(child);
/* Because of the dummy blocks inserted in the first if block of the
* loops, it's possible we find an empty if block that contains our
* cursor. At this point, the block should still be empty and we can
* just skip it and consider we're after the cursor.
*/
if (cf_node_contains_block(&_if->cf_node,
nir_cursor_current_block(b->cursor))) {
/* Some sanity checks to verify this is actually a dummy block */
assert(nir_src_as_bool(_if->condition) == true);
assert(nir_cf_list_is_empty_block(&_if->then_list));
assert(nir_cf_list_is_empty_block(&_if->else_list));
before_cursor = false;
break;
}
assert(!before_cursor);
if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->then_list,
false, resume_instr, remat)) {
resume_node = child;
rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
nir_if_last_then_block(_if));
goto found_resume;
}
if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->else_list,
false, resume_instr, remat)) {
resume_node = child;
rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
nir_if_last_else_block(_if));
goto found_resume;
}
break;
}
case nir_cf_node_loop: {
assert(!before_cursor);
nir_loop *loop = nir_cf_node_as_loop(child);
if (cf_node_contains_block(&loop->cf_node, resume_instr->block)) {
/* Thanks to our loop body duplication pass, every level of loop
* containing the resume instruction contains exactly three nodes:
* two blocks and an if. We don't want to lower away this if
* because it's the resume selection if. The resume half is
* always the then_list so that's what we want to flatten.
*/
nir_block *header = nir_loop_first_block(loop);
nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node));
nir_builder bl;
nir_builder_init(&bl, b->impl);
bl.cursor = nir_before_cf_list(&_if->then_list);
/* We want to place anything re-materialized from inside the loop
* at the top of the resume half of the loop.
*
* Because we're inside a loop, we might run into break/continue
* instructions. We can't place those within a block of
* instructions, they need to be at the end of a block. So we
* build our own dummy block to place them.
*/
nir_push_if(&bl, nir_imm_true(&bl));
{
ASSERTED bool found =
flatten_resume_if_ladder(&bl, &_if->cf_node, &_if->then_list,
true, resume_instr, remat);
assert(found);
}
nir_pop_if(&bl, NULL);
resume_node = child;
goto found_resume;
} else {
ASSERTED bool found =
flatten_resume_if_ladder(b, &loop->cf_node, &loop->body,
false, resume_instr, remat);
assert(!found);
}
break;
}
case nir_cf_node_function:
unreachable("Unsupported CF node type");
}
}
assert(!before_cursor);
/* If we got here, we didn't find the resume node or instruction. */
return false;
found_resume:
/* If we got here then we found either the resume node or the resume
* instruction in this CF list.
*/
if (resume_node) {
/* If the resume instruction is buried inside one of our child CF
* nodes, resume_node now points to that child.
*/
if (resume_node->type == nir_cf_node_if) {
/* Thanks to the recursive call, all of the interesting contents of
* resume_node have been copied before the cursor. We just need to
* copy the stuff after resume_node.
*/
nir_cf_extract(&cf_list, nir_after_cf_node(resume_node),
nir_after_cf_list(child_list));
} else {
/* The loop contains its own cursor and still has useful stuff in it.
* We want to move everything after and including the loop to before
* the cursor.
*/
assert(resume_node->type == nir_cf_node_loop);
nir_cf_extract(&cf_list, nir_before_cf_node(resume_node),
nir_after_cf_list(child_list));
}
} else {
/* If we found the resume instruction in one of our blocks, grab
* everything after it in the entire list (not just the one block), and
* place it before the cursor instr.
*/
nir_cf_extract(&cf_list, nir_after_instr(resume_instr),
nir_after_cf_list(child_list));
}
if (cursor_is_after_jump(b->cursor)) {
/* If the resume instruction is in a loop, it's possible cf_list ends
* in a break or continue instruction, in which case we don't want to
* insert anything. It's also possible we have an early return if
* someone hasn't lowered those yet. In either case, nothing after that
* point executes in this context so we can delete it.
*/
nir_cf_delete(&cf_list);
} else {
b->cursor = nir_cf_reinsert(&cf_list, b->cursor);
}
if (!resume_node) {
/* We want the resume to be the first "interesting" instruction */
nir_instr_remove(resume_instr);
nir_instr_insert(nir_before_cf_list(&b->impl->body), resume_instr);
}
/* We've copied everything interesting out of this CF list to before the
* cursor. Delete everything else.
*/
if (child_list_contains_cursor) {
/* If the cursor is in child_list, then we're either a loop or function
* that contains the cursor. Cursors are always placed in a wrapper if
* (true) to deal with break/continue and early returns. We've already
* moved everything interesting inside the wrapper if and we want to
* remove whatever is left after it.
*/
nir_block *cursor_block = nir_cursor_current_block(b->cursor);
nir_if *wrapper_if = nir_cf_node_as_if(cursor_block->cf_node.parent);
assert(wrapper_if->cf_node.parent == parent_node);
/* The wrapper if is placed either in the body of the main function or
* inside the resume half of a loop.
*/
assert(parent_node->type == nir_cf_node_function ||
(parent_node->type == nir_cf_node_if &&
parent_node->parent->type == nir_cf_node_loop));
nir_cf_extract(&cf_list, nir_after_cf_node(&wrapper_if->cf_node),
nir_after_cf_list(child_list));
} else {
nir_cf_list_extract(&cf_list, child_list);
}
nir_cf_delete(&cf_list);
return true;
}
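/* Build the body of a single resume shader: find the rt_resume with the
* given call index, duplicate any enclosing loop bodies so the resume lives
* in a dedicated "resume half", then flatten the control flow leading up to
* it so the shader effectively starts at the resume point. Returns the
* resume instruction that was kept.
*/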
static nir_instr *
lower_resume(nir_shader *shader, int call_idx)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_instr *resume_instr = find_resume_instr(impl, call_idx);
if (duplicate_loop_bodies(impl, resume_instr)) {
nir_validate_shader(shader, "after duplicate_loop_bodies in "
"brw_nir_lower_shader_calls");
/* If we duplicated the bodies of any loops, run regs_to_ssa to get rid
* of all those pesky registers we just added.
*/
NIR_PASS_V(shader, nir_lower_regs_to_ssa);
}
/* Re-index nir_ssa_def::index. We don't care about actual liveness in
* this pass, but in order to use the same helpers as the spilling pass we
* need to make sure that live_index is something sane. It's used
* constantly for determining whether an SSA value has been added since the
* start of the pass.
*/
nir_index_ssa_defs(impl);
void *mem_ctx = ralloc_context(shader);
/* Used to track which things may have been assumed to be re-materialized
* by the spilling pass and which we shouldn't delete.
*/
struct brw_bitset remat = bitset_create(mem_ctx, impl->ssa_alloc);
/* Create a dummy if (true) to hold the cursor as we extract and re-insert
* stuff into the CFG. Wrapping the cursor in an if lets any break/continue
* or early return we move land at the end of a block (see
* flatten_resume_if_ladder).
*/
nir_builder b;
nir_builder_init(&b, impl);
b.cursor = nir_before_cf_list(&impl->body);
nir_push_if(&b, nir_imm_true(&b));
{
ASSERTED bool found =
flatten_resume_if_ladder(&b, &impl->cf_node, &impl->body,
true, resume_instr, &remat);
assert(found);
}
nir_pop_if(&b, NULL);
ralloc_free(mem_ctx);
nir_validate_shader(shader, "after flatten_resume_if_ladder in "
"brw_nir_lower_shader_calls");
nir_metadata_preserve(impl, nir_metadata_none);
return resume_instr;
}
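/* Replace every rt_resume other than @keep with a halt and delete the rest
* of its block. Those resumes correspond to other call sites; in this
* shader, execution never continues past them.
*/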
static void
replace_resume_with_halt(nir_shader *shader, nir_instr *keep)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_builder b;
nir_builder_init(&b, impl);
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr == keep)
continue;
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
if (resume->intrinsic != nir_intrinsic_rt_resume)
continue;
/* If this is some other resume, then we've kicked off a ray or
* bindless thread and we don't want to go any further in this
* shader. Insert a halt so that NIR will delete any instructions
* dominated by this call instruction including the scratch_load
* instructions we inserted.
*/
nir_cf_list cf_list;
nir_cf_extract(&cf_list, nir_after_instr(&resume->instr),
nir_after_block(block));
nir_cf_delete(&cf_list);
b.cursor = nir_instr_remove(&resume->instr);
nir_jump(&b, nir_jump_halt);
break;
}
}
}
/** Lower shader call instructions to split shaders.
*
* Shader calls can be split into an initial shader and a series of "resume"
* shaders. When the shader is first invoked, it is the initial shader which
* is executed. At any point in the initial shader or any one of the resume
* shaders, a shader call operation may be performed. The possible shader call
* operations are:
*
* - trace_ray
* - report_ray_intersection
* - execute_callable
*
* When a shader call operation is performed, we push all live values to the
* stack, call rt_trace_ray/rt_execute_callable and then kill the shader. Once
* the operation we invoked is complete, a callee shader will return execution
* to the respective resume shader. The resume shader pops the contents off
* the stack and picks up where the calling shader left off.
*
* Stack management is assumed to be done after this pass. Call
* instructions and their resumes get annotated with stack information that
* should be enough for the backend to implement proper stack management.
*/
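/* For illustration, a hedged sketch of how a caller might drive this pass
* (the surrounding variable names and the ralloc context are assumptions,
* not something defined in this file):
*
*    nir_shader **resume_shaders = NULL;
*    uint32_t num_resume_shaders = 0;
*    nir_lower_shader_calls(shader, nir_address_format_64bit_global,
*                           64 (stack_alignment),
*                           &resume_shaders, &num_resume_shaders, mem_ctx);
*
* The caller then compiles each resume shader like the original one; the
* rt_trace_ray/rt_execute_callable/rt_resume intrinsics carry the call_idx
* and stack_size the backend needs for stack management.
*/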
bool
nir_lower_shader_calls(nir_shader *shader,
nir_address_format address_format,
unsigned stack_alignment,
nir_shader ***resume_shaders_out,
uint32_t *num_resume_shaders_out,
void *mem_ctx)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_builder b;
nir_builder_init(&b, impl);
int num_calls = 0;
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr_is_shader_call(instr))
num_calls++;
}
}
if (num_calls == 0) {
nir_shader_preserve_all_metadata(shader);
*num_resume_shaders_out = 0;
return false;
}
/* Some intrinsics not only can't be re-materialized but aren't preserved
* when moving to the continuation shader. We have to move them to the top
* to ensure they get spilled as needed.
*/
{
bool progress = false;
NIR_PASS(progress, shader, move_system_values_to_top);
if (progress)
NIR_PASS(progress, shader, nir_opt_cse);
}
NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls,
num_calls, address_format, stack_alignment);
nir_opt_remove_phis(shader);
/* Make N copies of our shader */
nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls);
for (unsigned i = 0; i < num_calls; i++) {
resume_shaders[i] = nir_shader_clone(mem_ctx, shader);
/* Give them a recognizable name */
resume_shaders[i]->info.name =
ralloc_asprintf(mem_ctx, "%s%sresume_%u",
shader->info.name ? shader->info.name : "",
shader->info.name ? "-" : "",
i);
}
replace_resume_with_halt(shader, NULL);
for (unsigned i = 0; i < num_calls; i++) {
nir_instr *resume_instr = lower_resume(resume_shaders[i], i);
replace_resume_with_halt(resume_shaders[i], resume_instr);
nir_opt_remove_phis(resume_shaders[i]);
/* Remove the dummy blocks added by flatten_resume_if_ladder() */
nir_opt_if(resume_shaders[i], false);
}
*resume_shaders_out = resume_shaders;
*num_resume_shaders_out = num_calls;
return true;
}