mirror of https://gitlab.freedesktop.org/mesa/mesa
2104 lines
74 KiB
C
2104 lines
74 KiB
C
/*
|
|
* Copyright © 2020 Intel Corporation
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*/
|
|
|
|
#include "util/u_dynarray.h"
|
|
#include "util/u_math.h"
|
|
#include "nir.h"
|
|
#include "nir_builder.h"
|
|
#include "nir_phi_builder.h"
|
|
|
|
static bool
|
|
move_system_values_to_top(nir_shader *shader)
|
|
{
|
|
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
|
|
bool progress = false;
|
|
nir_foreach_block(block, impl) {
|
|
nir_foreach_instr_safe(instr, block) {
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
continue;
|
|
|
|
/* These intrinsics not only can't be re-materialized but aren't
|
|
* preserved when moving to the continuation shader. We have to move
|
|
* them to the top to ensure they get spilled as needed.
|
|
*/
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
switch (intrin->intrinsic) {
|
|
case nir_intrinsic_load_shader_record_ptr:
|
|
case nir_intrinsic_load_btd_local_arg_addr_intel:
|
|
nir_instr_remove(instr);
|
|
nir_instr_insert(nir_before_impl(impl), instr);
|
|
progress = true;
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (progress) {
|
|
nir_metadata_preserve(impl, nir_metadata_block_index |
|
|
nir_metadata_dominance);
|
|
} else {
|
|
nir_metadata_preserve(impl, nir_metadata_all);
|
|
}
|
|
|
|
return progress;
|
|
}
|
|
|
|
static bool
|
|
instr_is_shader_call(nir_instr *instr)
|
|
{
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
return false;
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
return intrin->intrinsic == nir_intrinsic_trace_ray ||
|
|
intrin->intrinsic == nir_intrinsic_report_ray_intersection ||
|
|
intrin->intrinsic == nir_intrinsic_execute_callable;
|
|
}
|
|
|
|
/* Previously named bitset, it had to be renamed as FreeBSD defines a struct
 * named bitset in sys/_bitset.h required by pthread_np.h which is included
 * from src/util/u_thread.h that is indirectly included by this file.
 */
struct sized_bitset {
   /* Bit storage, allocated with BITSET_WORDS(size) words (see
    * bitset_create()).
    */
   BITSET_WORD *set;
   /* Number of bits the set can hold; callers use it to bounds-check SSA
    * indices before testing the set.
    */
   unsigned size;
};
|
|
|
|
static struct sized_bitset
|
|
bitset_create(void *mem_ctx, unsigned size)
|
|
{
|
|
return (struct sized_bitset){
|
|
.set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)),
|
|
.size = size,
|
|
};
|
|
}
|
|
|
|
static bool
|
|
src_is_in_bitset(nir_src *src, void *_set)
|
|
{
|
|
struct sized_bitset *set = _set;
|
|
|
|
/* Any SSA values which were added after we generated liveness information
|
|
* are things generated by this pass and, while most of it is arithmetic
|
|
* which we could re-materialize, we don't need to because it's only used
|
|
* for a single load/store and so shouldn't cross any shader calls.
|
|
*/
|
|
if (src->ssa->index >= set->size)
|
|
return false;
|
|
|
|
return BITSET_TEST(set->set, src->ssa->index);
|
|
}
|
|
|
|
static void
|
|
add_ssa_def_to_bitset(nir_def *def, struct sized_bitset *set)
|
|
{
|
|
if (def->index >= set->size)
|
|
return;
|
|
|
|
BITSET_SET(set->set, def->index);
|
|
}
|
|
|
|
/* Returns true if `instr` can be re-materialized after a shader call rather
 * than spilled, given that everything in `remat` is already available.
 */
static bool
can_remat_instr(nir_instr *instr, struct sized_bitset *remat)
{
   /* Set of all values which are trivially re-materializable and we shouldn't
    * ever spill them.  This includes:
    *
    *   - Undef values
    *   - Constants
    *   - Uniforms (UBO or push constant)
    *   - ALU combinations of any of the above
    *   - Derefs which are either complete or casts of any of the above
    *
    * Because this pass rewrites things in-order and phis are always turned
    * into register writes, we can use "is it SSA?" to answer the question
    * "can my source be re-materialized?".  Register writes happen via
    * non-rematerializable intrinsics.
    */
   switch (instr->type) {
   case nir_instr_type_alu:
      /* Rematerializable iff every source is in the remat set. */
      return nir_foreach_src(instr, src_is_in_bitset, remat);

   case nir_instr_type_deref:
      return nir_foreach_src(instr, src_is_in_bitset, remat);

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_vulkan_resource_index:
      case nir_intrinsic_vulkan_resource_reindex:
      case nir_intrinsic_load_vulkan_descriptor:
      case nir_intrinsic_load_push_constant:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_global_const_block_intel:
      case nir_intrinsic_load_desc_set_address_intel:
         /* These intrinsics don't need to be spilled as long as they don't
          * depend on any spilled values.
          */
         return nir_foreach_src(instr, src_is_in_bitset, remat);

      case nir_intrinsic_load_scratch_base_ptr:
      case nir_intrinsic_load_ray_launch_id:
      case nir_intrinsic_load_topology_id_intel:
      case nir_intrinsic_load_btd_global_arg_addr_intel:
      case nir_intrinsic_load_btd_resume_sbt_addr_intel:
      case nir_intrinsic_load_ray_base_mem_addr_intel:
      case nir_intrinsic_load_ray_hw_stack_size_intel:
      case nir_intrinsic_load_ray_sw_stack_size_intel:
      case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
      case nir_intrinsic_load_ray_hit_sbt_addr_intel:
      case nir_intrinsic_load_ray_hit_sbt_stride_intel:
      case nir_intrinsic_load_ray_miss_sbt_addr_intel:
      case nir_intrinsic_load_ray_miss_sbt_stride_intel:
      case nir_intrinsic_load_callable_sbt_addr_intel:
      case nir_intrinsic_load_callable_sbt_stride_intel:
      case nir_intrinsic_load_reloc_const_intel:
      case nir_intrinsic_load_ray_query_global_intel:
      case nir_intrinsic_load_ray_launch_size:
         /* Notably missing from the above list is btd_local_arg_addr_intel.
          * This is because the resume shader will have a different local
          * argument pointer because it has a different BSR.  Any access of
          * the original shader's local arguments needs to be preserved so
          * that pointer has to be saved on the stack.
          *
          * TODO: There may be some system values we want to avoid
          *       re-materializing as well but we have to be very careful
          *       to ensure that it's a system value which cannot change
          *       across a shader call.
          */
         return true;

      case nir_intrinsic_resource_intel:
         return nir_foreach_src(instr, src_is_in_bitset, remat);

      default:
         /* Anything else (including register reads/writes) must be spilled. */
         return false;
      }
   }

   case nir_instr_type_undef:
   case nir_instr_type_load_const:
      /* Constants and undefs are always trivially re-materializable. */
      return true;

   default:
      return false;
   }
}
|
|
|
|
static bool
|
|
can_remat_ssa_def(nir_def *def, struct sized_bitset *remat)
|
|
{
|
|
return can_remat_instr(def->parent_instr, remat);
|
|
}
|
|
|
|
/* Callback state for add_src_instr() while collecting a remat chain. */
struct add_instr_data {
   /* Worklist of instructions that would need to be cloned. */
   struct util_dynarray *buf;
   /* Values already rematerialized or trivially rematerializable. */
   struct sized_bitset *remat;
};
|
|
|
|
static bool
|
|
add_src_instr(nir_src *src, void *state)
|
|
{
|
|
struct add_instr_data *data = state;
|
|
if (BITSET_TEST(data->remat->set, src->ssa->index))
|
|
return true;
|
|
|
|
util_dynarray_foreach(data->buf, nir_instr *, instr_ptr) {
|
|
if (*instr_ptr == src->ssa->parent_instr)
|
|
return true;
|
|
}
|
|
|
|
/* Abort rematerializing an instruction chain if it is too long. */
|
|
if (data->buf->size >= data->buf->capacity)
|
|
return false;
|
|
|
|
util_dynarray_append(data->buf, nir_instr *, src->ssa->parent_instr);
|
|
return true;
|
|
}
|
|
|
|
static int
|
|
compare_instr_indexes(const void *_inst1, const void *_inst2)
|
|
{
|
|
const nir_instr *const *inst1 = _inst1;
|
|
const nir_instr *const *inst2 = _inst2;
|
|
|
|
return (*inst1)->index - (*inst2)->index;
|
|
}
|
|
|
|
/* Returns true if `def` can be rebuilt by cloning a short chain of
 * instructions whose leaves are all in the `remat` set.  On success, `buf`
 * holds the chain sorted by instruction index (ready to clone in order); on
 * failure `buf` is cleared.
 */
static bool
can_remat_chain_ssa_def(nir_def *def, struct sized_bitset *remat, struct util_dynarray *buf)
{
   assert(util_dynarray_num_elements(buf, nir_instr *) == 0);

   void *mem_ctx = ralloc_context(NULL);

   /* Add all the instructions involved in build this ssa_def */
   util_dynarray_append(buf, nir_instr *, def->parent_instr);

   /* Breadth-first walk of the source DAG: each popped instruction appends
    * its not-yet-rematerializable source producers.  add_src_instr() returns
    * false when the fixed-capacity buffer overflows, aborting the attempt.
    */
   unsigned idx = 0;
   struct add_instr_data data = {
      .buf = buf,
      .remat = remat,
   };
   while (idx < util_dynarray_num_elements(buf, nir_instr *)) {
      nir_instr *instr = *util_dynarray_element(buf, nir_instr *, idx++);
      if (!nir_foreach_src(instr, add_src_instr, &data))
         goto fail;
   }

   /* Sort instructions by index so producers precede consumers. */
   qsort(util_dynarray_begin(buf),
         util_dynarray_num_elements(buf, nir_instr *),
         sizeof(nir_instr *),
         compare_instr_indexes);

   /* Create a temporary bitset with all values already
    * rematerialized/rematerializable.  We'll add to this bit set as we go
    * through values that might not be in that set but that we can
    * rematerialize.
    */
   struct sized_bitset potential_remat = bitset_create(mem_ctx, remat->size);
   memcpy(potential_remat.set, remat->set, BITSET_WORDS(remat->size) * sizeof(BITSET_WORD));

   util_dynarray_foreach(buf, nir_instr *, instr_ptr) {
      nir_def *instr_ssa_def = nir_instr_def(*instr_ptr);

      /* If already in the potential rematerializable, nothing to do. */
      if (BITSET_TEST(potential_remat.set, instr_ssa_def->index))
         continue;

      if (!can_remat_instr(*instr_ptr, &potential_remat))
         goto fail;

      /* All the sources are rematerializable and the instruction is also
       * rematerializable, mark it as rematerializable too.
       */
      BITSET_SET(potential_remat.set, instr_ssa_def->index);
   }

   ralloc_free(mem_ctx);

   return true;

fail:
   /* Leave the buffer empty so the caller can reuse it for the next def. */
   util_dynarray_clear(buf);
   ralloc_free(mem_ctx);
   return false;
}
|
|
|
|
static nir_def *
|
|
remat_ssa_def(nir_builder *b, nir_def *def, struct hash_table *remap_table)
|
|
{
|
|
nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr, remap_table);
|
|
nir_builder_instr_insert(b, clone);
|
|
return nir_instr_def(clone);
|
|
}
|
|
|
|
/* Clones a chain previously validated by can_remat_chain_ssa_def() in index
 * order at builder `b`, recording each clone in fill_defs[ssa_index][call_idx]
 * and in `remap_table`.  Returns the clone of the last chain element.
 */
static nir_def *
remat_chain_ssa_def(nir_builder *b, struct util_dynarray *buf,
                    struct sized_bitset *remat, nir_def ***fill_defs,
                    unsigned call_idx, struct hash_table *remap_table)
{
   nir_def *last_def = NULL;

   util_dynarray_foreach(buf, nir_instr *, instr_ptr) {
      nir_def *instr_ssa_def = nir_instr_def(*instr_ptr);
      unsigned ssa_index = instr_ssa_def->index;

      /* Already rematerialized for this call site; skip. */
      if (fill_defs[ssa_index] != NULL &&
          fill_defs[ssa_index][call_idx] != NULL)
         continue;

      /* Clone the instruction we want to rematerialize */
      nir_def *clone_ssa_def = remat_ssa_def(b, instr_ssa_def, remap_table);

      if (fill_defs[ssa_index] == NULL) {
         /* NOTE(review): this per-def array is indexed by call index but is
          * sized with remat->size (the SSA def count) here, while the other
          * allocation site uses num_calls — confirm remat->size >= num_calls
          * always holds or make the two sites consistent.
          */
         fill_defs[ssa_index] =
            rzalloc_array(fill_defs, nir_def *, remat->size);
      }

      /* Add the new ssa_def to the list fill_defs and flag it as
       * rematerialized
       */
      fill_defs[ssa_index][call_idx] = last_def = clone_ssa_def;
      BITSET_SET(remat->set, ssa_index);

      _mesa_hash_table_insert(remap_table, instr_ssa_def, last_def);
   }

   return last_def;
}
|
|
|
|
/* Maps SSA def indices to their phi builder values. */
struct pbv_array {
   /* Indexed by nir_def::index; NULL entries mean "never spilled/filled". */
   struct nir_phi_builder_value **arr;
   /* Number of entries in arr (the original SSA def count). */
   unsigned len;
};
|
|
|
|
static struct nir_phi_builder_value *
|
|
get_phi_builder_value_for_def(nir_def *def,
|
|
struct pbv_array *pbv_arr)
|
|
{
|
|
if (def->index >= pbv_arr->len)
|
|
return NULL;
|
|
|
|
return pbv_arr->arr[def->index];
|
|
}
|
|
|
|
static nir_def *
|
|
get_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr,
|
|
nir_block *block)
|
|
{
|
|
|
|
struct nir_phi_builder_value *pbv =
|
|
get_phi_builder_value_for_def(src->ssa, pbv_arr);
|
|
if (pbv == NULL)
|
|
return NULL;
|
|
|
|
return nir_phi_builder_value_get_block_def(pbv, block);
|
|
}
|
|
|
|
static bool
|
|
rewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr)
|
|
{
|
|
nir_block *block;
|
|
if (nir_src_parent_instr(src)->type == nir_instr_type_phi) {
|
|
nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src);
|
|
block = phi_src->pred;
|
|
} else {
|
|
block = nir_src_parent_instr(src)->block;
|
|
}
|
|
|
|
nir_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block);
|
|
if (new_def != NULL)
|
|
nir_src_rewrite(src, new_def);
|
|
return true;
|
|
}
|
|
|
|
static nir_def *
|
|
spill_fill(nir_builder *before, nir_builder *after, nir_def *def,
|
|
unsigned value_id, unsigned call_idx,
|
|
unsigned offset, unsigned stack_alignment)
|
|
{
|
|
const unsigned comp_size = def->bit_size / 8;
|
|
|
|
nir_store_stack(before, def,
|
|
.base = offset,
|
|
.call_idx = call_idx,
|
|
.align_mul = MIN2(comp_size, stack_alignment),
|
|
.value_id = value_id,
|
|
.write_mask = BITFIELD_MASK(def->num_components));
|
|
return nir_load_stack(after, def->num_components, def->bit_size,
|
|
.base = offset,
|
|
.call_idx = call_idx,
|
|
.value_id = value_id,
|
|
.align_mul = MIN2(comp_size, stack_alignment));
|
|
}
|
|
|
|
static bool
|
|
add_src_to_call_live_bitset(nir_src *src, void *state)
|
|
{
|
|
BITSET_WORD *call_live = state;
|
|
|
|
BITSET_SET(call_live, src->ssa->index);
|
|
return true;
|
|
}
|
|
|
|
/* Spills every SSA def live across a shader call (or rematerializes it when
 * possible), lowers trace_ray/execute_callable to nir_rt_* intrinsics
 * carrying the call index and stack size, emits an rt_resume after each, and
 * finally repairs SSA form with the phi builder.
 */
static void
spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls,
                                      const nir_lower_shader_calls_options *options)
{
   /* TODO: If a SSA def is filled more than once, we probably want to just
    *       spill it at the LCM of the fill sites so we avoid unnecessary
    *       extra spills
    *
    * TODO: If a SSA def is defined outside a loop but live through some call
    *       inside the loop, we probably want to spill outside the loop.  We
    *       may also want to fill outside the loop if it's not used in the
    *       loop.
    *
    * TODO: Right now, we only re-materialize things if their immediate
    *       sources are things which we filled.  We probably want to expand
    *       that to re-materialize things whose sources are things we can
    *       re-materialize from things we filled.  We may want some DAG depth
    *       heuristic on this.
    */

   /* This happens per-shader rather than per-impl because we mess with
    * nir_shader::scratch_size.
    */
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_metadata_require(impl, nir_metadata_live_defs |
                              nir_metadata_dominance |
                              nir_metadata_block_index |
                              nir_metadata_instr_index);

   void *mem_ctx = ralloc_context(shader);

   const unsigned num_ssa_defs = impl->ssa_alloc;
   const unsigned live_words = BITSET_WORDS(num_ssa_defs);
   struct sized_bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs);

   /* Array of all live SSA defs which are spill candidates */
   nir_def **spill_defs =
      rzalloc_array(mem_ctx, nir_def *, num_ssa_defs);

   /* For each spill candidate, an array of every time it's defined by a fill,
    * indexed by call instruction index.
    */
   nir_def ***fill_defs =
      rzalloc_array(mem_ctx, nir_def **, num_ssa_defs);

   /* For each call instruction, the liveness set at the call */
   const BITSET_WORD **call_live =
      rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls);

   /* For each call instruction, the block index of the block it lives in */
   uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls);

   /* Remap table when rebuilding instructions out of fill operations */
   struct hash_table *trivial_remap_table =
      _mesa_pointer_hash_table_create(mem_ctx);

   /* Walk the call instructions and fetch the liveness set and block index
    * for each one.  We need to do this before we start modifying the shader
    * so that liveness doesn't complain that it's been invalidated.  Don't
    * worry, we'll be very careful with our live sets. :-)
    */
   unsigned call_idx = 0;
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         if (!instr_is_shader_call(instr))
            continue;

         call_block_indices[call_idx] = block->index;

         /* The objective here is to preserve values around shader call
          * instructions.  Therefore, we use the live set after the
          * instruction as the set of things we want to preserve.  Because
          * none of our shader call intrinsics return anything, we don't have
          * to worry about spilling over a return value.
          *
          * TODO: This isn't quite true for report_intersection.
          */
         call_live[call_idx] =
            nir_get_live_defs(nir_after_instr(instr), mem_ctx);

         call_idx++;
      }
   }

   /* If a should_remat_callback is given, call it on each of the live values
    * for each call site.  If it returns true we need to rematerialize that
    * instruction (instead of spill/fill).  Therefore we need to add the
    * sources as live values so that we can rematerialize on top of those
    * spilled/filled sources.
    */
   if (options->should_remat_callback) {
      BITSET_WORD **updated_call_live =
         rzalloc_array(mem_ctx, BITSET_WORD *, num_calls);

      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            nir_def *def = nir_instr_def(instr);
            if (def == NULL)
               continue;

            for (unsigned c = 0; c < num_calls; c++) {
               if (!BITSET_TEST(call_live[c], def->index))
                  continue;

               if (!options->should_remat_callback(def->parent_instr,
                                                   options->should_remat_data))
                  continue;

               /* Copy-on-write: only clone the live set once we know this
                * call's set actually needs updating.
                */
               if (updated_call_live[c] == NULL) {
                  const unsigned bitset_words = BITSET_WORDS(impl->ssa_alloc);
                  updated_call_live[c] = ralloc_array(mem_ctx, BITSET_WORD, bitset_words);
                  memcpy(updated_call_live[c], call_live[c], bitset_words * sizeof(BITSET_WORD));
               }

               nir_foreach_src(instr, add_src_to_call_live_bitset, updated_call_live[c]);
            }
         }
      }

      for (unsigned c = 0; c < num_calls; c++) {
         if (updated_call_live[c] != NULL)
            call_live[c] = updated_call_live[c];
      }
   }

   nir_builder before, after;
   before = nir_builder_create(impl);
   after = nir_builder_create(impl);

   call_idx = 0;
   unsigned max_scratch_size = shader->scratch_size;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         /* Classify each def as trivially rematerializable or a spill
          * candidate as we go.
          */
         nir_def *def = nir_instr_def(instr);
         if (def != NULL) {
            if (can_remat_ssa_def(def, &trivial_remat)) {
               add_ssa_def_to_bitset(def, &trivial_remat);
               _mesa_hash_table_insert(trivial_remap_table, def, def);
            } else {
               spill_defs[def->index] = def;
            }
         }

         if (!instr_is_shader_call(instr))
            continue;

         const BITSET_WORD *live = call_live[call_idx];

         struct hash_table *remap_table =
            _mesa_hash_table_clone(trivial_remap_table, mem_ctx);

         /* Make a copy of trivial_remat that we'll update as we crawl through
          * the live SSA defs and unspill them.
          */
         struct sized_bitset remat = bitset_create(mem_ctx, num_ssa_defs);
         memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD));

         /* Before the two builders are always separated by the call
          * instruction, it won't break anything to have two of them.
          */
         before.cursor = nir_before_instr(instr);
         after.cursor = nir_after_instr(instr);

         /* Array used to hold all the values needed to rematerialize a live
          * value.  The capacity is used to determine when we should abort testing
          * a remat chain.  In practice, shaders can have chains with more than
          * 10k elements while only chains with less than 16 have realistic
          * chances.  There also isn't any performance benefit in rematerializing
          * extremely long chains.
          */
         nir_instr *remat_chain_instrs[16];
         struct util_dynarray remat_chain;
         util_dynarray_init_from_stack(&remat_chain, remat_chain_instrs, sizeof(remat_chain_instrs));

         unsigned offset = shader->scratch_size;
         for (unsigned w = 0; w < live_words; w++) {
            /* Trivially rematerializable values never need spilling. */
            BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w];
            while (spill_mask) {
               int i = u_bit_scan(&spill_mask);
               assert(i >= 0);
               unsigned index = w * BITSET_WORDBITS + i;
               assert(index < num_ssa_defs);

               def = spill_defs[index];
               nir_def *original_def = def, *new_def;
               if (can_remat_ssa_def(def, &remat)) {
                  /* If this SSA def is re-materializable or based on other
                   * things we've already spilled, re-materialize it rather
                   * than spilling and filling.  Anything which is trivially
                   * re-materializable won't even get here because we take
                   * those into account in spill_mask above.
                   */
                  new_def = remat_ssa_def(&after, def, remap_table);
               } else if (can_remat_chain_ssa_def(def, &remat, &remat_chain)) {
                  new_def = remat_chain_ssa_def(&after, &remat_chain, &remat,
                                                fill_defs, call_idx,
                                                remap_table);
                  util_dynarray_clear(&remat_chain);
               } else {
                  /* Fall back to a real spill/fill.  Booleans are widened to
                   * 32 bits for storage and narrowed back after the fill.
                   */
                  bool is_bool = def->bit_size == 1;
                  if (is_bool)
                     def = nir_b2b32(&before, def);

                  const unsigned comp_size = def->bit_size / 8;
                  offset = ALIGN(offset, comp_size);

                  new_def = spill_fill(&before, &after, def,
                                       index, call_idx,
                                       offset, options->stack_alignment);

                  if (is_bool)
                     new_def = nir_b2b1(&after, new_def);

                  offset += def->num_components * comp_size;
               }

               /* Mark this SSA def as available in the remat set so that, if
                * some other SSA def we need is computed based on it, we can
                * just re-compute instead of fetching from memory.
                */
               BITSET_SET(remat.set, index);

               /* For now, we just make a note of this new SSA def.  We'll
                * fix things up with the phi builder as a second pass.
                */
               if (fill_defs[index] == NULL) {
                  fill_defs[index] =
                     rzalloc_array(fill_defs, nir_def *, num_calls);
               }
               fill_defs[index][call_idx] = new_def;
               _mesa_hash_table_insert(remap_table, original_def, new_def);
            }
         }

         nir_builder *b = &before;

         offset = ALIGN(offset, options->stack_alignment);
         max_scratch_size = MAX2(max_scratch_size, offset);

         /* First thing on the called shader's stack is the resume address
          * followed by a pointer to the payload.
          */
         nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr);

         /* Lower to generic intrinsics with information about the stack & resume shader. */
         switch (call->intrinsic) {
         case nir_intrinsic_trace_ray: {
            nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa,
                             call->src[2].ssa, call->src[3].ssa,
                             call->src[4].ssa, call->src[5].ssa,
                             call->src[6].ssa, call->src[7].ssa,
                             call->src[8].ssa, call->src[9].ssa,
                             call->src[10].ssa,
                             .call_idx = call_idx, .stack_size = offset);
            break;
         }

         case nir_intrinsic_report_ray_intersection:
            unreachable("Any-hit shaders must be inlined");

         case nir_intrinsic_execute_callable: {
            nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa, .call_idx = call_idx, .stack_size = offset);
            break;
         }

         default:
            unreachable("Invalid shader call instruction");
         }

         /* The resume point the continuation shader will jump back to. */
         nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset);

         nir_instr_remove(&call->instr);

         call_idx++;
      }
   }
   assert(call_idx == num_calls);
   shader->scratch_size = max_scratch_size;

   struct nir_phi_builder *pb = nir_phi_builder_create(impl);
   struct pbv_array pbv_arr = {
      .arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *,
                           num_ssa_defs),
      .len = num_ssa_defs,
   };

   const unsigned block_words = BITSET_WORDS(impl->num_blocks);
   BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words);

   /* Go through and set up phi builder values for each spillable value which
    * we ever needed to spill at any point.
    */
   for (unsigned index = 0; index < num_ssa_defs; index++) {
      if (fill_defs[index] == NULL)
         continue;

      nir_def *def = spill_defs[index];

      /* The value is "defined" at its original block plus at every block
       * containing a call that filled it.
       */
      memset(def_blocks, 0, block_words * sizeof(BITSET_WORD));
      BITSET_SET(def_blocks, def->parent_instr->block->index);
      for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) {
         if (fill_defs[index][call_idx] != NULL)
            BITSET_SET(def_blocks, call_block_indices[call_idx]);
      }

      pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components,
                                                     def->bit_size, def_blocks);
   }

   /* Walk the shader one more time and rewrite SSA defs as needed using the
    * phi builder.
    */
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (def != NULL) {
            struct nir_phi_builder_value *pbv =
               get_phi_builder_value_for_def(def, &pbv_arr);
            if (pbv != NULL)
               nir_phi_builder_value_set_block_def(pbv, block, def);
         }

         if (instr->type == nir_instr_type_phi)
            continue;

         nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr);

         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
         if (resume->intrinsic != nir_intrinsic_rt_resume)
            continue;

         call_idx = nir_intrinsic_call_idx(resume);

         /* Technically, this is the wrong place to add the fill defs to the
          * phi builder values because we haven't seen any of the load_scratch
          * instructions for this call yet.  However, we know based on how we
          * emitted them that no value ever gets used until after the load
          * instruction has been emitted so this should be safe.  If we ever
          * fail validation due this it likely means a bug in our spilling
          * code and not the phi re-construction code here.
          */
         for (unsigned index = 0; index < num_ssa_defs; index++) {
            if (fill_defs[index] && fill_defs[index][call_idx]) {
               nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block,
                                                   fill_defs[index][call_idx]);
            }
         }
      }

      /* An if condition is not an instruction source; rewrite it by hand. */
      nir_if *following_if = nir_block_get_following_if(block);
      if (following_if) {
         nir_def *new_def =
            get_phi_builder_def_for_src(&following_if->condition,
                                        &pbv_arr, block);
         if (new_def != NULL)
            nir_src_rewrite(&following_if->condition, new_def);
      }

      /* Handle phi sources that source from this block.  We have to do this
       * as a separate pass because the phi builder assumes that uses and
       * defs are processed in an order that respects dominance.  When we have
       * loops, a phi source may be a back-edge so we have to handle it as if
       * it were one of the last instructions in the predecessor block.
       */
      nir_foreach_phi_src_leaving_block(block,
                                        rewrite_instr_src_from_phi_builder,
                                        &pbv_arr);
   }

   nir_phi_builder_finish(pb);

   ralloc_free(mem_ctx);

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}
|
|
|
|
static nir_instr *
|
|
find_resume_instr(nir_function_impl *impl, unsigned call_idx)
|
|
{
|
|
nir_foreach_block(block, impl) {
|
|
nir_foreach_instr(instr, block) {
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
continue;
|
|
|
|
nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
|
|
if (resume->intrinsic != nir_intrinsic_rt_resume)
|
|
continue;
|
|
|
|
if (nir_intrinsic_call_idx(resume) == call_idx)
|
|
return &resume->instr;
|
|
}
|
|
}
|
|
unreachable("Couldn't find resume instruction");
|
|
}
|
|
|
|
/* Walk the CF tree and duplicate the contents of every loop, one half runs on
 * resume and the other half is for any post-resume loop iterations.  We are
 * careful in our duplication to ensure that resume_instr is in the resume
 * half of the loop though a copy of resume_instr will remain in the other
 * half as well in case the same shader call happens twice.
 *
 * Returns true if at least one enclosing loop was found and duplicated (in
 * which case all metadata has been invalidated).
 */
static bool
duplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr)
{
   nir_def *resume_reg = NULL;
   /* Walk up from the resume instruction's block to the function, handling
    * every enclosing loop on the way.
    */
   for (nir_cf_node *node = resume_instr->block->cf_node.parent;
        node->type != nir_cf_node_function; node = node->parent) {
      if (node->type != nir_cf_node_loop)
         continue;

      nir_loop *loop = nir_cf_node_as_loop(node);
      assert(!nir_loop_has_continue_construct(loop));

      nir_builder b = nir_builder_create(impl);

      if (resume_reg == NULL) {
         /* We only create resume_reg if we encounter a loop.  This way we can
          * avoid re-validating the shader and calling ssa_to_reg_intrinsics in
          * the case where it's just if-ladders.
          */
         resume_reg = nir_decl_reg(&b, 1, 1, 0);

         /* Initialize resume to true at the start of the shader, right after
          * the register is declared at the start.
          */
         b.cursor = nir_after_instr(resume_reg->parent_instr);
         nir_store_reg(&b, nir_imm_true(&b), resume_reg);

         /* Set resume to false right after the resume instruction */
         b.cursor = nir_after_instr(resume_instr);
         nir_store_reg(&b, nir_imm_false(&b), resume_reg);
      }

      /* Before we go any further, make sure that everything which exits the
       * loop or continues around to the top of the loop does so through
       * registers.  We're about to duplicate the loop body and we'll have
       * serious trouble if we don't do this.
       */
      nir_convert_loop_to_lcssa(loop);
      nir_lower_phis_to_regs_block(nir_loop_first_block(loop));
      nir_lower_phis_to_regs_block(
         nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));

      /* Pull the whole loop body out... */
      nir_cf_list cf_list;
      nir_cf_list_extract(&cf_list, &loop->body);

      /* ...and replace it with "if (resume) { original } else { clone }". */
      nir_if *_if = nir_if_create(impl->function->shader);
      b.cursor = nir_after_cf_list(&loop->body);
      _if->condition = nir_src_for_ssa(nir_load_reg(&b, resume_reg));
      nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node);

      nir_cf_list clone;
      nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL);

      /* Insert the clone in the else and the original in the then so that
       * the resume_instr remains valid even after the duplication.
       */
      nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list));
      nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list));
   }

   if (resume_reg != NULL)
      nir_metadata_preserve(impl, nir_metadata_none);

   return resume_reg != NULL;
}
|
|
|
|
static bool
|
|
cf_node_contains_block(nir_cf_node *node, nir_block *block)
|
|
{
|
|
for (nir_cf_node *n = &block->cf_node; n != NULL; n = n->parent) {
|
|
if (n == node)
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static void
|
|
rewrite_phis_to_pred(nir_block *block, nir_block *pred)
|
|
{
|
|
nir_foreach_phi(phi, block) {
|
|
ASSERTED bool found = false;
|
|
nir_foreach_phi_src(phi_src, phi) {
|
|
if (phi_src->pred == pred) {
|
|
found = true;
|
|
nir_def_rewrite_uses(&phi->def, phi_src->src.ssa);
|
|
break;
|
|
}
|
|
}
|
|
assert(found);
|
|
}
|
|
}
|
|
|
|
static bool
|
|
cursor_is_after_jump(nir_cursor cursor)
|
|
{
|
|
switch (cursor.option) {
|
|
case nir_cursor_before_instr:
|
|
case nir_cursor_before_block:
|
|
return false;
|
|
case nir_cursor_after_instr:
|
|
return cursor.instr->type == nir_instr_type_jump;
|
|
case nir_cursor_after_block:
|
|
return nir_block_ends_in_jump(cursor.block);
|
|
;
|
|
}
|
|
unreachable("Invalid cursor option");
|
|
}
|
|
|
|
/** Flattens if ladders leading up to a resume
|
|
*
|
|
* Given a resume_instr, this function flattens any if ladders leading to the
|
|
* resume instruction and deletes any code that cannot be encountered on a
|
|
* direct path to the resume instruction. This way we get, for the most part,
|
|
* straight-line control-flow up to the resume instruction.
|
|
*
|
|
* While we do this flattening, we also move any code which is in the remat
|
|
* set up to the top of the function or to the top of the resume portion of
|
|
* the current loop. We don't worry about control-flow as we do this because
|
|
* phis will never be in the remat set (see can_remat_instr) and so nothing
|
|
* control-dependent will ever need to be re-materialized. It is possible
|
|
* that this algorithm will preserve too many instructions by moving them to
|
|
* the top but we leave that for DCE to clean up. Any code not in the remat
|
|
* set is deleted because it's either unused in the continuation or else
|
|
* unspilled from a previous continuation and the unspill code is after the
|
|
* resume instruction.
|
|
*
|
|
* If, for instance, we have something like this:
|
|
*
|
|
* // block 0
|
|
* if (cond1) {
|
|
* // block 1
|
|
* } else {
|
|
* // block 2
|
|
* if (cond2) {
|
|
* // block 3
|
|
* resume;
|
|
* if (cond3) {
|
|
* // block 4
|
|
* }
|
|
* } else {
|
|
* // block 5
|
|
* }
|
|
* }
|
|
*
|
|
 * then we know, because we know the resume instruction had to be encountered,
 * that cond1 = false and cond2 = true and we lower as follows:
|
|
*
|
|
* // block 0
|
|
* // block 2
|
|
* // block 3
|
|
* resume;
|
|
* if (cond3) {
|
|
* // block 4
|
|
* }
|
|
*
|
|
 * As you can see, the code in blocks 1 and 5 was removed because there is no
 * path from the start of the shader to the resume instruction which executes
 * blocks 1 or 5.  Any remat code from blocks 0, 2, and 3 is preserved and
|
|
* moved to the top. If the resume instruction is inside a loop then we know
|
|
* a priori that it is of the form
|
|
*
|
|
* loop {
|
|
* if (resume) {
|
|
* // Contents containing resume_instr
|
|
* } else {
|
|
* // Second copy of contents
|
|
* }
|
|
* }
|
|
*
|
|
* In this case, we only descend into the first half of the loop. The second
|
|
* half is left alone as that portion is only ever executed after the resume
|
|
* instruction.
|
|
*/
|
|
/* See the large comment above for the overall algorithm.
 *
 * b: builder whose cursor marks where preserved/re-materialized code is
 *    accumulated
 * parent_node: CF node owning child_list (not referenced in the body)
 * child_list: the CF node list to flatten
 * child_list_contains_cursor: true iff b->cursor currently points into
 *    child_list (then we must skip code already before the cursor)
 * resume_instr: the resume instruction we are flattening towards
 * remat: set of SSA defs assumed re-materialized by the spilling pass
 *
 * Returns true iff resume_instr was found within child_list.
 */
static bool
flatten_resume_if_ladder(nir_builder *b,
                         nir_cf_node *parent_node,
                         struct exec_list *child_list,
                         bool child_list_contains_cursor,
                         nir_instr *resume_instr,
                         struct sized_bitset *remat)
{
   nir_cf_list cf_list;

   /* If our child list contains the cursor instruction then we start out
    * before the cursor instruction.  We need to know this so that we can skip
    * moving instructions which are already before the cursor.
    */
   bool before_cursor = child_list_contains_cursor;

   nir_cf_node *resume_node = NULL;
   foreach_list_typed_safe(nir_cf_node, child, node, child_list) {
      switch (child->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(child);
         /* Track when we walk past the cursor: only instructions after it
          * need to be moved/re-materialized.
          */
         if (b->cursor.option == nir_cursor_before_block &&
             b->cursor.block == block) {
            assert(before_cursor);
            before_cursor = false;
         }
         nir_foreach_instr_safe(instr, block) {
            if ((b->cursor.option == nir_cursor_before_instr ||
                 b->cursor.option == nir_cursor_after_instr) &&
                b->cursor.instr == instr) {
               assert(nir_cf_node_is_first(&block->cf_node));
               assert(before_cursor);
               before_cursor = false;
               continue;
            }

            if (instr == resume_instr)
               goto found_resume;

            /* Hoist re-materializable instructions up to the cursor and
             * record their defs as available for later remat checks.
             */
            if (!before_cursor && can_remat_instr(instr, remat)) {
               nir_instr_remove(instr);
               nir_instr_insert(b->cursor, instr);
               b->cursor = nir_after_instr(instr);

               nir_def *def = nir_instr_def(instr);
               BITSET_SET(remat->set, def->index);
            }
         }
         if (b->cursor.option == nir_cursor_after_block &&
             b->cursor.block == block) {
            assert(before_cursor);
            before_cursor = false;
         }
         break;
      }

      case nir_cf_node_if: {
         assert(!before_cursor);
         nir_if *_if = nir_cf_node_as_if(child);
         /* Recurse into both sides; whichever side contains the resume
          * determines the condition's known value, so phis after the if can
          * be rewritten to that side's value.
          */
         if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->then_list,
                                      false, resume_instr, remat)) {
            resume_node = child;
            rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
                                 nir_if_last_then_block(_if));
            goto found_resume;
         }

         if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->else_list,
                                      false, resume_instr, remat)) {
            resume_node = child;
            rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)),
                                 nir_if_last_else_block(_if));
            goto found_resume;
         }
         break;
      }

      case nir_cf_node_loop: {
         assert(!before_cursor);
         nir_loop *loop = nir_cf_node_as_loop(child);
         assert(!nir_loop_has_continue_construct(loop));

         if (cf_node_contains_block(&loop->cf_node, resume_instr->block)) {
            /* Thanks to our loop body duplication pass, every level of loop
             * containing the resume instruction contains exactly three nodes:
             * two blocks and an if.  We don't want to lower away this if
             * because it's the resume selection if.  The resume half is
             * always the then_list so that's what we want to flatten.
             */
            nir_block *header = nir_loop_first_block(loop);
            nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node));

            /* We want to place anything re-materialized from inside the loop
             * at the top of the resume half of the loop.
             */
            nir_builder bl = nir_builder_at(nir_before_cf_list(&_if->then_list));

            ASSERTED bool found =
               flatten_resume_if_ladder(&bl, &_if->cf_node, &_if->then_list,
                                        true, resume_instr, remat);
            assert(found);
            resume_node = child;
            goto found_resume;
         } else {
            /* Loops that don't contain the resume are walked only to assert
             * the resume really isn't there; nothing is moved out of them.
             */
            ASSERTED bool found =
               flatten_resume_if_ladder(b, &loop->cf_node, &loop->body,
                                        false, resume_instr, remat);
            assert(!found);
         }
         break;
      }

      case nir_cf_node_function:
         unreachable("Unsupported CF node type");
      }
   }
   assert(!before_cursor);

   /* If we got here, we didn't find the resume node or instruction. */
   return false;

found_resume:
   /* If we got here then we found either the resume node or the resume
    * instruction in this CF list.
    */
   if (resume_node) {
      /* If the resume instruction is buried inside one of our children CF
       * nodes, resume_node now points to that child.
       */
      if (resume_node->type == nir_cf_node_if) {
         /* Thanks to the recursive call, all of the interesting contents of
          * resume_node have been copied before the cursor.  We just need to
          * copy the stuff after resume_node.
          */
         nir_cf_extract(&cf_list, nir_after_cf_node(resume_node),
                        nir_after_cf_list(child_list));
      } else {
         /* The loop contains its own cursor and still has useful stuff in it.
          * We want to move everything after and including the loop to before
          * the cursor.
          */
         assert(resume_node->type == nir_cf_node_loop);
         nir_cf_extract(&cf_list, nir_before_cf_node(resume_node),
                        nir_after_cf_list(child_list));
      }
   } else {
      /* If we found the resume instruction in one of our blocks, grab
       * everything after it in the entire list (not just the one block), and
       * place it before the cursor instr.
       */
      nir_cf_extract(&cf_list, nir_after_instr(resume_instr),
                     nir_after_cf_list(child_list));
   }

   /* If the resume instruction is in the first block of the child_list,
    * and the cursor is still before that block, the nir_cf_extract() may
    * extract the block object pointed by the cursor, and instead create
    * a new one for the code before the resume. In such case the cursor
    * will be broken, as it will point to a block which is no longer
    * in a function.
    *
    * Luckily, in both cases when this is possible, the intended cursor
    * position is right before the child_list, so we can fix the cursor here.
    */
   if (child_list_contains_cursor &&
       b->cursor.option == nir_cursor_before_block &&
       b->cursor.block->cf_node.parent == NULL)
      b->cursor = nir_before_cf_list(child_list);

   if (cursor_is_after_jump(b->cursor)) {
      /* If the resume instruction is in a loop, it's possible cf_list ends
       * in a break or continue instruction, in which case we don't want to
       * insert anything.  It's also possible we have an early return if
       * someone hasn't lowered those yet.  In either case, nothing after that
       * point executes in this context so we can delete it.
       */
      nir_cf_delete(&cf_list);
   } else {
      b->cursor = nir_cf_reinsert(&cf_list, b->cursor);
   }

   if (!resume_node) {
      /* We want the resume to be the first "interesting" instruction */
      nir_instr_remove(resume_instr);
      nir_instr_insert(nir_before_impl(b->impl), resume_instr);
   }

   /* We've copied everything interesting out of this CF list to before the
    * cursor.  Delete everything else.
    */
   if (child_list_contains_cursor) {
      nir_cf_extract(&cf_list, b->cursor, nir_after_cf_list(child_list));
   } else {
      nir_cf_list_extract(&cf_list, child_list);
   }
   nir_cf_delete(&cf_list);

   return true;
}
|
|
|
|
/* Predicate deciding which instructions wrap_instr() should wrap. */
typedef bool (*wrap_instr_callback)(nir_instr *instr);

/* Instruction-pass callback: if the callback passed through `data` selects
 * this instruction, wrap it in a fresh `if (true) { ... }` construct.
 * Returns true if the instruction was wrapped.
 */
static bool
wrap_instr(nir_builder *b, nir_instr *instr, void *data)
{
   wrap_instr_callback callback = data;
   if (!callback(instr))
      return false;

   b->cursor = nir_before_instr(instr);

   /* Build an empty `if (true)` immediately before the instruction... */
   nir_if *_if = nir_push_if(b, nir_imm_true(b));
   nir_pop_if(b, NULL);

   /* ...then move the instruction into the if's then-block. */
   nir_cf_list cf_list;
   nir_cf_extract(&cf_list, nir_before_instr(instr), nir_after_instr(instr));
   nir_cf_reinsert(&cf_list, nir_before_block(nir_if_first_then_block(_if)));

   return true;
}
|
|
|
|
/* This pass wraps jump instructions in a dummy if block so that when
|
|
* flatten_resume_if_ladder() does its job, it doesn't move a jump instruction
|
|
* directly in front of another instruction which the NIR control flow helpers
|
|
* do not allow.
|
|
*/
|
|
static bool
|
|
wrap_instrs(nir_shader *shader, wrap_instr_callback callback)
|
|
{
|
|
return nir_shader_instructions_pass(shader, wrap_instr,
|
|
nir_metadata_none, callback);
|
|
}
|
|
|
|
static bool
|
|
instr_is_jump(nir_instr *instr)
|
|
{
|
|
return instr->type == nir_instr_type_jump;
|
|
}
|
|
|
|
/* Prepares the shader for splitting at the given call index: locates the
 * matching rt_resume instruction, duplicates any loop bodies containing it,
 * and flattens the control flow leading up to it so the resume becomes the
 * first "interesting" instruction.  Returns the resume instruction.
 */
static nir_instr *
lower_resume(nir_shader *shader, int call_idx)
{
   /* Wrap jumps in dummy ifs first so flattening never places a jump
    * directly before another instruction.
    */
   wrap_instrs(shader, instr_is_jump);

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   nir_instr *resume_instr = find_resume_instr(impl, call_idx);

   if (duplicate_loop_bodies(impl, resume_instr)) {
      nir_validate_shader(shader, "after duplicate_loop_bodies in "
                                  "nir_lower_shader_calls");
      /* If we duplicated the bodies of any loops, run reg_intrinsics_to_ssa to
       * get rid of all those pesky registers we just added.
       */
      NIR_PASS_V(shader, nir_lower_reg_intrinsics_to_ssa);
   }

   /* Re-index nir_def::index.  We don't care about actual liveness in
    * this pass but, so we can use the same helpers as the spilling pass, we
    * need to make sure that live_index is something sane.  It's used
    * constantly for determining if an SSA value has been added since the
    * start of the pass.
    */
   nir_index_ssa_defs(impl);

   void *mem_ctx = ralloc_context(shader);

   /* Used to track which things may have been assumed to be re-materialized
    * by the spilling pass and which we shouldn't delete.
    */
   struct sized_bitset remat = bitset_create(mem_ctx, impl->ssa_alloc);

   /* Create a nop instruction to use as a cursor as we extract and re-insert
    * stuff into the CFG.
    */
   nir_builder b = nir_builder_at(nir_before_impl(impl));
   ASSERTED bool found =
      flatten_resume_if_ladder(&b, &impl->cf_node, &impl->body,
                               true, resume_instr, &remat);
   assert(found);

   ralloc_free(mem_ctx);

   nir_metadata_preserve(impl, nir_metadata_none);

   nir_validate_shader(shader, "after flatten_resume_if_ladder in "
                               "nir_lower_shader_calls");

   return resume_instr;
}
|
|
|
|
/* Replaces every rt_resume intrinsic except `keep` with a halt jump and
 * deletes the (now unreachable) remainder of the containing block.
 */
static void
replace_resume_with_halt(nir_shader *shader, nir_instr *keep)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   nir_builder b = nir_builder_create(impl);

   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr == keep)
            continue;

         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr);
         if (resume->intrinsic != nir_intrinsic_rt_resume)
            continue;

         /* If this is some other resume, then we've kicked off a ray or
          * bindless thread and we don't want to go any further in this
          * shader.  Insert a halt so that NIR will delete any instructions
          * dominated by this call instruction including the scratch_load
          * instructions we inserted.
          */
         nir_cf_list cf_list;
         nir_cf_extract(&cf_list, nir_after_instr(&resume->instr),
                        nir_after_block(block));
         nir_cf_delete(&cf_list);
         b.cursor = nir_instr_remove(&resume->instr);
         nir_jump(&b, nir_jump_halt);
         /* The rest of this block was just deleted; move to the next one. */
         break;
      }
   }
}
|
|
|
|
/* Pass data for lower_stack_instr_to_scratch(): selects how stack offsets
 * are turned into memory accesses (global pointer vs. scratch offset).
 */
struct lower_scratch_state {
   nir_address_format address_format;
};
|
|
|
|
/* Instruction-pass callback: lowers a single load_stack/store_stack
 * intrinsic either to a global memory access relative to the scratch base
 * pointer (nir_address_format_64bit_global) or to a scratch access
 * (nir_address_format_32bit_offset).  `data` is a lower_scratch_state.
 */
static bool
lower_stack_instr_to_scratch(struct nir_builder *b, nir_instr *instr, void *data)
{
   struct lower_scratch_state *state = data;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *stack = nir_instr_as_intrinsic(instr);
   switch (stack->intrinsic) {
   case nir_intrinsic_load_stack: {
      b->cursor = nir_instr_remove(instr);
      nir_def *data, *old_data = nir_instr_def(instr);

      if (state->address_format == nir_address_format_64bit_global) {
         /* The stack slot lives at scratch_base_ptr + base offset. */
         nir_def *addr = nir_iadd_imm(b,
                                      nir_load_scratch_base_ptr(b, 1, 64, 1),
                                      nir_intrinsic_base(stack));
         data = nir_build_load_global(b,
                                      stack->def.num_components,
                                      stack->def.bit_size,
                                      addr,
                                      .align_mul = nir_intrinsic_align_mul(stack),
                                      .align_offset = nir_intrinsic_align_offset(stack));
      } else {
         assert(state->address_format == nir_address_format_32bit_offset);
         data = nir_load_scratch(b,
                                 old_data->num_components,
                                 old_data->bit_size,
                                 nir_imm_int(b, nir_intrinsic_base(stack)),
                                 .align_mul = nir_intrinsic_align_mul(stack),
                                 .align_offset = nir_intrinsic_align_offset(stack));
      }
      nir_def_rewrite_uses(old_data, data);
      break;
   }

   case nir_intrinsic_store_stack: {
      b->cursor = nir_instr_remove(instr);
      nir_def *data = stack->src[0].ssa;

      if (state->address_format == nir_address_format_64bit_global) {
         nir_def *addr = nir_iadd_imm(b,
                                      nir_load_scratch_base_ptr(b, 1, 64, 1),
                                      nir_intrinsic_base(stack));
         nir_store_global(b, addr,
                          nir_intrinsic_align_mul(stack),
                          data,
                          nir_component_mask(data->num_components));
      } else {
         assert(state->address_format == nir_address_format_32bit_offset);
         nir_store_scratch(b, data,
                           nir_imm_int(b, nir_intrinsic_base(stack)),
                           .align_mul = nir_intrinsic_align_mul(stack),
                           .write_mask = BITFIELD_MASK(data->num_components));
      }
      break;
   }

   default:
      return false;
   }

   return true;
}
|
|
|
|
static bool
|
|
nir_lower_stack_to_scratch(nir_shader *shader,
|
|
nir_address_format address_format)
|
|
{
|
|
struct lower_scratch_state state = {
|
|
.address_format = address_format,
|
|
};
|
|
|
|
return nir_shader_instructions_pass(shader,
|
|
lower_stack_instr_to_scratch,
|
|
nir_metadata_block_index |
|
|
nir_metadata_dominance,
|
|
&state);
|
|
}
|
|
|
|
static bool
|
|
opt_remove_respills_instr(struct nir_builder *b,
|
|
nir_intrinsic_instr *store_intrin, void *data)
|
|
{
|
|
if (store_intrin->intrinsic != nir_intrinsic_store_stack)
|
|
return false;
|
|
|
|
nir_instr *value_instr = store_intrin->src[0].ssa->parent_instr;
|
|
if (value_instr->type != nir_instr_type_intrinsic)
|
|
return false;
|
|
|
|
nir_intrinsic_instr *load_intrin = nir_instr_as_intrinsic(value_instr);
|
|
if (load_intrin->intrinsic != nir_intrinsic_load_stack)
|
|
return false;
|
|
|
|
if (nir_intrinsic_base(load_intrin) != nir_intrinsic_base(store_intrin))
|
|
return false;
|
|
|
|
nir_instr_remove(&store_intrin->instr);
|
|
return true;
|
|
}
|
|
|
|
/* After shader split, look at stack load/store operations. If we're loading
|
|
* and storing the same value at the same location, we can drop the store
|
|
* instruction.
|
|
*/
|
|
static bool
|
|
nir_opt_remove_respills(nir_shader *shader)
|
|
{
|
|
return nir_shader_intrinsics_pass(shader, opt_remove_respills_instr,
|
|
nir_metadata_block_index |
|
|
nir_metadata_dominance,
|
|
NULL);
|
|
}
|
|
|
|
static void
|
|
add_use_mask(struct hash_table_u64 *offset_to_mask,
|
|
unsigned offset, unsigned mask)
|
|
{
|
|
uintptr_t old_mask = (uintptr_t)
|
|
_mesa_hash_table_u64_search(offset_to_mask, offset);
|
|
|
|
_mesa_hash_table_u64_insert(offset_to_mask, offset,
|
|
(void *)(uintptr_t)(old_mask | mask));
|
|
}
|
|
|
|
/* When splitting the shaders, we might have inserted stores & loads of
 * vec4s, because a live value is 4 components. But sometimes, only some
 * components of that vec4 will be used after the scratch load. This pass
 * removes the unused components of scratch load/stores.
 */
static bool
nir_opt_trim_stack_values(nir_shader *shader)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   /* Maps value_id -> bitmask of components actually read after the load. */
   struct hash_table_u64 *value_id_to_mask = _mesa_hash_table_u64_create(NULL);
   bool progress = false;

   /* Find all the loads and how their value is being used */
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_stack)
            continue;

         const unsigned value_id = nir_intrinsic_value_id(intrin);

         const unsigned mask =
            nir_def_components_read(nir_instr_def(instr));
         add_use_mask(value_id_to_mask, value_id, mask);
      }
   }

   /* For each store, if it stores more than is being used, trim it.
    * Otherwise, remove it from the hash table.
    */
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_stack)
            continue;

         const unsigned value_id = nir_intrinsic_value_id(intrin);

         const unsigned write_mask = nir_intrinsic_write_mask(intrin);
         const unsigned read_mask = (uintptr_t)
            _mesa_hash_table_u64_search(value_id_to_mask, value_id);

         /* Already removed from the table, nothing to do */
         if (read_mask == 0)
            continue;

         /* Matching read/write mask, nothing to do, remove from the table. */
         if (write_mask == read_mask) {
            _mesa_hash_table_u64_remove(value_id_to_mask, value_id);
            continue;
         }

         /* Shrink the store to just the components that are read. */
         nir_builder b = nir_builder_at(nir_before_instr(instr));

         nir_def *value = nir_channels(&b, intrin->src[0].ssa, read_mask);
         nir_src_rewrite(&intrin->src[0], value);

         intrin->num_components = util_bitcount(read_mask);
         nir_intrinsic_set_write_mask(intrin, (1u << intrin->num_components) - 1);

         progress = true;
      }
   }

   /* For each load remaining in the hash table (only the ones we changed the
    * number of components of), apply trimming/reswizzle.
    */
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_stack)
            continue;

         const unsigned value_id = nir_intrinsic_value_id(intrin);

         unsigned read_mask = (uintptr_t)
            _mesa_hash_table_u64_search(value_id_to_mask, value_id);
         if (read_mask == 0)
            continue;

         /* swiz_map: old component index -> new (packed) component index */
         unsigned swiz_map[NIR_MAX_VEC_COMPONENTS] = {
            0,
         };
         unsigned swiz_count = 0;
         u_foreach_bit(idx, read_mask)
            swiz_map[idx] = swiz_count++;

         nir_def *def = nir_instr_def(instr);

         /* Remap every use of the load to the packed component layout. */
         nir_foreach_use_safe(use_src, def) {
            if (nir_src_parent_instr(use_src)->type == nir_instr_type_alu) {
               nir_alu_instr *alu = nir_instr_as_alu(nir_src_parent_instr(use_src));
               nir_alu_src *alu_src = exec_node_data(nir_alu_src, use_src, src);

               unsigned count = alu->def.num_components;
               for (unsigned idx = 0; idx < count; ++idx)
                  alu_src->swizzle[idx] = swiz_map[alu_src->swizzle[idx]];
            } else if (nir_src_parent_instr(use_src)->type == nir_instr_type_intrinsic) {
               nir_intrinsic_instr *use_intrin =
                  nir_instr_as_intrinsic(nir_src_parent_instr(use_src));
               assert(nir_intrinsic_has_write_mask(use_intrin));
               unsigned write_mask = nir_intrinsic_write_mask(use_intrin);
               unsigned new_write_mask = 0;
               u_foreach_bit(idx, write_mask)
                  new_write_mask |= 1 << swiz_map[idx];
               nir_intrinsic_set_write_mask(use_intrin, new_write_mask);
            } else {
               unreachable("invalid instruction type");
            }
         }

         intrin->def.num_components = intrin->num_components = swiz_count;

         progress = true;
      }
   }

   nir_metadata_preserve(impl,
                         progress ? (nir_metadata_dominance |
                                     nir_metadata_block_index |
                                     nir_metadata_loop_analysis)
                                  : nir_metadata_all);

   _mesa_hash_table_u64_destroy(value_id_to_mask);

   return progress;
}
|
|
|
|
/* Book-keeping for one stack slot while repacking the stack. */
struct scratch_item {
   unsigned old_offset;     /* base offset before repacking */
   unsigned new_offset;     /* packed offset assigned after sorting */
   unsigned bit_size;       /* bit size of the stored value */
   unsigned num_components; /* number of vector components */
   unsigned value;          /* value id of the load/store intrinsic */
   unsigned call_idx;       /* call index (not referenced by the visible packing code) */
};
|
|
|
|
static int
|
|
sort_scratch_item_by_size_and_value_id(const void *_item1, const void *_item2)
|
|
{
|
|
const struct scratch_item *item1 = _item1;
|
|
const struct scratch_item *item2 = _item2;
|
|
|
|
/* By ascending value_id */
|
|
if (item1->bit_size == item2->bit_size)
|
|
return (int)item1->value - (int)item2->value;
|
|
|
|
/* By descending size */
|
|
return (int)item2->bit_size - (int)item1->bit_size;
|
|
}
|
|
|
|
/* Packs stack slots tightly per call site: for each call index, gathers all
 * load_stack items, sorts them by descending bit size (then ascending value
 * id), assigns new tightly-packed offsets starting at start_call_scratch,
 * and rewrites the base offsets and stack sizes on the affected intrinsics.
 * Also updates shader->scratch_size (aligned to stack_alignment).
 */
static bool
nir_opt_sort_and_pack_stack(nir_shader *shader,
                            unsigned start_call_scratch,
                            unsigned stack_alignment,
                            unsigned num_calls)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   void *mem_ctx = ralloc_context(NULL);

   struct hash_table_u64 *value_id_to_item =
      _mesa_hash_table_u64_create(mem_ctx);
   struct util_dynarray ops;
   util_dynarray_init(&ops, mem_ctx);

   for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) {
      _mesa_hash_table_u64_clear(value_id_to_item);
      util_dynarray_clear(&ops);

      /* Find all the stack load and their offset. */
      nir_foreach_block_safe(block, impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_stack)
               continue;

            if (nir_intrinsic_call_idx(intrin) != call_idx)
               continue;

            const unsigned value_id = nir_intrinsic_value_id(intrin);
            nir_def *def = nir_instr_def(instr);

            /* Each value id should only be collected once per call. */
            assert(_mesa_hash_table_u64_search(value_id_to_item,
                                               value_id) == NULL);

            struct scratch_item item = {
               .old_offset = nir_intrinsic_base(intrin),
               .bit_size = def->bit_size,
               .num_components = def->num_components,
               .value = value_id,
            };

            util_dynarray_append(&ops, struct scratch_item, item);
            _mesa_hash_table_u64_insert(value_id_to_item, value_id, (void *)(uintptr_t) true);
         }
      }

      /* Sort scratch item by component size. */
      if (util_dynarray_num_elements(&ops, struct scratch_item)) {
         qsort(util_dynarray_begin(&ops),
               util_dynarray_num_elements(&ops, struct scratch_item),
               sizeof(struct scratch_item),
               sort_scratch_item_by_size_and_value_id);
      }

      /* Reorder things on the stack */
      _mesa_hash_table_u64_clear(value_id_to_item);

      /* Assign packed offsets, aligning each item to its byte size. */
      unsigned scratch_size = start_call_scratch;
      util_dynarray_foreach(&ops, struct scratch_item, item) {
         item->new_offset = ALIGN(scratch_size, item->bit_size / 8);
         scratch_size = item->new_offset + (item->bit_size * item->num_components) / 8;
         _mesa_hash_table_u64_insert(value_id_to_item, item->value, item);
      }
      shader->scratch_size = ALIGN(scratch_size, stack_alignment);

      /* Update offsets in the instructions */
      nir_foreach_block_safe(block, impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            switch (intrin->intrinsic) {
            case nir_intrinsic_load_stack:
            case nir_intrinsic_store_stack: {
               if (nir_intrinsic_call_idx(intrin) != call_idx)
                  continue;

               struct scratch_item *item =
                  _mesa_hash_table_u64_search(value_id_to_item,
                                              nir_intrinsic_value_id(intrin));
               assert(item);

               nir_intrinsic_set_base(intrin, item->new_offset);
               break;
            }

            case nir_intrinsic_rt_trace_ray:
            case nir_intrinsic_rt_execute_callable:
            case nir_intrinsic_rt_resume:
               if (nir_intrinsic_call_idx(intrin) != call_idx)
                  continue;
               nir_intrinsic_set_stack_size(intrin, shader->scratch_size);
               break;

            default:
               break;
            }
         }
      }
   }

   ralloc_free(mem_ctx);

   nir_shader_preserve_all_metadata(shader);

   return true;
}
|
|
|
|
static unsigned
|
|
nir_block_loop_depth(nir_block *block)
|
|
{
|
|
nir_cf_node *node = &block->cf_node;
|
|
unsigned loop_depth = 0;
|
|
|
|
while (node != NULL) {
|
|
if (node->type == nir_cf_node_loop)
|
|
loop_depth++;
|
|
node = node->parent;
|
|
}
|
|
|
|
return loop_depth;
|
|
}
|
|
|
|
/* Find the last block dominating all the uses of a SSA value.
 *
 * Walks the blocks in reverse order and returns the first (i.e. latest)
 * block that dominates every use of `value` without being deeper in loop
 * nesting than the value's current block.
 */
static nir_block *
find_last_dominant_use_block(nir_function_impl *impl, nir_def *value)
{
   nir_block *old_block = value->parent_instr->block;
   unsigned old_block_loop_depth = nir_block_loop_depth(old_block);

   nir_foreach_block_reverse_safe(block, impl) {
      bool fits = true;

      /* Store on the current block of the value */
      if (block == old_block)
         return block;

      /* Don't move instructions deeper into loops, this would generate more
       * memory traffic.
       */
      unsigned block_loop_depth = nir_block_loop_depth(block);
      if (block_loop_depth > old_block_loop_depth)
         continue;

      /* If-condition uses must be dominated via the block preceding the if. */
      nir_foreach_if_use(src, value) {
         nir_block *block_before_if =
            nir_cf_node_as_block(nir_cf_node_prev(&nir_src_parent_if(src)->cf_node));
         if (!nir_block_dominates(block, block_before_if)) {
            fits = false;
            break;
         }
      }
      if (!fits)
         continue;

      nir_foreach_use(src, value) {
         /* A phi use in this very block would precede anything we insert
          * here, so the block doesn't fit.
          */
         if (nir_src_parent_instr(src)->type == nir_instr_type_phi &&
             block == nir_src_parent_instr(src)->block) {
            fits = false;
            break;
         }

         if (!nir_block_dominates(block, nir_src_parent_instr(src)->block)) {
            fits = false;
            break;
         }
      }
      if (!fits)
         continue;

      return block;
   }
   unreachable("Cannot find block");
}
|
|
|
|
/* Put the scratch loads in the branches where they're needed.
 *
 * Sinks every load_stack down to the last block that still dominates all of
 * its uses (without going deeper into loops), so loads only execute on paths
 * that actually need the value.
 */
static bool
nir_opt_stack_loads(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function_impl(impl, shader) {
      nir_metadata_require(impl, nir_metadata_dominance |
                                 nir_metadata_block_index);

      bool func_progress = false;
      nir_foreach_block_safe(block, impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_stack)
               continue;

            nir_def *value = &intrin->def;
            nir_block *new_block = find_last_dominant_use_block(impl, value);
            if (new_block == block)
               continue;

            /* Move the scratch load in the new block, after the phis. */
            nir_instr_remove(instr);
            nir_instr_insert(nir_before_block_after_phis(new_block), instr);

            func_progress = true;
         }
      }

      nir_metadata_preserve(impl,
                            func_progress ? (nir_metadata_block_index |
                                             nir_metadata_dominance |
                                             nir_metadata_loop_analysis)
                                          : nir_metadata_all);

      progress |= func_progress;
   }

   return progress;
}
|
|
|
|
/* Intrinsic-pass callback: splits one multi-component load_stack or
 * store_stack into a series of scalar stack operations, preserving base,
 * call_idx, value_id and per-component alignment.
 */
static bool
split_stack_components_instr(struct nir_builder *b,
                             nir_intrinsic_instr *intrin, void *data)
{
   if (intrin->intrinsic != nir_intrinsic_load_stack &&
       intrin->intrinsic != nir_intrinsic_store_stack)
      return false;

   /* Scalar operations are already in the desired form. */
   if (intrin->intrinsic == nir_intrinsic_load_stack &&
       intrin->def.num_components == 1)
      return false;

   if (intrin->intrinsic == nir_intrinsic_store_stack &&
       intrin->src[0].ssa->num_components == 1)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   unsigned align_mul = nir_intrinsic_align_mul(intrin);
   unsigned align_offset = nir_intrinsic_align_offset(intrin);
   if (intrin->intrinsic == nir_intrinsic_load_stack) {
      /* Load each component separately, then rebuild the vector. */
      nir_def *components[NIR_MAX_VEC_COMPONENTS] = {
         0,
      };
      for (unsigned c = 0; c < intrin->def.num_components; c++) {
         unsigned offset = c * intrin->def.bit_size / 8;
         components[c] = nir_load_stack(b, 1, intrin->def.bit_size,
                                        .base = nir_intrinsic_base(intrin) + offset,
                                        .call_idx = nir_intrinsic_call_idx(intrin),
                                        .value_id = nir_intrinsic_value_id(intrin),
                                        .align_mul = align_mul,
                                        .align_offset = (align_offset + offset) % align_mul);
      }

      nir_def_rewrite_uses(&intrin->def,
                           nir_vec(b, components,
                                   intrin->def.num_components));
   } else {
      assert(intrin->intrinsic == nir_intrinsic_store_stack);
      for (unsigned c = 0; c < intrin->src[0].ssa->num_components; c++) {
         unsigned offset = c * intrin->src[0].ssa->bit_size / 8;
         nir_store_stack(b, nir_channel(b, intrin->src[0].ssa, c),
                         .base = nir_intrinsic_base(intrin) + offset,
                         .call_idx = nir_intrinsic_call_idx(intrin),
                         .align_mul = align_mul,
                         .align_offset = (align_offset + offset) % align_mul,
                         .value_id = nir_intrinsic_value_id(intrin),
                         .write_mask = 0x1);
      }
   }

   nir_instr_remove(&intrin->instr);

   return true;
}
|
|
|
|
/* Break the load_stack/store_stack intrinsics into single compoments. This
|
|
* helps the vectorizer to pack components.
|
|
*/
|
|
static bool
|
|
nir_split_stack_components(nir_shader *shader)
|
|
{
|
|
return nir_shader_intrinsics_pass(shader, split_stack_components_instr,
|
|
nir_metadata_block_index |
|
|
nir_metadata_dominance,
|
|
NULL);
|
|
}
|
|
|
|
/* Pass data for should_vectorize(): the driver's vectorization callback and
 * its opaque user data.
 */
struct stack_op_vectorizer_state {
   nir_should_vectorize_mem_func driver_callback;
   void *driver_data;
};
|
|
|
|
static bool
|
|
should_vectorize(unsigned align_mul,
|
|
unsigned align_offset,
|
|
unsigned bit_size,
|
|
unsigned num_components,
|
|
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
|
|
void *data)
|
|
{
|
|
/* We only care about those intrinsics */
|
|
if ((low->intrinsic != nir_intrinsic_load_stack &&
|
|
low->intrinsic != nir_intrinsic_store_stack) ||
|
|
(high->intrinsic != nir_intrinsic_load_stack &&
|
|
high->intrinsic != nir_intrinsic_store_stack))
|
|
return false;
|
|
|
|
struct stack_op_vectorizer_state *state = data;
|
|
|
|
return state->driver_callback(align_mul, align_offset,
|
|
bit_size, num_components,
|
|
low, high, state->driver_data);
|
|
}
|
|
|
|
/** Lower shader call instructions to split shaders.
 *
 * Shader calls can be split into an initial shader and a series of "resume"
 * shaders.  When the shader is first invoked, it is the initial shader which
 * is executed.  At any point in the initial shader or any one of the resume
 * shaders, a shader call operation may be performed.  The possible shader call
 * operations are:
 *
 *  - trace_ray
 *  - report_ray_intersection
 *  - execute_callable
 *
 * When a shader call operation is performed, we push all live values to the
 * stack, call rt_trace_ray/rt_execute_callable and then kill the shader.  Once
 * the operation we invoked is complete, a callee shader will return execution
 * to the respective resume shader.  The resume shader pops the contents off
 * the stack and picks up where the calling shader left off.
 *
 * Stack management is assumed to be done after this pass.  Call
 * instructions and their resumes get annotated with stack information that
 * should be enough for the backend to implement proper stack management.
 *
 * Returns false (preserving all metadata) when the shader contains no call
 * instructions; otherwise returns true and stores one resume shader per call
 * in *resume_shaders_out (allocated out of mem_ctx).
 */
bool
nir_lower_shader_calls(nir_shader *shader,
                       const nir_lower_shader_calls_options *options,
                       nir_shader ***resume_shaders_out,
                       uint32_t *num_resume_shaders_out,
                       void *mem_ctx)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);

   /* Count the call sites first: one resume shader is produced per call. */
   int num_calls = 0;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr_is_shader_call(instr))
            num_calls++;
      }
   }

   /* No calls: nothing to lower, nothing was touched.
    * NOTE(review): *resume_shaders_out is left unset on this path — callers
    * presumably check the return value / count first; confirm.
    */
   if (num_calls == 0) {
      nir_shader_preserve_all_metadata(shader);
      *num_resume_shaders_out = 0;
      return false;
   }

   /* Some intrinsics not only can't be re-materialized but aren't preserved
    * when moving to the continuation shader. We have to move them to the top
    * to ensure they get spilled as needed.
    */
   {
      bool progress = false;
      NIR_PASS(progress, shader, move_system_values_to_top);
      /* Moving may have created duplicate loads; clean them up. */
      if (progress)
         NIR_PASS(progress, shader, nir_opt_cse);
   }

   /* Deref chains contain metadata information that is needed by other passes
    * after this one. If we don't rematerialize the derefs in the blocks where
    * they're used here, the following lowerings will insert phis which can
    * prevent other passes from chasing deref chains. Additionally, derefs need
    * to be rematerialized after shader call instructions to avoid spilling.
    */
   {
      bool progress = false;
      NIR_PASS(progress, shader, wrap_instrs, instr_is_shader_call);

      nir_rematerialize_derefs_in_use_blocks_impl(impl);

      if (progress)
         NIR_PASS(_, shader, nir_opt_dead_cf);
   }

   /* Save the start point of the call stack in scratch */
   unsigned start_call_scratch = shader->scratch_size;

   /* Core lowering: spill SSA defs live across each call to the stack and
    * annotate calls/resume points.
    */
   NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls,
              num_calls, options);

   NIR_PASS_V(shader, nir_opt_remove_phis);

   /* Shrink spilled values to the bits actually used, then lay them out
    * compactly in the stack area starting at start_call_scratch.
    */
   NIR_PASS_V(shader, nir_opt_trim_stack_values);
   NIR_PASS_V(shader, nir_opt_sort_and_pack_stack,
              start_call_scratch, options->stack_alignment, num_calls);

   /* Make N copies of our shader */
   nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls);
   for (unsigned i = 0; i < num_calls; i++) {
      resume_shaders[i] = nir_shader_clone(mem_ctx, shader);

      /* Give them a recognizable name */
      resume_shaders[i]->info.name =
         ralloc_asprintf(mem_ctx, "%s%sresume_%u",
                         shader->info.name ? shader->info.name : "",
                         shader->info.name ? "-" : "",
                         i);
   }

   /* In the initial shader, every resume point is dead code: execution ends
    * at the call itself.
    */
   replace_resume_with_halt(shader, NULL);
   nir_opt_dce(shader);
   nir_opt_dead_cf(shader);
   /* In resume shader i, keep only resume point i alive and strip the rest. */
   for (unsigned i = 0; i < num_calls; i++) {
      nir_instr *resume_instr = lower_resume(resume_shaders[i], i);
      replace_resume_with_halt(resume_shaders[i], resume_instr);
      /* Remove CF after halt before nir_opt_if(). */
      nir_opt_dead_cf(resume_shaders[i]);
      /* Remove the dummy blocks added by flatten_resume_if_ladder() */
      nir_opt_if(resume_shaders[i], nir_opt_if_optimize_phi_true_false);
      nir_opt_dce(resume_shaders[i]);
      nir_opt_dead_cf(resume_shaders[i]);
      nir_opt_remove_phis(resume_shaders[i]);
   }

   /* Drop stores that just write back a value that was only just reloaded
    * from the stack.
    */
   for (unsigned i = 0; i < num_calls; i++)
      NIR_PASS_V(resume_shaders[i], nir_opt_remove_respills);

   if (options->localized_loads) {
      /* Once loads have been combined we can try to put them closer to where
       * they're needed.
       */
      for (unsigned i = 0; i < num_calls; i++)
         NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads);
   }

   /* Route the generic load/store vectorizer through the driver's callback;
    * should_vectorize() restricts it to stack intrinsics only.
    */
   struct stack_op_vectorizer_state vectorizer_state = {
      .driver_callback = options->vectorizer_callback,
      .driver_data = options->vectorizer_data,
   };
   nir_load_store_vectorize_options vect_opts = {
      .modes = nir_var_shader_temp,
      .callback = should_vectorize,
      .cb_data = &vectorizer_state,
   };

   /* Scalarize then re-vectorize stack accesses (if the driver opted in),
    * then turn the abstract stack ops into real scratch accesses.  Done for
    * the initial shader and each resume shader alike.
    */
   if (options->vectorizer_callback != NULL) {
      NIR_PASS_V(shader, nir_split_stack_components);
      NIR_PASS_V(shader, nir_opt_load_store_vectorize, &vect_opts);
   }
   NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format);
   nir_opt_cse(shader);
   for (unsigned i = 0; i < num_calls; i++) {
      if (options->vectorizer_callback != NULL) {
         NIR_PASS_V(resume_shaders[i], nir_split_stack_components);
         NIR_PASS_V(resume_shaders[i], nir_opt_load_store_vectorize, &vect_opts);
      }
      NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch,
                 options->address_format);
      nir_opt_cse(resume_shaders[i]);
   }

   *resume_shaders_out = resume_shaders;
   *num_resume_shaders_out = num_calls;

   return true;
}
|