From c78be5da300ae386a12b91a22efb064335e2043a Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Mon, 21 Jun 2021 13:44:53 +0300
Subject: [PATCH] intel/fs: lower ray query intrinsics

v2: Add helper for acceleration->root_node computation (Caio)

v3: Update comment on "done" bit (Caio)
    Remove progress bool value for impl function (Caio)
    Don't use nir_shader_instructions_pass to search the shader (Caio)

v4: Rename variable for if/else block (Caio)

Signed-off-by: Lionel Landwerlin
Reviewed-by: Caio Oliveira
Part-of:
---
 src/compiler/nir/nir_intrinsics.py            |   1 +
 src/compiler/nir/nir_lower_shader_calls.c     |   1 +
 src/intel/compiler/brw_nir.c                  |   3 +
 .../compiler/brw_nir_lower_ray_queries.c      | 590 ++++++++++++++++++
 .../compiler/brw_nir_lower_shader_calls.c     |   7 +-
 src/intel/compiler/brw_nir_rt.h               |   3 +
 src/intel/compiler/brw_nir_rt_builder.h       |  28 +
 src/intel/compiler/brw_rt.h                   |  40 ++
 src/intel/compiler/meson.build                |   1 +
 9 files changed, 668 insertions(+), 6 deletions(-)
 create mode 100644 src/intel/compiler/brw_nir_lower_ray_queries.c

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index f41864f8e36..1a5f7545685 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1392,3 +1392,4 @@ system_value("leaf_procedural_intel", 1, bit_sizes=[1])
 # 2: Miss
 # 3: Intersection
 system_value("btd_shader_type_intel", 1)
+system_value("ray_query_global_intel", 1, bit_sizes=[64])
diff --git a/src/compiler/nir/nir_lower_shader_calls.c b/src/compiler/nir/nir_lower_shader_calls.c
index abe29580234..bf6f763ff9d 100644
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@@ -177,6 +177,7 @@ can_remat_instr(nir_instr *instr, struct brw_bitset *remat)
    case nir_intrinsic_load_callable_sbt_addr_intel:
    case nir_intrinsic_load_callable_sbt_stride_intel:
    case nir_intrinsic_load_reloc_const_intel:
+   case nir_intrinsic_load_ray_query_global_intel:
       /* Notably missing from the above list is btd_local_arg_addr_intel.
        * This is because the resume shader will have a different local
        * argument pointer because it has a different BSR.
        * Any access of the original shader's local arguments has to be
        * spilled and filled like any other value rather than rematerialized
        * in the resume shader.
        */
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 9998edfc735..800f99e69e3 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -22,6 +22,7 @@
  */
 
 #include "brw_nir.h"
+#include "brw_nir_rt.h"
 #include "brw_shader.h"
 #include "dev/intel_debug.h"
 #include "compiler/glsl_types.h"
@@ -547,6 +548,8 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
       OPT(nir_opt_dead_write_vars);
       OPT(nir_opt_combine_stores, nir_var_all);
 
+      OPT(nir_opt_ray_queries);
+
       if (is_scalar) {
          OPT(nir_lower_alu_to_scalar, NULL, NULL);
       } else {
diff --git a/src/intel/compiler/brw_nir_lower_ray_queries.c b/src/intel/compiler/brw_nir_lower_ray_queries.c
new file mode 100644
index 00000000000..60c248fe72a
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_ray_queries.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir_rt.h"
+#include "brw_nir_rt_builder.h"
+
+#include "nir_deref.h"
+
+#include "util/macros.h"
+
+struct lowering_state {
+   const struct intel_device_info *devinfo;
+
+   struct hash_table *queries;
+   uint32_t n_queries;
+
+   struct brw_nir_rt_globals_defs globals;
+   nir_ssa_def *rq_globals;
+};
+
+struct brw_ray_query {
+   nir_variable *opaque_var;
+   uint32_t id;
+};
+
+static bool
+need_spill_fill(struct lowering_state *state)
+{
+   return state->n_queries > 1;
+}
+
+/**
+ * This pass converts opaque RayQuery structures from SPIR-V into a vec3:
+ * the first two elements store a global address for the query and the
+ * third element is a counter incremented once per executed
+ * nir_intrinsic_rq_proceed.
+ */
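For illustration, here is a standalone C sketch (hypothetical names, not part of the patch) of the id bookkeeping that maybe_create_brw_var() performs just below: each ray query variable reserves one slot id per array element, and a scalar query (aoa_size == 0) reserves a single slot:

    #include <stdint.h>

    /* Mirrors the state->n_queries accounting: an array of N queries
     * reserves N consecutive ids, a scalar query reserves one.
     */
    static uint32_t
    reserve_query_ids(uint32_t *n_queries, uint32_t aoa_size)
    {
       uint32_t first_id = *n_queries;
       *n_queries += aoa_size > 1 ? aoa_size : 1;   /* MAX2(1, aoa_size) */
       return first_id;
    }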
+
+static bool
+maybe_create_brw_var(nir_instr *instr, struct lowering_state *state)
+{
+   if (instr->type != nir_instr_type_deref)
+      return false;
+
+   nir_deref_instr *deref = nir_instr_as_deref(instr);
+   if (deref->deref_type != nir_deref_type_var &&
+       deref->deref_type != nir_deref_type_array)
+      return false;
+
+   nir_variable *opaque_var = nir_deref_instr_get_variable(deref);
+   if (!opaque_var || !opaque_var->data.ray_query)
+      return false;
+
+   struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
+   if (entry)
+      return false;
+
+   struct brw_ray_query *rq = rzalloc(state->queries, struct brw_ray_query);
+   rq->opaque_var = opaque_var;
+   rq->id = state->n_queries;
+
+   _mesa_hash_table_insert(state->queries, opaque_var, rq);
+
+   unsigned aoa_size = glsl_get_aoa_size(opaque_var->type);
+   state->n_queries += MAX2(1, aoa_size);
+
+   return true;
+}
+
+static nir_ssa_def *
+get_ray_query_shadow_addr(nir_builder *b,
+                          nir_deref_instr *deref,
+                          struct lowering_state *state,
+                          nir_ssa_def **out_state_addr)
+{
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+   assert(path.path[0]->deref_type == nir_deref_type_var);
+
+   nir_variable *opaque_var = nir_deref_instr_get_variable(path.path[0]);
+   struct hash_entry *entry = _mesa_hash_table_search(state->queries, opaque_var);
+   assert(entry);
+
+   struct brw_ray_query *rq = entry->data;
+
+   /* Base address in the shadow memory of the variable associated with this
+    * ray query variable.
+    */
+   nir_ssa_def *base_addr =
+      nir_iadd_imm(b, state->globals.resume_sbt_addr,
+                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) * rq->id);
+
+   bool spill_fill = need_spill_fill(state);
+   *out_state_addr =
+      spill_fill ?
+      nir_iadd_imm(b,
+                   state->globals.resume_sbt_addr,
+                   brw_rt_ray_queries_shadow_stack_size(state->devinfo) *
+                   b->shader->info.ray_queries +
+                   4 * rq->id) :
+      state->globals.resume_sbt_addr;
+
+   if (!spill_fill)
+      return NULL;
+
+   /* Just emit code and let constant-folding go to town */
+   nir_deref_instr **p = &path.path[1];
+   for (; *p; p++) {
+      if ((*p)->deref_type == nir_deref_type_array) {
+         nir_ssa_def *index = nir_ssa_for_src(b, (*p)->arr.index, 1);
+
+         /* Offset of this array element in the ctrl/level state data */
+         uint32_t local_state_offset = 4 * MAX2(1, glsl_get_aoa_size((*p)->type));
+         *out_state_addr =
+            nir_iadd(b, *out_state_addr,
+                     nir_i2i64(b,
+                               nir_imul_imm(b, index, local_state_offset)));
+
+         /* Offset of this array element in the shadow stack memory */
+         uint64_t size = MAX2(1, glsl_get_aoa_size((*p)->type)) *
+            brw_rt_ray_queries_shadow_stack_size(state->devinfo);
+
+         nir_ssa_def *mul = nir_amul_imm(b, nir_i2i64(b, index), size);
+
+         base_addr = nir_iadd(b, base_addr, mul);
+      } else {
+         unreachable("Unsupported deref type");
+      }
+   }
+
+   nir_deref_path_finish(&path);
+
+   /* Add the lane offset to the shadow memory address */
+   nir_ssa_def *lane_offset =
+      nir_imul_imm(
+         b,
+         nir_iadd(
+            b,
+            nir_imul(
+               b,
+               brw_load_btd_dss_id(b),
+               brw_nir_rt_load_num_simd_lanes_per_dss(b, state->devinfo)),
+            brw_nir_rt_sync_stack_id(b)),
+         BRW_RT_SIZEOF_SHADOW_RAY_QUERY);
+
+   return nir_iadd(b, base_addr, nir_i2i64(b, lane_offset));
+}
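The address arithmetic above, restated as a standalone C sketch with plain integers instead of NIR SSA values (all parameter names are hypothetical): a query's shadow slot is its per-variable base plus a per-lane offset derived from the DSS id and the sync stack id:

    #include <stdint.h>

    static uint64_t
    shadow_query_addr(uint64_t shadow_base,          /* globals.resume_sbt_addr */
                      uint32_t query_id,             /* brw_ray_query::id */
                      uint64_t shadow_stack_size,    /* brw_rt_ray_queries_shadow_stack_size() */
                      uint32_t dss_id,
                      uint32_t simd_lanes_per_dss,
                      uint32_t sync_stack_id,
                      uint32_t sizeof_shadow_query)  /* BRW_RT_SIZEOF_SHADOW_RAY_QUERY */
    {
       /* Per-variable block, then one slot per hardware lane inside it */
       uint64_t base_addr = shadow_base + shadow_stack_size * query_id;
       uint64_t lane_offset =
          (uint64_t)(dss_id * simd_lanes_per_dss + sync_stack_id) *
          sizeof_shadow_query;
       return base_addr + lane_offset;
    }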
+
+static void
+update_trace_ctrl_level(nir_builder *b,
+                        nir_ssa_def *state_addr,
+                        nir_ssa_def **out_old_ctrl,
+                        nir_ssa_def **out_old_level,
+                        nir_ssa_def *new_ctrl,
+                        nir_ssa_def *new_level)
+{
+   /* The ctrl/level state is packed in a single dword: level in bits 1:0,
+    * ctrl in the bits above.
+    */
+   nir_ssa_def *old_value = brw_nir_rt_load(b, state_addr, 4, 1, 32);
+   nir_ssa_def *old_ctrl = nir_ishr_imm(b, old_value, 2);
+   nir_ssa_def *old_level = nir_iand_imm(b, old_value, 0x3);
+
+   if (out_old_ctrl)
+      *out_old_ctrl = old_ctrl;
+   if (out_old_level)
+      *out_old_level = old_level;
+
+   if (new_ctrl || new_level) {
+      if (!new_ctrl)
+         new_ctrl = old_ctrl;
+      if (!new_level)
+         new_level = old_level;
+
+      nir_ssa_def *new_value = nir_ior(b, nir_ishl_imm(b, new_ctrl, 2), new_level);
+      brw_nir_rt_store(b, state_addr, 4, new_value, 0x1);
+   }
+}
+
+static void
+fill_query(nir_builder *b,
+           nir_ssa_def *hw_stack_addr,
+           nir_ssa_def *shadow_stack_addr,
+           nir_ssa_def *ctrl)
+{
+   brw_nir_memcpy_global(b,
+                         brw_nir_rt_mem_hit_addr_from_addr(b, hw_stack_addr, false), 16,
+                         brw_nir_rt_mem_hit_addr_from_addr(b, shadow_stack_addr, false), 16,
+                         BRW_RT_SIZEOF_HIT_INFO);
+   brw_nir_memcpy_global(b,
+                         brw_nir_rt_mem_hit_addr_from_addr(b, hw_stack_addr, true), 16,
+                         brw_nir_rt_mem_hit_addr_from_addr(b, shadow_stack_addr, true), 16,
+                         BRW_RT_SIZEOF_HIT_INFO);
+   brw_nir_memcpy_global(b,
+                         brw_nir_rt_mem_ray_addr(b, hw_stack_addr,
+                                                 BRW_RT_BVH_LEVEL_WORLD), 16,
+                         brw_nir_rt_mem_ray_addr(b, shadow_stack_addr,
+                                                 BRW_RT_BVH_LEVEL_WORLD), 16,
+                         BRW_RT_SIZEOF_RAY);
+}
+
+static void
+spill_query(nir_builder *b,
+            nir_ssa_def *hw_stack_addr,
+            nir_ssa_def *shadow_stack_addr)
+{
+   struct brw_nir_rt_mem_hit_defs committed_hit = {};
+   brw_nir_rt_load_mem_hit_from_addr(b, &committed_hit, hw_stack_addr, true);
+
+   /* Always copy the potential hit back */
+   brw_nir_memcpy_global(b,
+                         brw_nir_rt_mem_hit_addr_from_addr(b, shadow_stack_addr, false), 16,
+                         brw_nir_rt_mem_hit_addr_from_addr(b, hw_stack_addr, false), 16,
+                         BRW_RT_SIZEOF_HIT_INFO);
+
+   /* Also copy the committed hit back if it is valid */
+   nir_push_if(b, committed_hit.valid);
+   {
+      brw_nir_memcpy_global(b,
+                            brw_nir_rt_mem_hit_addr_from_addr(b, shadow_stack_addr, true), 16,
+                            brw_nir_rt_mem_hit_addr_from_addr(b, hw_stack_addr, true), 16,
+                            BRW_RT_SIZEOF_HIT_INFO);
+   }
+   nir_pop_if(b, NULL);
+}
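update_trace_ctrl_level() above packs both values into a single dword. A standalone sketch of that encoding, assuming only the two low bits carry the BVH level:

    #include <stdint.h>

    static uint32_t
    pack_ctrl_level(uint32_t ctrl, uint32_t level)
    {
       return (ctrl << 2) | (level & 0x3);   /* level: bits 1:0, ctrl: bits 31:2 */
    }

    static void
    unpack_ctrl_level(uint32_t value, uint32_t *ctrl, uint32_t *level)
    {
       *ctrl = value >> 2;
       *level = value & 0x3;
    }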
+
+static void
+lower_ray_query_intrinsic(nir_builder *b,
+                          nir_intrinsic_instr *intrin,
+                          struct lowering_state *state)
+{
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+   b->cursor = nir_instr_remove(&intrin->instr);
+
+   nir_ssa_def *ctrl_level_addr;
+   nir_ssa_def *shadow_stack_addr =
+      get_ray_query_shadow_addr(b, deref, state, &ctrl_level_addr);
+   nir_ssa_def *hw_stack_addr =
+      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr, state->devinfo);
+   nir_ssa_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_rq_initialize: {
+      nir_ssa_def *as_addr = intrin->src[1].ssa;
+      nir_ssa_def *ray_flags = intrin->src[2].ssa;
+      /* From the SPIR-V spec:
+       *
+       *    "Only the 8 least-significant bits of Cull Mask are used by
+       *    this instruction - other bits are ignored.
+       *
+       *    Only the 16 least-significant bits of Miss Index are used by
+       *    this instruction - other bits are ignored."
+       */
+      nir_ssa_def *cull_mask = nir_iand_imm(b, intrin->src[3].ssa, 0xff);
+      nir_ssa_def *ray_orig = intrin->src[4].ssa;
+      nir_ssa_def *ray_t_min = intrin->src[5].ssa;
+      nir_ssa_def *ray_dir = intrin->src[6].ssa;
+      nir_ssa_def *ray_t_max = intrin->src[7].ssa;
+
+      nir_ssa_def *root_node_ptr =
+         brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
+
+      struct brw_nir_rt_mem_ray_defs ray_defs = {
+         .root_node_ptr = root_node_ptr,
+         .ray_flags = nir_u2u16(b, ray_flags),
+         .ray_mask = cull_mask,
+         .orig = ray_orig,
+         .t_near = ray_t_min,
+         .dir = ray_dir,
+         .t_far = ray_t_max,
+      };
+
+      nir_ssa_def *ray_addr =
+         brw_nir_rt_mem_ray_addr(b, stack_addr, BRW_RT_BVH_LEVEL_WORLD);
+
+      brw_nir_rt_query_mark_init(b, stack_addr);
+      brw_nir_rt_init_mem_hit_at_addr(b, stack_addr, false, ray_t_max);
+      brw_nir_rt_init_mem_hit_at_addr(b, stack_addr, true, ray_t_max);
+      brw_nir_rt_store_mem_ray_query_at_addr(b, ray_addr, &ray_defs);
+
+      update_trace_ctrl_level(b, ctrl_level_addr,
+                              NULL, NULL,
+                              nir_imm_int(b, GEN_RT_TRACE_RAY_INITAL),
+                              nir_imm_int(b, BRW_RT_BVH_LEVEL_WORLD));
+      break;
+   }
+
+   case nir_intrinsic_rq_proceed: {
+      nir_ssa_def *not_done =
+         nir_inot(b, brw_nir_rt_query_done(b, stack_addr));
+      nir_ssa_def *not_done_then, *not_done_else;
+
+      nir_push_if(b, not_done);
+      {
+         nir_ssa_def *ctrl, *level;
+         update_trace_ctrl_level(b, ctrl_level_addr,
+                                 &ctrl, &level,
+                                 NULL,
+                                 NULL);
+
+         /* Mark the query as done before handing it over to the HW for
+          * processing. If the HW makes any progress, it will write back
+          * some data and, as a side effect, clear the "done" bit. If no
+          * progress is made, the HW does not write anything back and we
+          * can use this bit to detect that.
+          */
+         brw_nir_rt_query_mark_done(b, stack_addr);
+
+         if (shadow_stack_addr)
+            fill_query(b, hw_stack_addr, shadow_stack_addr, ctrl);
+
+         nir_trace_ray_intel(b, state->rq_globals, level, ctrl, .synchronous = true);
+
+         struct brw_nir_rt_mem_hit_defs hit_in = {};
+         brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, hw_stack_addr, false);
+
+         if (shadow_stack_addr)
+            spill_query(b, hw_stack_addr, shadow_stack_addr);
+
+         update_trace_ctrl_level(b, ctrl_level_addr,
+                                 NULL, NULL,
+                                 nir_imm_int(b, GEN_RT_TRACE_RAY_CONTINUE),
+                                 nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
+
+         not_done_then = nir_inot(b, hit_in.done);
+      }
+      nir_push_else(b, NULL);
+      {
+         not_done_else = nir_imm_false(b);
+      }
+      nir_pop_if(b, NULL);
+      not_done = nir_if_phi(b, not_done_then, not_done_else);
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, not_done);
+      break;
+   }
+
+   case nir_intrinsic_rq_confirm_intersection: {
+      brw_nir_memcpy_global(b,
+                            brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true), 16,
+                            brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false), 16,
+                            BRW_RT_SIZEOF_HIT_INFO);
+      update_trace_ctrl_level(b, ctrl_level_addr,
+                              NULL, NULL,
+                              nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
+                              nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
+      break;
+   }
+
+   case nir_intrinsic_rq_generate_intersection: {
+      brw_nir_rt_generate_hit_addr(b, stack_addr, intrin->src[1].ssa);
+      update_trace_ctrl_level(b, ctrl_level_addr,
+                              NULL, NULL,
+                              nir_imm_int(b, GEN_RT_TRACE_RAY_COMMIT),
+                              nir_imm_int(b, BRW_RT_BVH_LEVEL_OBJECT));
+      break;
+   }
+
+   case nir_intrinsic_rq_terminate: {
+      brw_nir_rt_query_mark_done(b, stack_addr);
+      break;
+   }
+
+   case nir_intrinsic_rq_load: {
+      const bool committed = nir_src_as_bool(intrin->src[1]);
+
+      struct brw_nir_rt_mem_ray_defs world_ray_in = {};
+      struct brw_nir_rt_mem_ray_defs object_ray_in = {};
+      struct brw_nir_rt_mem_hit_defs hit_in = {};
+      brw_nir_rt_load_mem_ray_from_addr(b, &world_ray_in, stack_addr,
+                                        BRW_RT_BVH_LEVEL_WORLD);
+      brw_nir_rt_load_mem_ray_from_addr(b, &object_ray_in, stack_addr,
+                                        BRW_RT_BVH_LEVEL_OBJECT);
+      brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr, committed);
+
+      nir_ssa_def *sysval = NULL;
+      switch (nir_intrinsic_base(intrin)) {
+      case nir_ray_query_value_intersection_type:
+         if (committed) {
+            /* Values we want to generate:
+             *
+             *    RayQueryCommittedIntersectionNoneEXT = 0U
+             *       <= hit_in.valid == false
+             *    RayQueryCommittedIntersectionTriangleEXT = 1U
+             *       <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_QUAD (4)
+             *    RayQueryCommittedIntersectionGeneratedEXT = 2U
+             *       <= hit_in.leaf_type == BRW_RT_BVH_NODE_TYPE_PROCEDURAL (3)
+             */
+            sysval =
+               nir_bcsel(b, nir_ieq(b, hit_in.leaf_type, nir_imm_int(b, 4)),
+                         nir_imm_int(b, 1), nir_imm_int(b, 2));
+            sysval =
+               nir_bcsel(b, hit_in.valid,
+                         sysval, nir_imm_int(b, 0));
+         } else {
+            /* 0 -> triangle, 1 -> AABB */
+            sysval =
+               nir_b2i32(b,
+                         nir_ieq(b, hit_in.leaf_type,
+                                 nir_imm_int(b, BRW_RT_BVH_NODE_TYPE_PROCEDURAL)));
+         }
+         break;
+
+      case nir_ray_query_value_intersection_t:
+         sysval = hit_in.t;
+         break;
+
+      case nir_ray_query_value_intersection_instance_custom_index: {
+         struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+         brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
+         sysval = leaf.instance_id;
+         break;
+      }
+
+      case nir_ray_query_value_intersection_instance_id: {
+         struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+         brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
+         sysval = leaf.instance_index;
+         break;
+      }
+
+      case nir_ray_query_value_intersection_instance_sbt_index: {
+         struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+         brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
+         sysval = leaf.contribution_to_hit_group_index;
+         break;
+      }
+
+      case nir_ray_query_value_intersection_geometry_index: {
+         nir_ssa_def *geometry_index_dw =
+            nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
+                            1, 32);
+         sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
+         break;
+      }
+
+      case nir_ray_query_value_intersection_primitive_index:
+         sysval = brw_nir_rt_load_primitive_id_from_hit(b, NULL /* is_procedural */, &hit_in);
+         break;
+
+      case nir_ray_query_value_intersection_barycentrics:
+         sysval = hit_in.tri_bary;
+         break;
+
+      case nir_ray_query_value_intersection_front_face:
+         sysval = hit_in.front_face;
+         break;
+
+      case nir_ray_query_value_intersection_object_ray_direction:
+         sysval = world_ray_in.dir;
+         break;
+
+      case nir_ray_query_value_intersection_object_ray_origin:
+         sysval = world_ray_in.orig;
+         break;
+
+      case nir_ray_query_value_intersection_object_to_world: {
+         struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+         brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
+         sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
+         break;
+      }
+
+      case nir_ray_query_value_intersection_world_to_object: {
+         struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+         brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
+         sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
+         break;
+      }
+
+      case nir_ray_query_value_intersection_candidate_aabb_opaque:
+         sysval = hit_in.front_face;
+         break;
+
+      case nir_ray_query_value_tmin:
+         sysval = world_ray_in.t_near;
+         break;
+
+      case nir_ray_query_value_flags:
+         sysval = nir_u2u32(b, world_ray_in.ray_flags);
+         break;
+
+      case nir_ray_query_value_world_ray_direction:
+         sysval = world_ray_in.dir;
+         break;
+
+      case nir_ray_query_value_world_ray_origin:
+         sysval = world_ray_in.orig;
+         break;
+
+      default:
+         unreachable("Invalid ray query");
+      }
+
+      assert(sysval);
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, sysval);
+      break;
+   }
+
+   default:
+      unreachable("Invalid intrinsic");
+   }
+}
+
+static void
+lower_ray_query_impl(nir_function_impl *impl, struct lowering_state *state)
+{
+   nir_builder _b, *b = &_b;
+   nir_builder_init(&_b, impl);
+
+   b->cursor = nir_before_block(nir_start_block(b->impl));
+
+   state->rq_globals = nir_load_ray_query_global_intel(b);
+
+   brw_nir_rt_load_globals_addr(b, &state->globals, state->rq_globals);
+
+   nir_foreach_block_safe(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         if (intrin->intrinsic != nir_intrinsic_rq_initialize &&
+             intrin->intrinsic != nir_intrinsic_rq_terminate &&
+             intrin->intrinsic != nir_intrinsic_rq_proceed &&
+             intrin->intrinsic != nir_intrinsic_rq_generate_intersection &&
+             intrin->intrinsic != nir_intrinsic_rq_confirm_intersection &&
+             intrin->intrinsic != nir_intrinsic_rq_load)
+            continue;
+
+         lower_ray_query_intrinsic(b, intrin, state);
+      }
+   }
+
+   nir_metadata_preserve(impl, nir_metadata_none);
+}
+
+bool
+brw_nir_lower_ray_queries(nir_shader *shader,
+                          const struct intel_device_info *devinfo)
+{
+   struct lowering_state state = {
+      .devinfo = devinfo,
+      .queries = _mesa_pointer_hash_table_create(NULL),
+   };
+
+   assert(exec_list_length(&shader->functions) == 1);
+
+   /* Find query variables */
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      nir_foreach_block_safe(block, function->impl) {
+         nir_foreach_instr(instr, block)
+            maybe_create_brw_var(instr, &state);
+      }
+   }
+
+   if (_mesa_hash_table_num_entries(state.queries) > 0) {
+      nir_foreach_function(function, shader) {
+         if (function->impl)
+            lower_ray_query_impl(function->impl, &state);
+      }
+
+      nir_remove_dead_derefs(shader);
+      nir_remove_dead_variables(shader,
+                                nir_var_shader_temp | nir_var_function_temp,
+                                NULL);
+   }
+
+   ralloc_free(state.queries);
+
+   return true;
+}
diff --git a/src/intel/compiler/brw_nir_lower_shader_calls.c b/src/intel/compiler/brw_nir_lower_shader_calls.c
index f94788dfe30..fa048510590 100644
--- a/src/intel/compiler/brw_nir_lower_shader_calls.c
+++ b/src/intel/compiler/brw_nir_lower_shader_calls.c
@@ -163,13 +163,8 @@ lower_shader_calls_instr(struct nir_builder *b, nir_instr *instr, void *data)
    nir_ssa_def *ray_dir = call->src[8].ssa;
    nir_ssa_def *ray_t_max = call->src[9].ssa;
 
-   /* The hardware packet takes the address to the root node in the
-    * acceleration structure, not the acceleration structure itself. To
-    * find that, we have to read the root node offset from the acceleration
-    * structure which is the first QWord.
-    */
    nir_ssa_def *root_node_ptr =
-      nir_iadd(b, as_addr, nir_load_global(b, as_addr, 256, 1, 64));
+      brw_nir_rt_acceleration_structure_to_root_node(b, as_addr);
 
    /* The hardware packet requires an address to the first element of the
     * hit SBT.
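For context, a driver is expected to run the new pass once over each shader that uses ray queries; a hypothetical call site (the exact pipeline position and variable names are illustrative, not part of this patch) might look like:

    /* Lower ray queries after the main optimization loop, before the RT
     * intrinsics lowering (hypothetical placement). */
    NIR_PASS_V(nir, brw_nir_lower_ray_queries, devinfo);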
diff --git a/src/intel/compiler/brw_nir_rt.h b/src/intel/compiler/brw_nir_rt.h
index 53b90e2e549..a3c6461207b 100644
--- a/src/intel/compiler/brw_nir_rt.h
+++ b/src/intel/compiler/brw_nir_rt.h
@@ -49,6 +49,9 @@ void brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
 /* We require the stack to be 8B aligned at the start of a shader */
 #define BRW_BTD_STACK_ALIGN 8
 
+bool brw_nir_lower_ray_queries(nir_shader *shader,
+                               const struct intel_device_info *devinfo);
+
 void brw_nir_lower_shader_returns(nir_shader *shader);
 
 bool brw_nir_lower_shader_calls(nir_shader *shader);
diff --git a/src/intel/compiler/brw_nir_rt_builder.h b/src/intel/compiler/brw_nir_rt_builder.h
index 86de1ad0e88..9d5d2eb2353 100644
--- a/src/intel/compiler/brw_nir_rt_builder.h
+++ b/src/intel/compiler/brw_nir_rt_builder.h
@@ -907,4 +907,32 @@ brw_nir_rt_load_primitive_id_from_hit(nir_builder *b,
                          4, /* align */
                          1, 32);
 }
+
+static inline nir_ssa_def *
+brw_nir_rt_acceleration_structure_to_root_node(nir_builder *b,
+                                               nir_ssa_def *as_addr)
+{
+   /* The HW memory structure in which we specify what acceleration
+    * structure to traverse takes the address of the root node in the
+    * acceleration structure, not the acceleration structure itself. To
+    * find that, we have to read the root node offset from the acceleration
+    * structure, which is its first QWord.
+    *
+    * If the acceleration structure pointer is NULL, we should also return
+    * a NULL root node pointer.
+    */
+   nir_ssa_def *root_node_ptr, *null_node_ptr;
+   nir_push_if(b, nir_ieq(b, as_addr, nir_imm_int64(b, 0)));
+   {
+      null_node_ptr = nir_imm_int64(b, 0);
+   }
+   nir_push_else(b, NULL);
+   {
+      root_node_ptr =
+         nir_iadd(b, as_addr, brw_nir_rt_load(b, as_addr, 256, 1, 64));
+   }
+   nir_pop_if(b, NULL);
+
+   return nir_if_phi(b, null_node_ptr, root_node_ptr);
+}
+
 #endif /* BRW_NIR_RT_BUILDER_H */
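A CPU-side sketch of what brw_nir_rt_acceleration_structure_to_root_node() emits, assuming the layout its comment describes (the first QWord of the acceleration structure holds the root node offset); the function name here is hypothetical:

    #include <stdint.h>

    static uint64_t
    as_to_root_node(uint64_t as_addr)
    {
       if (as_addr == 0)
          return 0;   /* NULL acceleration structure -> NULL root node */
       uint64_t root_offset = *(const uint64_t *)(uintptr_t)as_addr;
       return as_addr + root_offset;
    }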
diff --git a/src/intel/compiler/brw_rt.h b/src/intel/compiler/brw_rt.h
index add5ec66629..2452d72f120 100644
--- a/src/intel/compiler/brw_rt.h
+++ b/src/intel/compiler/brw_rt.h
@@ -31,6 +31,9 @@ extern "C" {
 /** Vulkan defines shaderGroupHandleSize = 32 */
 #define BRW_RT_SBT_HANDLE_SIZE 32
 
+/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
+#define BRW_RT_DISPATCH_GLOBALS_SIZE 80
+
 /** Offset after the RT dispatch globals at which "push" constants live */
 #define BRW_RT_PUSH_CONST_OFFSET 128
 
@@ -177,6 +180,10 @@ struct brw_rt_raygen_trampoline_params {
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))
 
+#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
+   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
+    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)
+
 #define BRW_RT_SIZEOF_HW_STACK \
    (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
@@ -228,6 +235,39 @@ brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
    layout->total_size = size;
 }
 
+static inline uint32_t
+brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
+{
+   /* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
+    * which includes all the threads.
+    */
+   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
+   uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
+   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY;
+}
+
+static inline uint32_t
+brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
+{
+   /* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
+    * which includes all the threads.
+    */
+   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
+   uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
+   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
+}
+
+static inline uint32_t
+brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
+                                      uint32_t ray_queries)
+{
+   /* Don't bother with a shadow stack if we only have a single query; we
+    * can write directly into the HW buffer.
+    */
+   return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
+          ray_queries * 4; /* Ctrl + Level data */
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 8668d082d83..8c3d55e11f3 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -87,6 +87,7 @@ libintel_compiler_files = files(
   'brw_nir_lower_alpha_to_coverage.c',
   'brw_nir_lower_intersection_shader.c',
   'brw_nir_lower_mem_access_bit_sizes.c',
+  'brw_nir_lower_ray_queries.c',
   'brw_nir_lower_rt_intrinsics.c',
   'brw_nir_lower_scoped_barriers.c',
   'brw_nir_lower_shader_calls.c',
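Finally, a standalone sketch of the allocation math brw_rt_ray_queries_shadow_stacks_size() performs (parameter names hypothetical, not part of the patch): with a single query only the 4-byte ctrl/level slot is needed, while two or more queries additionally get one full shadow stack each:

    #include <stdint.h>

    static uint32_t
    shadow_stacks_size(uint32_t max_eu_id,            /* max_scratch_ids[MESA_SHADER_COMPUTE] */
                       uint32_t sizeof_shadow_query,  /* BRW_RT_SIZEOF_SHADOW_RAY_QUERY */
                       uint32_t ray_queries)
    {
       const uint32_t max_simd_size = 16;   /* no SIMD32 with ray queries */
       uint32_t per_query = max_eu_id * max_simd_size * sizeof_shadow_query;
       return (ray_queries > 1 ? ray_queries : 0) * per_query +
              ray_queries * 4;   /* one ctrl + level dword per query */
    }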