diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 90b365c341d..c131dcb384d 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1096,3 +1096,9 @@
 system_value("btd_local_arg_addr_intel", 1, bit_sizes=[64])
 intrinsic("btd_spawn_intel", src_comp=[1, 1]) # src[] = { }
 intrinsic("btd_retire_intel")
+
+# System values used for ray-tracing on Intel
+system_value("ray_base_mem_addr_intel", 1, bit_sizes=[64])
+system_value("ray_hw_stack_size_intel", 1)
+system_value("ray_sw_stack_size_intel", 1)
+system_value("ray_num_dss_rt_stacks_intel", 1)
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 3459161e91f..6b9300ddd91 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -96,6 +96,7 @@ COMPILER_FILES = \
 	compiler/brw_nir_lower_mem_access_bit_sizes.c \
 	compiler/brw_nir_lower_scoped_barriers.c \
 	compiler/brw_nir_opt_peephole_ffma.c \
+	compiler/brw_nir_rt_builder.h \
 	compiler/brw_nir_tcs_workarounds.c \
 	compiler/brw_packed_float.c \
 	compiler/brw_predicated_break.cpp \
diff --git a/src/intel/compiler/brw_nir_rt_builder.h b/src/intel/compiler/brw_nir_rt_builder.h
new file mode 100644
index 00000000000..14c1cd21e3f
--- /dev/null
+++ b/src/intel/compiler/brw_nir_rt_builder.h
@@ -0,0 +1,437 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_NIR_RT_BUILDER_H
+#define BRW_NIR_RT_BUILDER_H
+
+#include "brw_rt.h"
+#include "nir_builder.h"
+
+static inline nir_ssa_def *
+nir_load_global_const_block_intel(nir_builder *b, nir_ssa_def *addr,
+                                  unsigned num_components)
+{
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(b->shader,
+                                 nir_intrinsic_load_global_const_block_intel);
+   load->src[0] = nir_src_for_ssa(addr);
+   load->num_components = num_components;
+   nir_ssa_dest_init(&load->instr, &load->dest, num_components, 32, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+   return &load->dest.ssa;
+}
+
+static inline void
+assert_def_size(nir_ssa_def *def, unsigned num_components, unsigned bit_size)
+{
+   assert(def->num_components == num_components);
+   assert(def->bit_size == bit_size);
+}
+
+static inline nir_ssa_def *
+brw_nir_num_rt_stacks(nir_builder *b,
+                      const struct gen_device_info *devinfo)
+{
+   return nir_imul_imm(b, nir_load_ray_num_dss_rt_stacks_intel(b),
+                          gen_device_info_num_dual_subslices(devinfo));
+}
+
+static inline nir_ssa_def *
+brw_nir_rt_stack_id(nir_builder *b)
+{
+   return nir_iadd(b, nir_umul_32x16(b, nir_load_ray_num_dss_rt_stacks_intel(b),
+                                        nir_load_btd_dss_id_intel(b)),
+                      nir_load_btd_stack_id_intel(b));
+}
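+
+/* A worked example of the stack-ID computation above, with purely
+ * hypothetical numbers (the real values come from the
+ * ray_num_dss_rt_stacks and btd_dss_id/btd_stack_id system values at
+ * runtime):
+ *
+ *    numDSSRTStacks = 2048, DSSID = 3, stackID = 5
+ *    global stack ID = 2048 * 3 + 5 = 6149
+ *
+ * The helpers below turn such a global stack ID into addresses in the
+ * ray-tracing scratch memory: SW hotzones sit immediately below
+ * rtMemBasePtr, the per-stack HW memory starts at rtMemBasePtr, and the
+ * SW stacks follow after all of the HW stacks.
+ */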
+
+static inline nir_ssa_def *
+brw_nir_rt_sw_hotzone_addr(nir_builder *b,
+                           const struct gen_device_info *devinfo)
+{
+   nir_ssa_def *offset32 =
+      nir_imul_imm(b, brw_nir_rt_stack_id(b), BRW_RT_SIZEOF_HOTZONE);
+
+   offset32 = nir_iadd(b, offset32, nir_ineg(b,
+      nir_imul_imm(b, brw_nir_num_rt_stacks(b, devinfo),
+                      BRW_RT_SIZEOF_HOTZONE)));
+
+   return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
+                      nir_i2i64(b, offset32));
+}
+
+static inline nir_ssa_def *
+brw_nir_rt_ray_addr(nir_builder *b)
+{
+   /* From the BSpec "Address Computation for Memory Based Data Structures:
+    * Ray and TraversalStack (Async Ray Tracing)":
+    *
+    *    stackBase = RTDispatchGlobals.rtMemBasePtr
+    *              + (DSSID * RTDispatchGlobals.numDSSRTStacks + stackID)
+    *              * RTDispatchGlobals.stackSizePerRay // 64B aligned
+    *
+    * We assume that we can calculate a 32-bit offset first and then add it
+    * to the 64-bit base address at the end.
+    */
+   nir_ssa_def *offset32 =
+      nir_imul(b, brw_nir_rt_stack_id(b),
+                  nir_load_ray_hw_stack_size_intel(b));
+   return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
+                      nir_u2u64(b, offset32));
+}
+
+static inline nir_ssa_def *
+brw_nir_rt_mem_hit_addr(nir_builder *b, bool committed)
+{
+   return nir_iadd_imm(b, brw_nir_rt_ray_addr(b),
+                          committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
+}
+
+static inline nir_ssa_def *
+brw_nir_rt_hit_attrib_data_addr(nir_builder *b)
+{
+   return nir_iadd_imm(b, brw_nir_rt_ray_addr(b),
+                          BRW_RT_OFFSETOF_HIT_ATTRIB_DATA);
+}
+
+static inline nir_ssa_def *
+brw_nir_rt_mem_ray_addr(nir_builder *b,
+                        enum brw_rt_bvh_level bvh_level)
+{
+   /* From the BSpec "Address Computation for Memory Based Data Structures:
+    * Ray and TraversalStack (Async Ray Tracing)":
+    *
+    *    rayBase = stackBase + sizeof(HitInfo) * 2 // 64B aligned
+    *    rayPtr  = rayBase + bvhLevel * sizeof(Ray); // 64B aligned
+    *
+    * In Vulkan, we always have exactly two levels of BVH: World and Object.
+    */
+   uint32_t offset = BRW_RT_SIZEOF_HIT_INFO * 2 +
+                     bvh_level * BRW_RT_SIZEOF_RAY;
+   return nir_iadd_imm(b, brw_nir_rt_ray_addr(b), offset);
+}
+
+static inline nir_ssa_def *
+brw_nir_rt_sw_stack_addr(nir_builder *b,
+                         const struct gen_device_info *devinfo)
+{
+   nir_ssa_def *addr = nir_load_ray_base_mem_addr_intel(b);
+
+   nir_ssa_def *offset32 = nir_imul(b, brw_nir_num_rt_stacks(b, devinfo),
+                                       nir_load_ray_hw_stack_size_intel(b));
+   addr = nir_iadd(b, addr, nir_u2u64(b, offset32));
+
+   return nir_iadd(b, addr,
+                      nir_imul(b, nir_u2u64(b, brw_nir_rt_stack_id(b)),
+                                  nir_u2u64(b, nir_load_ray_sw_stack_size_intel(b))));
+}
+
+static inline nir_ssa_def *
+nir_unpack_64_4x16_split_z(nir_builder *b, nir_ssa_def *val)
+{
+   return nir_unpack_32_2x16_split_x(b, nir_unpack_64_2x32_split_y(b, val));
+}
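+
+/* Layout of the RTDispatchGlobals structure, as unpacked by
+ * brw_nir_rt_load_globals() below.  This is a summary of the unpacking
+ * code, not a BSpec quote; offsets are in 32-bit dwords from the BTD
+ * global argument address, and dwords not listed are not read here:
+ *
+ *    dw 0- 1: rtMemBasePtr        (64-bit address)
+ *    dw 2- 3: callStackHandler    (64-bit address)
+ *    dw 4:    per-ray HW stack size
+ *    dw 5:    numDSSRTStacks      (low 16 bits)
+ *    dw 8- 9: hit SBT             (48-bit address + 16-bit stride)
+ *    dw10-11: miss SBT            (48-bit address + 16-bit stride)
+ *    dw12:    per-ray SW stack size
+ *    dw13-15: launch size         (x, y, z)
+ *    dw16-17: call SBT            (48-bit address + 16-bit stride)
+ *    dw18-19: resume SBT base     (64-bit address)
+ */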
+
+struct brw_nir_rt_globals_defs {
+   nir_ssa_def *base_mem_addr;
+   nir_ssa_def *call_stack_handler_addr;
+   nir_ssa_def *hw_stack_size;
+   nir_ssa_def *num_dss_rt_stacks;
+   nir_ssa_def *hit_sbt_addr;
+   nir_ssa_def *hit_sbt_stride;
+   nir_ssa_def *miss_sbt_addr;
+   nir_ssa_def *miss_sbt_stride;
+   nir_ssa_def *sw_stack_size;
+   nir_ssa_def *launch_size;
+   nir_ssa_def *call_sbt_addr;
+   nir_ssa_def *call_sbt_stride;
+   nir_ssa_def *resume_sbt_addr;
+};
+
+static inline void
+brw_nir_rt_load_globals(nir_builder *b,
+                        struct brw_nir_rt_globals_defs *defs)
+{
+   nir_ssa_def *addr = nir_load_btd_global_arg_addr_intel(b);
+
+   nir_ssa_def *data;
+   data = nir_load_global_const_block_intel(b, addr, 16);
+   defs->base_mem_addr = nir_pack_64_2x32(b, nir_channels(b, data, 0x3));
+
+   defs->call_stack_handler_addr =
+      nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
+
+   defs->hw_stack_size = nir_channel(b, data, 4);
+   defs->num_dss_rt_stacks = nir_iand_imm(b, nir_channel(b, data, 5), 0xffff);
+   defs->hit_sbt_addr =
+      nir_pack_64_2x32_split(b, nir_channel(b, data, 8),
+                                nir_extract_i16(b, nir_channel(b, data, 9),
+                                                   nir_imm_int(b, 0)));
+   defs->hit_sbt_stride =
+      nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 9));
+   defs->miss_sbt_addr =
+      nir_pack_64_2x32_split(b, nir_channel(b, data, 10),
+                                nir_extract_i16(b, nir_channel(b, data, 11),
+                                                   nir_imm_int(b, 0)));
+   defs->miss_sbt_stride =
+      nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 11));
+   defs->sw_stack_size = nir_channel(b, data, 12);
+   defs->launch_size = nir_channels(b, data, 0x7u << 13);
+
+   data = nir_load_global_const_block_intel(b, nir_iadd_imm(b, addr, 64), 8);
+   defs->call_sbt_addr =
+      nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
+                                nir_extract_i16(b, nir_channel(b, data, 1),
+                                                   nir_imm_int(b, 0)));
+   defs->call_sbt_stride =
+      nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));
+
+   defs->resume_sbt_addr =
+      nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
+}
+
+static inline nir_ssa_def *
+brw_nir_rt_unpack_leaf_ptr(nir_builder *b, nir_ssa_def *vec2)
+{
+   /* Hit record leaf pointers are 42-bit and assumed to be in 64B chunks.
+    * This leaves 22 bits at the top for other stuff.
+    */
+   nir_ssa_def *ptr64 = nir_imul_imm(b, nir_pack_64_2x32(b, vec2), 64);
+
+   /* The top 16 bits (remember, we shifted by 6 already) contain garbage
+    * that we need to get rid of.
+    */
+   nir_ssa_def *ptr_lo = nir_unpack_64_2x32_split_x(b, ptr64);
+   nir_ssa_def *ptr_hi = nir_unpack_64_2x32_split_y(b, ptr64);
+   ptr_hi = nir_extract_i16(b, ptr_hi, nir_imm_int(b, 0));
+   return nir_pack_64_2x32_split(b, ptr_lo, ptr_hi);
+}
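+
+/* Layout of the in-memory hit record, as decoded by
+ * brw_nir_rt_load_mem_hit() below (a summary of the unpacking code, not a
+ * BSpec quote; dword offsets from the hit address):
+ *
+ *    dw0:   t               hit distance
+ *    dw1-2: barycentrics    (triangles) / dw1: hit kind (AABBs)
+ *    dw3:   bitfield        leafType in bits 17-19, primLeafIndex in
+ *                           bits 20-23, frontFace in bit 27
+ *    dw4-5: primLeafPtr     (packed 42-bit pointer, 64B units)
+ *    dw6-7: instLeafPtr     (packed 42-bit pointer, 64B units)
+ */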
+
+struct brw_nir_rt_mem_hit_defs {
+   nir_ssa_def *t;
+   nir_ssa_def *tri_bary; /**< Only valid for triangle geometry */
+   nir_ssa_def *aabb_hit_kind; /**< Only valid for AABB geometry */
+   nir_ssa_def *leaf_type;
+   nir_ssa_def *prim_leaf_index;
+   nir_ssa_def *front_face;
+   nir_ssa_def *prim_leaf_ptr;
+   nir_ssa_def *inst_leaf_ptr;
+};
+
+static inline void
+brw_nir_rt_load_mem_hit(nir_builder *b,
+                        struct brw_nir_rt_mem_hit_defs *defs,
+                        bool committed)
+{
+   nir_ssa_def *hit_addr = brw_nir_rt_mem_hit_addr(b, committed);
+
+   nir_ssa_def *data = nir_load_global(b, hit_addr, 16, 4, 32);
+   defs->t = nir_channel(b, data, 0);
+   defs->aabb_hit_kind = nir_channel(b, data, 1);
+   defs->tri_bary = nir_channels(b, data, 0x6);
+   nir_ssa_def *bitfield = nir_channel(b, data, 3);
+   defs->leaf_type =
+      nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 17), nir_imm_int(b, 3));
+   defs->prim_leaf_index =
+      nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 20), nir_imm_int(b, 4));
+   defs->front_face = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 27));
+
+   data = nir_load_global(b, nir_iadd_imm(b, hit_addr, 16), 16, 4, 32);
+   defs->prim_leaf_ptr =
+      brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 0));
+   defs->inst_leaf_ptr =
+      brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 2));
+}
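+
+/* Layout of the 64-byte in-memory ray record, as written and read by the
+ * two helpers below (a summary of the packing code, not a BSpec quote;
+ * dword offsets from the ray address):
+ *
+ *    dw0-2:   origin       (x, y, z)
+ *    dw3-5:   direction    (x, y, z)
+ *    dw6:     t_near
+ *    dw7:     t_far
+ *    dw8-9:   rootNodePtr       (48-bit) + rayFlags          (16-bit)
+ *    dw10-11: hitGroupSRBasePtr (48-bit) + hitGroupSRStride  (16-bit)
+ *    dw12-13: missSRPtr         (48-bit) + shaderIndexMultiplier, stored
+ *             pre-shifted left by 8 in the top 16 bits of dw13
+ *    dw14-15: instLeafPtr       (48-bit) + rayMask
+ */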
+
+struct brw_nir_rt_mem_ray_defs {
+   nir_ssa_def *orig;
+   nir_ssa_def *dir;
+   nir_ssa_def *t_near;
+   nir_ssa_def *t_far;
+   nir_ssa_def *root_node_ptr;
+   nir_ssa_def *ray_flags;
+   nir_ssa_def *hit_group_sr_base_ptr;
+   nir_ssa_def *hit_group_sr_stride;
+   nir_ssa_def *miss_sr_ptr;
+   nir_ssa_def *shader_index_multiplier;
+   nir_ssa_def *inst_leaf_ptr;
+   nir_ssa_def *ray_mask;
+};
+
+static inline void
+brw_nir_rt_store_mem_ray(nir_builder *b,
+                         const struct brw_nir_rt_mem_ray_defs *defs,
+                         enum brw_rt_bvh_level bvh_level)
+{
+   nir_ssa_def *ray_addr = brw_nir_rt_mem_ray_addr(b, bvh_level);
+
+   assert_def_size(defs->orig, 3, 32);
+   assert_def_size(defs->dir, 3, 32);
+   nir_store_global(b, nir_iadd_imm(b, ray_addr, 0), 16,
+      nir_vec4(b, nir_channel(b, defs->orig, 0),
+                  nir_channel(b, defs->orig, 1),
+                  nir_channel(b, defs->orig, 2),
+                  nir_channel(b, defs->dir, 0)),
+      ~0 /* write mask */);
+
+   assert_def_size(defs->t_near, 1, 32);
+   assert_def_size(defs->t_far, 1, 32);
+   nir_store_global(b, nir_iadd_imm(b, ray_addr, 16), 16,
+      nir_vec4(b, nir_channel(b, defs->dir, 1),
+                  nir_channel(b, defs->dir, 2),
+                  defs->t_near,
+                  defs->t_far),
+      ~0 /* write mask */);
+
+   assert_def_size(defs->root_node_ptr, 1, 64);
+   assert_def_size(defs->ray_flags, 1, 16);
+   assert_def_size(defs->hit_group_sr_base_ptr, 1, 64);
+   assert_def_size(defs->hit_group_sr_stride, 1, 16);
+   nir_store_global(b, nir_iadd_imm(b, ray_addr, 32), 16,
+      nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
+                  nir_pack_32_2x16_split(b,
+                     nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
+                     defs->ray_flags),
+                  nir_unpack_64_2x32_split_x(b, defs->hit_group_sr_base_ptr),
+                  nir_pack_32_2x16_split(b,
+                     nir_unpack_64_4x16_split_z(b, defs->hit_group_sr_base_ptr),
+                     defs->hit_group_sr_stride)),
+      ~0 /* write mask */);
+
+   /* leaf_ptr is optional */
+   nir_ssa_def *inst_leaf_ptr;
+   if (defs->inst_leaf_ptr) {
+      inst_leaf_ptr = defs->inst_leaf_ptr;
+   } else {
+      inst_leaf_ptr = nir_imm_int64(b, 0);
+   }
+
+   assert_def_size(defs->miss_sr_ptr, 1, 64);
+   assert_def_size(defs->shader_index_multiplier, 1, 32);
+   assert_def_size(inst_leaf_ptr, 1, 64);
+   assert_def_size(defs->ray_mask, 1, 32);
+   nir_store_global(b, nir_iadd_imm(b, ray_addr, 48), 16,
+      nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->miss_sr_ptr),
+                  nir_pack_32_2x16_split(b,
+                     nir_unpack_64_4x16_split_z(b, defs->miss_sr_ptr),
+                     nir_unpack_32_2x16_split_x(b,
+                        nir_ishl(b, defs->shader_index_multiplier,
+                                    nir_imm_int(b, 8)))),
+                  nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
+                  nir_pack_32_2x16_split(b,
+                     nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
+                     nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
+      ~0 /* write mask */);
+}
+
+static inline void
+brw_nir_rt_load_mem_ray(nir_builder *b,
+                        struct brw_nir_rt_mem_ray_defs *defs,
+                        enum brw_rt_bvh_level bvh_level)
+{
+   nir_ssa_def *ray_addr = brw_nir_rt_mem_ray_addr(b, bvh_level);
+
+   nir_ssa_def *data[4] = {
+      nir_load_global(b, nir_iadd_imm(b, ray_addr,  0), 16, 4, 32),
+      nir_load_global(b, nir_iadd_imm(b, ray_addr, 16), 16, 4, 32),
+      nir_load_global(b, nir_iadd_imm(b, ray_addr, 32), 16, 4, 32),
+      nir_load_global(b, nir_iadd_imm(b, ray_addr, 48), 16, 4, 32),
+   };
+
+   defs->orig = nir_channels(b, data[0], 0x7);
+   defs->dir = nir_vec3(b, nir_channel(b, data[0], 3),
+                           nir_channel(b, data[1], 0),
+                           nir_channel(b, data[1], 1));
+   defs->t_near = nir_channel(b, data[1], 2);
+   defs->t_far = nir_channel(b, data[1], 3);
+   defs->root_node_ptr =
+      nir_pack_64_2x32_split(b, nir_channel(b, data[2], 0),
+                                nir_extract_i16(b, nir_channel(b, data[2], 1),
+                                                   nir_imm_int(b, 0)));
+   defs->ray_flags =
+      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 1));
+   defs->hit_group_sr_base_ptr =
+      nir_pack_64_2x32_split(b, nir_channel(b, data[2], 2),
+                                nir_extract_i16(b, nir_channel(b, data[2], 3),
+                                                   nir_imm_int(b, 0)));
+   defs->hit_group_sr_stride =
+      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 3));
+   defs->miss_sr_ptr =
+      nir_pack_64_2x32_split(b, nir_channel(b, data[3], 0),
+                                nir_extract_i16(b, nir_channel(b, data[3], 1),
+                                                   nir_imm_int(b, 0)));
+   defs->shader_index_multiplier =
+      nir_ushr(b, nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 1)),
+                  nir_imm_int(b, 8));
+   defs->inst_leaf_ptr =
+      nir_pack_64_2x32_split(b, nir_channel(b, data[3], 2),
+                                nir_extract_i16(b, nir_channel(b, data[3], 3),
+                                                   nir_imm_int(b, 0)));
+   defs->ray_mask =
+      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 3));
+}
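+
+/* Layout of the BVH instance leaf, as far as the helper below consumes it
+ * (a summary of the load code; byte offsets from the leaf address; the
+ * first 16 bytes and any fields not listed are not decoded here):
+ *
+ *    16- 51: world-to-object matrix, columns 0-2 (3 floats each)
+ *    52- 63: object-to-world matrix, column 3
+ *    72:     instance ID
+ *    76:     instance index
+ *    80-115: object-to-world matrix, columns 0-2
+ *   116-127: world-to-object matrix, column 3
+ */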
+
+struct brw_nir_rt_bvh_instance_leaf_defs {
+   nir_ssa_def *world_to_object[4];
+   nir_ssa_def *instance_id;
+   nir_ssa_def *instance_index;
+   nir_ssa_def *object_to_world[4];
+};
+
+static inline void
+brw_nir_rt_load_bvh_instance_leaf(nir_builder *b,
+                                  struct brw_nir_rt_bvh_instance_leaf_defs *defs,
+                                  nir_ssa_def *leaf_addr)
+{
+   /* We don't care about the first 16B of the leaf for now.  One day, we
+    * may add code to decode it, but none of that data is directly required
+    * for implementing any ray-tracing built-ins.
+    */
+
+   defs->world_to_object[0] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 16), 4, 3, 32);
+   defs->world_to_object[1] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 28), 4, 3, 32);
+   defs->world_to_object[2] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 40), 4, 3, 32);
+   /* The fourth column of each matrix is stored with the other matrix,
+    * presumably because that layout is more convenient for the hardware.
+    */
+   defs->object_to_world[3] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 52), 4, 3, 32);
+
+   nir_ssa_def *data =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 64), 4, 4, 32);
+   defs->instance_id = nir_channel(b, data, 2);
+   defs->instance_index = nir_channel(b, data, 3);
+
+   defs->object_to_world[0] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 80), 4, 3, 32);
+   defs->object_to_world[1] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 92), 4, 3, 32);
+   defs->object_to_world[2] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 104), 4, 3, 32);
+   defs->world_to_object[3] =
+      nir_load_global(b, nir_iadd_imm(b, leaf_addr, 116), 4, 3, 32);
+}
+
+#endif /* BRW_NIR_RT_BUILDER_H */
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index d554ea06bf8..8f00106a05d 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -87,6 +87,7 @@ libintel_compiler_files = files(
   'brw_nir_lower_mem_access_bit_sizes.c',
   'brw_nir_lower_scoped_barriers.c',
   'brw_nir_opt_peephole_ffma.c',
+  'brw_nir_rt_builder.h',
   'brw_nir_tcs_workarounds.c',
   'brw_nir_clamp_image_1d_2d_array_sizes.c',
   'brw_packed_float.c',