r600/sfn: Add lowering UBO access to r600 specific codes
r600 reads vec4 from the UBO, but the offsets in nir are evaluated to the component. If the offsets are not literal then all non-vec4 reads must resolve the component after reading a vec4 component (TODO: figure out whether there is a consistent way to deduce the component that is actually read). Signed-off-by: Gert Wollny <gert.wollny@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3225>
This commit is contained in:
parent
32d3435a78
commit
37125b7cc2
|
@ -866,6 +866,14 @@ load("output_u8_as_fp16_pan", 0, [], [CAN_ELIMINATE, CAN_REORDER])
|
|||
# src[] = { sampler_index }
|
||||
load("sampler_lod_parameters_pan", 1, [CAN_ELIMINATE, CAN_REORDER])
|
||||
|
||||
# R600 specific intrinsics
|
||||
#
|
||||
# R600 can only fetch 16 byte aligned data from a UBO, and the actual offset
|
||||
# is given in vec4 units, so we have to fetch a vec4 and get the component
|
||||
# later
|
||||
# src[] = { buffer_index, offset }.
|
||||
load("ubo_r600", 2, [ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE, CAN_REORDER])
|
||||
|
||||
# V3D-specific intrinsic for tile buffer color reads.
|
||||
#
|
||||
# The hardware requires that we read the samples and components of a pixel
|
||||
|
|
|
@ -531,6 +531,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
|
|||
|
||||
case nir_intrinsic_load_deref: {
|
||||
nir_deref_instr *src = nir_src_as_deref(instr->src[0]);
|
||||
assert(src);
|
||||
validate_assert(state, glsl_type_is_vector_or_scalar(src->type) ||
|
||||
(src->mode == nir_var_uniform &&
|
||||
glsl_get_base_type(src->type) == GLSL_TYPE_SUBROUTINE));
|
||||
|
@ -545,6 +546,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
|
|||
|
||||
case nir_intrinsic_store_deref: {
|
||||
nir_deref_instr *dst = nir_src_as_deref(instr->src[0]);
|
||||
assert(dst);
|
||||
validate_assert(state, glsl_type_is_vector_or_scalar(dst->type));
|
||||
validate_assert(state, instr->num_components ==
|
||||
glsl_get_vector_elements(dst->type));
|
||||
|
|
|
@ -386,12 +386,112 @@ bool r600_lower_scratch_addresses(nir_shader *shader)
|
|||
return progress;
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options)
|
||||
{
|
||||
b->cursor = nir_before_instr(instr);
|
||||
|
||||
nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
|
||||
assert(op->intrinsic == nir_intrinsic_load_ubo);
|
||||
|
||||
bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0]));
|
||||
|
||||
nir_ssa_def *offset = op->src[1].ssa;
|
||||
|
||||
/* This is ugly: With const addressing we can actually set a proper fetch target mask,
|
||||
* but for this we need the component encoded, we don't shift and do de decoding in the
|
||||
* backend. Otherwise we shift by four and resolve the component here
|
||||
* (TODO: encode the start component in the intrinsic when the offset base is non-constant
|
||||
* but a multiple of 16 */
|
||||
|
||||
nir_ssa_def *new_offset = offset;
|
||||
if (!const_address)
|
||||
new_offset = nir_ishr(b, offset, nir_imm_int(b, 4));
|
||||
|
||||
nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600);
|
||||
load->num_components = const_address ? op->num_components : 4;
|
||||
load->src[0] = op->src[0];
|
||||
load->src[1] = nir_src_for_ssa(new_offset);
|
||||
nir_intrinsic_set_align(load, nir_intrinsic_align(op), nir_intrinsic_align_offset(op));
|
||||
|
||||
nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL);
|
||||
nir_builder_instr_insert(b, &load->instr);
|
||||
|
||||
/* when four components are loaded or both the offset and the location
|
||||
* are constant, then the backend can deal with it better */
|
||||
if (op->num_components == 4 || const_address)
|
||||
return &load->dest.ssa;
|
||||
|
||||
/* What comes below is a performance disaster when the offset is not constant
|
||||
* because then we have to assume that any component can be the first one and we
|
||||
* have to pick the result manually. */
|
||||
nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset, nir_imm_int(b, 2)),
|
||||
nir_imm_int(b,3));
|
||||
|
||||
const unsigned swz_000[4] = {0, 0, 0, 0};
|
||||
nir_ssa_def *component_select = nir_ieq(b, nir_imm_ivec4(b, 0, 1, 2, 3),
|
||||
nir_swizzle(b, first_comp, swz_000, 4));
|
||||
|
||||
const unsigned szw_0[1] = {0};
|
||||
const unsigned szw_1[1] = {1};
|
||||
const unsigned szw_2[1] = {2};
|
||||
|
||||
if (op->num_components == 1) {
|
||||
const unsigned szw_3[1] = {3};
|
||||
nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_0, 1),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_3, 1));
|
||||
nir_ssa_def *check1 = nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_1, 1),
|
||||
check0);
|
||||
return nir_bcsel(b, nir_swizzle(b, component_select, szw_2, 1),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_2, 1),
|
||||
check1);
|
||||
} else if (op->num_components == 2) {
|
||||
const unsigned szw_01[2] = {0, 1};
|
||||
const unsigned szw_12[2] = {1, 2};
|
||||
const unsigned szw_23[2] = {2, 3};
|
||||
|
||||
nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_01, 2),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_23, 2));
|
||||
return nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_12, 2),
|
||||
check0);
|
||||
} else {
|
||||
const unsigned szw_012[3] = {0, 1, 3};
|
||||
const unsigned szw_123[3] = {1, 2, 3};
|
||||
return nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_012, 3),
|
||||
nir_swizzle(b, &load->dest.ssa, szw_123, 3));
|
||||
}
|
||||
}
|
||||
|
||||
bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
|
||||
return op->intrinsic == nir_intrinsic_load_ubo;
|
||||
}
|
||||
|
||||
|
||||
/* Run the align16 UBO lowering over the whole shader, replacing every
 * load_ubo with a load_ubo_r600 plus (if needed) dynamic component selection.
 * Returns true if any instruction was lowered. */
bool r600_lower_ubo_to_align16(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader, r600_lower_ubo_to_align16_filter,
                                        r600_lower_ubo_to_align16_impl, nullptr);
}
|
||||
|
||||
}
|
||||
|
||||
using r600::r600_nir_lower_int_tg4;
|
||||
using r600::r600_nir_lower_pack_unpack_2x16;
|
||||
using r600::r600_lower_scratch_addresses;
|
||||
using r600::r600_lower_fs_out_to_vector;
|
||||
using r600::r600_lower_ubo_to_align16;
|
||||
|
||||
int
|
||||
r600_glsl_type_size(const struct glsl_type *type, bool is_bindless)
|
||||
|
@ -512,7 +612,10 @@ int r600_shader_from_nir(struct r600_context *rctx,
|
|||
const nir_function *func = reinterpret_cast<const nir_function *>(exec_list_get_head_const(&sel->nir->functions));
|
||||
bool optimize = func->impl->registers.length() == 0 && !has_saturate(func);
|
||||
|
||||
|
||||
if (optimize) {
|
||||
optimize_once(sel->nir);
|
||||
NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16);
|
||||
}
|
||||
/* It seems the output of this optimization is cached somewhere, and
|
||||
* when there are registers, then we can no longer copy propagate, so
|
||||
* skip the optimization then. (There is probably a better way, but yeah)
|
||||
|
|
|
@ -447,7 +447,7 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins
|
|||
case nir_intrinsic_discard:
|
||||
case nir_intrinsic_discard_if:
|
||||
return emit_discard_if(instr);
|
||||
case nir_intrinsic_load_ubo:
|
||||
case nir_intrinsic_load_ubo_r600:
|
||||
return emit_load_ubo(instr);
|
||||
case nir_intrinsic_copy_deref:
|
||||
case nir_intrinsic_load_constant:
|
||||
|
|
Loading…
Reference in New Issue