intel/fs: Emit better code for u2u of extract

Emitting the instructions one by one results in two MOV instructions that won't be propagated. By handling both instructions at once, a single MOV is emitted. For example, on Ice Lake this helps dEQP-VK.spirv_assembly.type.vec3.i8.bitwise_xor_frag: SIMD8 shader: 49 instructions. 1 loops. 4044 cycles. 0:0 spills:fills, 5 sends SIMD8 shader: 41 instructions. 1 loops. 3804 cycles. 0:0 spills:fills, 5 sends Without "intel/fs: Allow copy propagation between MOVs of mixed sizes," the improvement is still 8 instructions, but there are more instructions to begin with: SIMD8 shader: 52 instructions. 1 loops. 4164 cycles. 0:0 spills:fills, 5 sends SIMD8 shader: 44 instructions. 1 loops. 3944 cycles. 0:0 spills:fills, 5 sends Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9025>
2021-01-26 19:52:50 -08:00 · 2021-01-26 19:52:50 -08:00 · 7c83aa0518
parent e3f502e007
commit 7c83aa0518
1 changed files with 38 additions and 4 deletions
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@ -1102,13 +1102,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
   case nir_op_f2i32:
   case nir_op_f2u32:
   case nir_op_i2f16:
   case nir_op_i2i16:
   case nir_op_u2f16:
   case nir_op_u2u16:
   case nir_op_f2i16:
   case nir_op_f2u16:
   case nir_op_i2i8:
   case nir_op_u2u8:
   case nir_op_f2i8:
   case nir_op_f2u8:
      if (result.type == BRW_REGISTER_TYPE_B ||
@ -1124,6 +1120,44 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
      inst = bld.MOV(result, op[0]);
      break;
   case nir_op_i2i8:
   case nir_op_u2u8:
      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
      FALLTHROUGH;
   case nir_op_i2i16:
   case nir_op_u2u16: {
      /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
       * Emitting the instructions one by one results in two MOV instructions
       * that won't be propagated.  By handling both instructions here, a
       * single MOV is emitted.
       */
      nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
      if (extract_instr != NULL) {
         if (extract_instr->op == nir_op_extract_u8 ||
             extract_instr->op == nir_op_extract_i8) {
            prepare_alu_destination_and_sources(bld, extract_instr, op, false);
            const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
            const brw_reg_type type =
               brw_int_type(1, extract_instr->op == nir_op_extract_i8);
            op[0] = subscript(op[0], type, byte);
         } else if (extract_instr->op == nir_op_extract_u16 ||
                    extract_instr->op == nir_op_extract_i16) {
            prepare_alu_destination_and_sources(bld, extract_instr, op, false);
            const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
            const brw_reg_type type =
               brw_int_type(2, extract_instr->op == nir_op_extract_i16);
            op[0] = subscript(op[0], type, word);
         }
      }
      inst = bld.MOV(result, op[0]);
      break;
   }
   case nir_op_fsat:
      inst = bld.MOV(result, op[0]);
      inst->saturate = true;