intel/fs: Add a couple of simple helper opcodes

Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
2017-08-31 21:45:30 -07:00 · 2017-08-31 21:45:30 -07:00 · b0858c1cc6
parent 57bff0a546
commit b0858c1cc6
4 changed files with 76 additions and 0 deletions
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@ -460,6 +460,19 @@ enum opcode {
    */
   SHADER_OPCODE_SHUFFLE,
   /* Select between src0 and src1 based on channel enables.
    *
    * This instruction copies src0 into the enabled channels of the
    * destination and copies src1 into the disabled channels.
    */
   SHADER_OPCODE_SEL_EXEC,
   /* Take every Nth element in src0 and broadcast it to the group of N
    * channels in which it lives in the destination.  The offset within the
    * cluster is given by src1 and the cluster size is given by src2.
    */
   SHADER_OPCODE_CLUSTER_BROADCAST,
   SHADER_OPCODE_GET_BUFFER_SIZE,
   VEC4_OPCODE_MOV_BYTES,
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -316,6 +316,15 @@ fs_inst::has_source_and_destination_hazard() const
       * that one of the instructions will read from a channel corresponding
       * to an earlier instruction.
       */
   case SHADER_OPCODE_SEL_EXEC:
      /* This is implemented as
       *
       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
       *
       * Because the source is only read in the second instruction, the first
       * may stomp all over it.
       */
      return true;
   default:
      /* The SIMD16 compressed instruction
@ -5038,6 +5047,8 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case FS_OPCODE_PACK:
   case SHADER_OPCODE_SEL_EXEC:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
      return get_fpu_lowered_simd_width(devinfo, inst);
   case BRW_OPCODE_CMP: {
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@ -2293,6 +2293,53 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
         generate_shuffle(inst, dst, src[0], src[1]);
         break;
      case SHADER_OPCODE_SEL_EXEC:
         assert(inst->force_writemask_all);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, dst, src[1]);
         brw_set_default_mask_control(p, BRW_MASK_ENABLE);
         brw_MOV(p, dst, src[0]);
         break;
      case SHADER_OPCODE_CLUSTER_BROADCAST: {
         assert(src[0].type == dst.type);
         assert(!src[0].negate && !src[0].abs);
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         assert(src[1].type == BRW_REGISTER_TYPE_UD);
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         assert(src[2].type == BRW_REGISTER_TYPE_UD);
         const unsigned component = src[1].ud;
         const unsigned cluster_size = src[2].ud;
         struct brw_reg strided = stride(suboffset(src[0], component),
                                         cluster_size, cluster_size, 0);
         if (type_sz(src[0].type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* IVB has an issue (which we found empirically) where it reads
             * two address register components per channel for indirectly
             * addressed 64-bit sources.
             *
             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of these, we do two integer MOVs instead of
             * one 64-bit MOV.  Because no double value should ever cross a
             * register boundary, it's safe to use the immediate offset in the
             * indirect here to handle adding 4 bytes to the offset and avoid
             * the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       subscript(strided, BRW_REGISTER_TYPE_D, 0));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       subscript(strided, BRW_REGISTER_TYPE_D, 1));
         } else {
            brw_MOV(p, dst, strided);
         }
         break;
      }
      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@ -332,6 +332,10 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
      return "broadcast";
   case SHADER_OPCODE_SHUFFLE:
      return "shuffle";
   case SHADER_OPCODE_SEL_EXEC:
      return "sel_exec";
   case SHADER_OPCODE_CLUSTER_BROADCAST:
      return "cluster_broadcast";
   case SHADER_OPCODE_GET_BUFFER_SIZE:
      return "get_buffer_size";
@ -847,6 +851,7 @@ backend_instruction::can_do_source_mods() const
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_SUBB:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
      return false;
   default: