From b0858c1cc6711168087b6774f3dc02a73b14fed2 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 31 Aug 2017 21:45:30 -0700 Subject: [PATCH] intel/fs: Add a couple of simple helper opcodes Acked-by: Lionel Landwerlin Reviewed-by: Iago Toral Quiroga --- src/intel/compiler/brw_eu_defines.h | 13 +++++++ src/intel/compiler/brw_fs.cpp | 11 ++++++ src/intel/compiler/brw_fs_generator.cpp | 47 +++++++++++++++++++++++++ src/intel/compiler/brw_shader.cpp | 5 +++ 4 files changed, 76 insertions(+) diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 3449c73d771..3c4c538ac17 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -460,6 +460,19 @@ enum opcode { */ SHADER_OPCODE_SHUFFLE, + /* Select between src0 and src1 based on channel enables. + * + * This instruction copies src0 into the enabled channels of the + * destination and copies src1 into the disabled channels. + */ + SHADER_OPCODE_SEL_EXEC, + + /* Take every Nth element in src0 and broadcast it to the group of N + * channels in which it lives in the destination. The offset within the + * cluster is given by src1 and the cluster size is given by src2. + */ + SHADER_OPCODE_CLUSTER_BROADCAST, + SHADER_OPCODE_GET_BUFFER_SIZE, VEC4_OPCODE_MOV_BYTES, diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 47f1f6e9c9f..9f1b8d0b184 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -316,6 +316,15 @@ fs_inst::has_source_and_destination_hazard() const * that one of the instructions will read from a channel corresponding * to an earlier instruction. */ + case SHADER_OPCODE_SEL_EXEC: + /* This is implemented as + * + * mov(16) g4<1>D 0D { align1 WE_all 1H }; + * mov(16) g4<1>D g5<8,8,1>D { align1 1H } + * + * Because the source is only read in the second instruction, the first + * may stomp all over it. 
+ */ return true; default: /* The SIMD16 compressed instruction @@ -5038,6 +5047,8 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case BRW_OPCODE_MAD: case BRW_OPCODE_LRP: case FS_OPCODE_PACK: + case SHADER_OPCODE_SEL_EXEC: + case SHADER_OPCODE_CLUSTER_BROADCAST: return get_fpu_lowered_simd_width(devinfo, inst); case BRW_OPCODE_CMP: { diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 9b8f8ce683e..382548f5c36 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2293,6 +2293,53 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) generate_shuffle(inst, dst, src[0], src[1]); break; + case SHADER_OPCODE_SEL_EXEC: + assert(inst->force_writemask_all); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, src[1]); + brw_set_default_mask_control(p, BRW_MASK_ENABLE); + brw_MOV(p, dst, src[0]); + break; + + case SHADER_OPCODE_CLUSTER_BROADCAST: { + assert(src[0].type == dst.type); + assert(!src[0].negate && !src[0].abs); + assert(src[1].file == BRW_IMMEDIATE_VALUE); + assert(src[1].type == BRW_REGISTER_TYPE_UD); + assert(src[2].file == BRW_IMMEDIATE_VALUE); + assert(src[2].type == BRW_REGISTER_TYPE_UD); + const unsigned component = src[1].ud; + const unsigned cluster_size = src[2].ud; + struct brw_reg strided = stride(suboffset(src[0], component), + cluster_size, cluster_size, 0); + if (type_sz(src[0].type) > 4 && + (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) { + /* IVB has an issue (which we found empirically) where it reads + * two address register components per channel for indirectly + * addressed 64-bit sources. + * + * From the Cherryview PRM Vol 7. "Register Region Restrictions": + * + * "When source or destination datatype is 64b or operation is + * integer DWord multiply, indirect addressing must not be + * used." 
+ * + * To work around both of these, we do two integer MOVs instead of + * one 64-bit MOV. Because no double value should ever cross a + * register boundary, it's safe to use the immediate offset in the + * indirect here to handle adding 4 bytes to the offset and avoid + * the extra ADD to the register file. + */ + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0), + subscript(strided, BRW_REGISTER_TYPE_D, 0)); + brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1), + subscript(strided, BRW_REGISTER_TYPE_D, 1)); + } else { + brw_MOV(p, dst, strided); + } + break; + } + case FS_OPCODE_SET_SAMPLE_ID: generate_set_sample_id(inst, dst, src[0], src[1]); break; diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index b1227e17e2c..e822c100e9f 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -332,6 +332,10 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) return "broadcast"; case SHADER_OPCODE_SHUFFLE: return "shuffle"; + case SHADER_OPCODE_SEL_EXEC: + return "sel_exec"; + case SHADER_OPCODE_CLUSTER_BROADCAST: + return "cluster_broadcast"; case SHADER_OPCODE_GET_BUFFER_SIZE: return "get_buffer_size"; @@ -847,6 +851,7 @@ backend_instruction::can_do_source_mods() const case BRW_OPCODE_FBL: case BRW_OPCODE_SUBB: case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_CLUSTER_BROADCAST: case SHADER_OPCODE_MOV_INDIRECT: return false; default: