diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 3fccd180cfb..974efcee8a3 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -574,6 +574,7 @@ enum opcode {
    VEC4_OPCODE_SET_LOW_32BIT,
    VEC4_OPCODE_SET_HIGH_32BIT,
    VEC4_OPCODE_MOV_FOR_SCRATCH,
+   VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
 
    FS_OPCODE_DDX_COARSE,
    FS_OPCODE_DDX_FINE,
diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp
index fa0aca6344b..a9f93609561 100644
--- a/src/intel/compiler/brw_ir_performance.cpp
+++ b/src/intel/compiler/brw_ir_performance.cpp
@@ -344,6 +344,7 @@ namespace {
       case VEC4_OPCODE_PICK_HIGH_32BIT:
       case VEC4_OPCODE_SET_LOW_32BIT:
       case VEC4_OPCODE_SET_HIGH_32BIT:
+      case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
       case GS_OPCODE_SET_DWORD_2:
       case GS_OPCODE_SET_WRITE_OFFSET:
       case GS_OPCODE_SET_VERTEX_COUNT:
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index bc50595137c..4aeff34474b 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -423,6 +423,8 @@ brw_instruction_name(const struct intel_device_info *devinfo, enum opcode op)
       return "set_high_32bit";
    case VEC4_OPCODE_MOV_FOR_SCRATCH:
       return "mov_for_scratch";
+   case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
+      return "zero_oob_push_regs";
 
    case FS_OPCODE_DDX_COARSE:
       return "ddx_coarse";
@@ -1136,6 +1138,7 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
    case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
    case RT_OPCODE_TRACE_RAY_LOGICAL:
+   case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
       return true;
    default:
       return eot;
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 062167e0436..4e215c88b0b 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -2698,6 +2698,23 @@ vec4_visitor::run()
 
    setup_push_ranges();
 
+   if (prog_data->base.zero_push_reg) {
+      /* push_reg_mask_param is in uint32 params and UNIFORM is in vec4s */
+      const unsigned mask_param = stage_prog_data->push_reg_mask_param;
+      src_reg mask = src_reg(dst_reg(UNIFORM, mask_param / 4));
+      assert(mask_param % 2 == 0); /* Should be 64-bit-aligned */
+      /* Replicate the two consecutive dwords holding the 64-bit mask into
+       * both halves of the vec4.
+       */
+      mask.swizzle = BRW_SWIZZLE4((mask_param + 0) % 4,
+                                  (mask_param + 1) % 4,
+                                  (mask_param + 0) % 4,
+                                  (mask_param + 1) % 4);
+
+      emit(VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
+           dst_reg(VGRF, alloc.allocate(3)), mask);
+   }
+
    emit_prolog();
 
    emit_nir_code();
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 1039bf2927c..1379c99518c 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1464,6 +1464,86 @@ generate_mov_indirect(struct brw_codegen *p,
    }
 }
 
+/**
+ * Emit code that masks the pushed registers flagged in
+ * prog_data->zero_push_reg.
+ *
+ * For every bit i set in zero_push_reg, GRF (dispatch_grf_start_reg + i) is
+ * ANDed in place with a dword derived from bit i of the 64-bit mask read
+ * through bit_mask_in.  That dword is a word arithmetically shifted right by
+ * 15 after the mask bit was moved to its top bit, so presumably a set mask
+ * bit preserves the register and a clear one zeroes it.
+ *
+ * \param p            codegen context the instructions are emitted into
+ * \param prog_data    supplies zero_push_reg and dispatch_grf_start_reg
+ * \param scratch      temporary GRF storage; the word-sized masks are built
+ *                     at its start and the dword-sized masks one register
+ *                     (REG_SIZE bytes) later
+ * \param bit_mask_in  GRF source whose swizzle channels 0 and 1 must select
+ *                     consecutive components; channel 0 locates the mask
+ */
+static void
+generate_zero_oob_push_regs(struct brw_codegen *p,
+                            struct brw_stage_prog_data *prog_data,
+                            struct brw_reg scratch,
+                            struct brw_reg bit_mask_in)
+{
+   const uint64_t want_zero = prog_data->zero_push_reg;
+   assert(want_zero);
+
+   assert(bit_mask_in.file == BRW_GENERAL_REGISTER_FILE);
+   assert(BRW_GET_SWZ(bit_mask_in.swizzle, 1) ==
+          BRW_GET_SWZ(bit_mask_in.swizzle, 0) + 1);
+   /* Fold the swizzle into a sub-register offset (4 bytes per component)
+    * and read the mask one 16-bit word at a time.
+    */
+   bit_mask_in.subnr += BRW_GET_SWZ(bit_mask_in.swizzle, 0) * 4;
+   bit_mask_in.type = BRW_REGISTER_TYPE_W;
+
+   /* Scratch should be 3 registers in the GRF */
+   assert(scratch.file == BRW_GENERAL_REGISTER_FILE);
+   scratch = vec8(scratch);
+   struct brw_reg mask_w16 = retype(scratch, BRW_REGISTER_TYPE_W);
+   struct brw_reg mask_d16 = retype(byte_offset(scratch, REG_SIZE),
+                                    BRW_REGISTER_TYPE_D);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   for (unsigned i = 0; i < 64; i++) {
+      /* At the start of each group of 16 registers that needs any masking,
+       * rebuild the 16 per-register dword masks from the matching word of
+       * the input mask.
+       */
+      if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
+         brw_SHL(p, suboffset(mask_w16, 8),
+                    vec1(byte_offset(bit_mask_in, i / 8)),
+                    brw_imm_v(0x01234567));
+         brw_SHL(p, mask_w16, suboffset(mask_w16, 8), brw_imm_w(8));
+
+         brw_set_default_exec_size(p, BRW_EXECUTE_16);
+         brw_ASR(p, mask_d16, mask_w16, brw_imm_w(15));
+      }
+
+      if (want_zero & BITFIELD64_BIT(i)) {
+         unsigned push_start = prog_data->dispatch_grf_start_reg;
+         struct brw_reg push_reg =
+            retype(brw_vec8_grf(push_start + i, 0), BRW_REGISTER_TYPE_D);
+
+         /* mask_d16 only ever holds the 16 dwords of the current group, so
+          * index it relative to the group rather than with the absolute
+          * register number.
+          */
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
+         brw_AND(p, push_reg, push_reg, vec1(suboffset(mask_d16, i % 16)));
+      }
+   }
+
+   brw_pop_insn_state(p);
+}
+
 static void
 generate_code(struct brw_codegen *p,
               const struct brw_compiler *compiler,
@@ -2065,6 +2145,10 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
+         generate_zero_oob_push_regs(p, &prog_data->base, dst, src[0]);
+         break;
+
       case TCS_OPCODE_URB_WRITE:
          generate_tcs_urb_write(p, inst, src[0]);
          send_count++;