diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index dd504cc25fb..c4dba558ecd 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -1117,7 +1117,8 @@ brw_memory_fence(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src, enum opcode send_op, - bool stall); + bool stall, + unsigned bti); void brw_pixel_interpolator_query(struct brw_codegen *p, diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index e8ca7ff8b98..1d4c0b83c87 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -452,6 +452,17 @@ enum opcode { SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + /** + * Memory fence messages. + * + * Source 0: Must be register g0, used as header. + * Source 1: Immediate bool to indicate whether or not we need to stall + * until memory transactions prior to the fence are completed. + * Source 2: Immediate byte indicating which memory to fence. Zero means + * global memory; GEN7_BTI_SLM means SLM (for Gen11+ only). + * + * Vec4 backend only uses Source 0. 
+ */ SHADER_OPCODE_MEMORY_FENCE, SHADER_OPCODE_GEN4_SCRATCH_READ, diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 8e7263ce447..60761e83c62 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -3012,7 +3012,8 @@ static void brw_set_memory_fence_message(struct brw_codegen *p, struct brw_inst *insn, enum brw_message_target sfid, - bool commit_enable) + bool commit_enable, + unsigned bti) { const struct gen_device_info *devinfo = p->devinfo; @@ -3034,6 +3035,9 @@ brw_set_memory_fence_message(struct brw_codegen *p, if (commit_enable) brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5); + + assert(devinfo->gen >= 11 || bti == 0); + brw_inst_set_binding_table_index(devinfo, insn, bti); } void @@ -3041,7 +3045,8 @@ brw_memory_fence(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src, enum opcode send_op, - bool stall) + bool stall, + unsigned bti) { const struct gen_device_info *devinfo = p->devinfo; const bool commit_enable = stall || @@ -3062,7 +3067,7 @@ brw_memory_fence(struct brw_codegen *p, brw_set_dest(p, insn, dst); brw_set_src0(p, insn, src); brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE, - commit_enable); + commit_enable, bti); if (devinfo->gen == 7 && !devinfo->is_haswell) { /* IVB does typed surface access through the render cache, so we need to @@ -3073,7 +3078,7 @@ brw_memory_fence(struct brw_codegen *p, brw_set_dest(p, insn, offset(dst, 1)); brw_set_src0(p, insn, src); brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE, - commit_enable); + commit_enable, bti); /* Now write the response of the second message into the response of the * first to trigger a pipeline stall -- This way future render and data diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index be6a00e8476..88de5189064 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2069,13 
+2069,14 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) case SHADER_OPCODE_MEMORY_FENCE: assert(src[1].file == BRW_IMMEDIATE_VALUE); - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud); + assert(src[2].file == BRW_IMMEDIATE_VALUE); + brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, src[2].ud); break; case SHADER_OPCODE_INTERLOCK: assert(devinfo->gen >= 9); /* The interlock is basically a memory fence issued via sendc */ - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false); + brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0); break; case SHADER_OPCODE_FIND_LIVE_CHANNEL: { diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 00ce6af23c7..aeebaaeb62c 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4416,11 +4416,47 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier: { + bool l3_fence, slm_fence; + if (devinfo->gen >= 11) { + l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared; + slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || + instr->intrinsic == nir_intrinsic_memory_barrier || + instr->intrinsic == nir_intrinsic_memory_barrier_shared; + } else { + /* Prior to gen11, we only have one kind of fence. */ + l3_fence = true; + slm_fence = false; + } + + /* Be conservative in Gen11+ and always stall in a fence, since there + * are two different fences and the shader might want to synchronize + * between them. + * + * TODO: Improve NIR so that scope and visibility information for the + * barriers is available here to make a better decision. + * + * TODO: When emitting more than one fence, it might help to emit all + * the fences first and then generate the stall moves. 
+ */ + const bool stall = devinfo->gen >= 11; + const fs_builder ubld = bld.group(8, 0); const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); - ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, - brw_vec8_grf(0, 0), brw_imm_ud(0)) - ->size_written = 2 * REG_SIZE; + + if (l3_fence) { + ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, + brw_vec8_grf(0, 0), brw_imm_ud(stall), + /* bti */ brw_imm_ud(0)) + ->size_written = 2 * REG_SIZE; + } + + if (slm_fence) { + ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, + brw_vec8_grf(0, 0), brw_imm_ud(stall), + brw_imm_ud(GEN7_BTI_SLM)) + ->size_written = 2 * REG_SIZE; + } + break; } @@ -5238,7 +5274,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr const fs_builder ubld = bld.group(8, 0); const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, - brw_vec8_grf(0, 0), brw_imm_ud(1)) + brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0)) ->size_written = 2 * REG_SIZE; break; } diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp index 8f9e4f16677..d85e3c43241 100644 --- a/src/intel/compiler/brw_vec4_generator.cpp +++ b/src/intel/compiler/brw_vec4_generator.cpp @@ -1886,7 +1886,7 @@ generate_code(struct brw_codegen *p, break; case SHADER_OPCODE_MEMORY_FENCE: - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false); + brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0); break; case SHADER_OPCODE_FIND_LIVE_CHANNEL: {