intel/fs: Rework fence handling in brw_fs_nir.cpp

Start off by making everything look like LSC, where we have three types of
fences: TGM, UGM, and SLM.  Then, emit the actual code in a generation-
aware way.  There are three HW generation cases we care about:
XeHP+ (LSC), ICL-TGL, and IVB-SKL.  Even though it looks like there's a
lot to deduplicate, it only increases the number of ubld.emit() calls
from 5 to 7 and entirely gets rid of the SFID juggling and other
weirdness we've introduced along the way to make those cases "general".
While we're here, we also clean up the code for stalling after fences and
clearly document every case where we insert a stall.
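
In rough outline, the new structure looks like this (a simplified
sketch drawn from the diff below, not the exact code; the real
emit_fence() helper, the commit-enable rules, and the URB handling are
in the diff):

    if (devinfo->has_lsc) {
       /* XeHP+: one LSC fence per memory kind that needs one */
       if (ugm_fence)
          fence_regs[fence_regs_count++] =
             emit_fence(ubld, opcode, GFX12_SFID_UGM,
                        true /* commit_enable */, 0 /* bti */);
       if (tgm_fence)
          fence_regs[fence_regs_count++] =
             emit_fence(ubld, opcode, GFX12_SFID_TGM,
                        true /* commit_enable */, 0 /* bti */);
       if (slm_fence)
          fence_regs[fence_regs_count++] =
             emit_fence(ubld, opcode, GFX12_SFID_SLM,
                        true /* commit_enable */, 0 /* bti */);
    } else if (devinfo->ver >= 11) {
       /* ICL-TGL: data cache fences; the BTI selects SLM */
    } else {
       /* IVB-SKL: one data cache fence covers everything; IVB/BYT
        * also need a render cache fence for typed (image) access.
        */
    }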

There are only three known functional changes from this commit:

 1. We now avoid the render cache fence on IVB if we don't need image
    barriers.

 2. On ICL+, we no longer unconditionally stall on barriers.  We still
    stall if we have more than one fence, to help tie them together,
    but independent barriers are independent.  Barrier instructions
    will still operate in write-commit mode and still be scheduling
    barriers but won't necessarily stall.  (See the sketch after this
    list.)

 3. We now assert-fail for URB fences on LSC platforms.  We'll be adding
    in the new URB fence message for those platforms in a follow-on
    commit.
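
As a condensed restatement of the stall rule in (2), taken from the
new code later in the diff (comments paraphrased), the stall is now
only emitted here:

    /* Stall when the fence must complete before EOT
     * (end_invocation_interlock), when there are multiple fences to
     * tie together, or when there are no fences at all and we still
     * need a scheduling barrier; a single independent fence does not
     * stall.
     */
    if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
        fence_regs_count != 1) {
       ubld.exec_all().group(1, 0).emit(
          FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
          fence_regs, fence_regs_count);
    }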

It is a big enough refactor, however, that other minor changes may be
present.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
Author: Jason Ekstrand, 2021-09-15 12:58:04 -05:00 (committed by Marge Bot)
parent 690cc3bb80
commit f726246297
1 changed file with 108 additions and 88 deletions

@@ -4219,6 +4219,21 @@ increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v)
}
}
static fs_reg
emit_fence(const fs_builder &bld, enum opcode opcode,
uint8_t sfid, bool commit_enable, uint8_t bti)
{
assert(opcode == SHADER_OPCODE_INTERLOCK ||
opcode == SHADER_OPCODE_MEMORY_FENCE);
fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
brw_imm_ud(commit_enable),
brw_imm_ud(bti));
fence->sfid = sfid;
return dst;
}
void
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
{
@@ -4411,7 +4426,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
case nir_intrinsic_memory_barrier:
case nir_intrinsic_begin_invocation_interlock:
case nir_intrinsic_end_invocation_interlock: {
bool l3_fence, slm_fence, tgm_fence = false;
bool ugm_fence, slm_fence, tgm_fence, urb_fence;
const enum opcode opcode =
instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
@@ -4419,14 +4434,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
switch (instr->intrinsic) {
case nir_intrinsic_scoped_barrier: {
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
l3_fence = modes & (nir_var_shader_out |
nir_var_mem_ssbo |
nir_var_mem_global);
ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
slm_fence = modes & nir_var_mem_shared;
/* NIR currently doesn't have an image mode */
if (devinfo->has_lsc)
tgm_fence = modes & nir_var_mem_ssbo;
tgm_fence = modes & nir_var_mem_ssbo;
urb_fence = modes & nir_var_shader_out;
break;
}
@@ -4448,16 +4459,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
* Handling them here will allow the logic for IVB render cache (see
* below) to be reused.
*/
l3_fence = true;
slm_fence = false;
assert(stage == MESA_SHADER_FRAGMENT);
ugm_fence = tgm_fence = true;
slm_fence = urb_fence = false;
break;
default:
l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
ugm_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared &&
instr->intrinsic != nir_intrinsic_memory_barrier_image;
slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier_shared;
tgm_fence = instr->intrinsic == nir_intrinsic_memory_barrier_image;
tgm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier_image;
urb_fence = instr->intrinsic == nir_intrinsic_memory_barrier;
break;
}
@@ -4474,95 +4490,99 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
/* Prior to Gfx11, there's only L3 fence, so emit that instead. */
if (slm_fence && devinfo->ver < 11) {
slm_fence = false;
l3_fence = true;
}
/* IVB does typed surface access through the render cache, so we need
* to flush it too.
*/
const bool needs_render_fence =
devinfo->verx10 == 70;
/* Be conservative in Gfx11+ and always stall in a fence. Since there
* are two different fences, and shader might want to synchronize
* between them.
*
* TODO: Use scope and visibility information for the barriers from NIR
* to make a better decision on whether we need to stall.
*/
const bool stall = devinfo->ver >= 11 || needs_render_fence ||
instr->intrinsic == nir_intrinsic_end_invocation_interlock;
const bool commit_enable = stall ||
devinfo->ver >= 10; /* HSD ES # 1404612949 */
if (stage != MESA_SHADER_TESS_CTRL)
urb_fence = false;
unsigned fence_regs_count = 0;
fs_reg fence_regs[3] = {};
const fs_builder ubld = bld.group(8, 0);
if (l3_fence) {
fs_inst *fence =
ubld.emit(opcode,
ubld.vgrf(BRW_REGISTER_TYPE_UD),
brw_vec8_grf(0, 0),
brw_imm_ud(commit_enable),
brw_imm_ud(0 /* BTI; ignored for LSC */));
fence->sfid = devinfo->has_lsc ?
GFX12_SFID_UGM :
GFX7_SFID_DATAPORT_DATA_CACHE;
fence_regs[fence_regs_count++] = fence->dst;
if (needs_render_fence) {
fs_inst *render_fence =
ubld.emit(opcode,
ubld.vgrf(BRW_REGISTER_TYPE_UD),
brw_vec8_grf(0, 0),
brw_imm_ud(commit_enable),
brw_imm_ud(/* bti */ 0));
render_fence->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
fence_regs[fence_regs_count++] = render_fence->dst;
if (devinfo->has_lsc) {
assert(devinfo->verx10 >= 125);
if (ugm_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX12_SFID_UGM,
true /* commit_enable */,
0 /* bti; ignored for LSC */);
}
/* Translate l3_fence into untyped and typed fence on XeHP */
if (devinfo->has_lsc && tgm_fence) {
fs_inst *fence =
ubld.emit(opcode,
ubld.vgrf(BRW_REGISTER_TYPE_UD),
brw_vec8_grf(0, 0),
brw_imm_ud(commit_enable),
brw_imm_ud(/* ignored */0));
if (tgm_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX12_SFID_TGM,
true /* commit_enable */,
0 /* bti; ignored for LSC */);
}
fence->sfid = GFX12_SFID_TGM;
fence_regs[fence_regs_count++] = fence->dst;
if (slm_fence) {
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX12_SFID_SLM,
true /* commit_enable */,
0 /* BTI; ignored for LSC */);
}
if (urb_fence) {
unreachable("TODO: Emit a URB barrier message");
}
} else if (devinfo->ver >= 11) {
if (tgm_fence || ugm_fence || urb_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
true /* commit_enable HSD ES # 1404612949 */,
0 /* BTI = 0 means data cache */);
}
if (slm_fence) {
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
true /* commit_enable HSD ES # 1404612949 */,
GFX7_BTI_SLM);
}
} else {
/* Prior to Icelake, they're all lumped into a single cache except on
* Ivy Bridge and Bay Trail where typed messages actually go through
* the render cache. There, we need both fences because we may
* access storage images as either typed or untyped.
*/
const bool render_fence = tgm_fence && devinfo->verx10 == 70;
const bool commit_enable = render_fence ||
instr->intrinsic == nir_intrinsic_end_invocation_interlock;
if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
commit_enable, 0 /* BTI */);
}
if (render_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE,
commit_enable, /* bti */ 0);
}
}
if (slm_fence) {
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
fs_inst *fence =
ubld.emit(opcode,
ubld.vgrf(BRW_REGISTER_TYPE_UD),
brw_vec8_grf(0, 0),
brw_imm_ud(commit_enable),
brw_imm_ud(GFX7_BTI_SLM /* ignored for LSC */));
if (devinfo->has_lsc)
fence->sfid = GFX12_SFID_SLM;
else
fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
fence_regs[fence_regs_count++] = fence->dst;
}
assert(fence_regs_count <= 3);
if (stall || fence_regs_count == 0) {
/* There are three cases where we want to insert a stall:
*
* 1. If we're a nir_intrinsic_end_invocation_interlock. This is
* required to ensure that the shader EOT doesn't happen until
* after the fence returns. Otherwise, we might end up with the
* next shader invocation for that pixel not respecting our fence
* because it may happen on a different HW thread.
*
* 2. If we have multiple fences. This is required to ensure that
* they all complete and nothing gets weirdly out-of-order.
*
* 3. If we have no fences. In this case, we need at least a
* scheduling barrier to keep the compiler from moving things
* around in an invalid way.
*/
if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
fence_regs_count != 1) {
ubld.exec_all().group(1, 0).emit(
FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
fence_regs, fence_regs_count);