From 3f063334fc30df17015253c9308b89f41cddc9ed Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Thu, 18 Feb 2021 23:26:57 -0800 Subject: [PATCH] intel/fs: Represent SWSB in-order dependency addresses as vectors. This extends the current ordered_address instruction counter to a vector with one component per asynchronous ALU pipeline, allowing us to track the last instruction that accessed a register separately for each ALU pipeline of the XeHP EU, making it straightforward to infer the right cross-pipeline synchronization annotations. Reviewed-by: Lionel Landwerlin v2: Make unit tests happy (with ubsan as run by GitLab automation). Part-of: --- src/intel/compiler/brw_fs_scoreboard.cpp | 118 +++++++++++++++++------ 1 file changed, 90 insertions(+), 28 deletions(-) diff --git a/src/intel/compiler/brw_fs_scoreboard.cpp b/src/intel/compiler/brw_fs_scoreboard.cpp index 3b1f6deac60..f3a64261cf5 100644 --- a/src/intel/compiler/brw_fs_scoreboard.cpp +++ b/src/intel/compiler/brw_fs_scoreboard.cpp @@ -130,13 +130,21 @@ namespace { } /** - * Number of in-order hardware instructions contained in this IR - * instruction. This determines the increment applied to the RegDist - * counter calculated for any ordered dependency that crosses this + * Index of the \p p pipeline counter in the ordered_address vector defined + * below. + */ +#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \ + (abort(), ~0u)) + + /** + * Number of in-order hardware instructions for pipeline index \p contained + * in this IR instruction. This determines the increment applied to the + * RegDist counter calculated for any ordered dependency that crosses this * instruction. */ unsigned - ordered_unit(const fs_inst *inst) + ordered_unit(const struct gen_device_info *devinfo, const fs_inst *inst, + unsigned p) { switch (inst->opcode) { case BRW_OPCODE_SYNC: @@ -156,7 +164,11 @@ namespace { * (again) don't use virtual instructions if you want optimal * scheduling. */ - return is_unordered(inst) ? 0 : 1; + if (!is_unordered(inst) && (p == IDX(inferred_exec_pipe(devinfo, inst)) || + p == IDX(TGL_PIPE_ALL))) + return 1; + else + return 0; } } @@ -164,8 +176,36 @@ namespace { * Type for an instruction counter that increments for in-order * instructions only, arbitrarily denoted 'jp' throughout this lowering * pass in order to distinguish it from the regular instruction counter. + * This is represented as a vector with an independent counter for each + * asynchronous ALU pipeline in the EU. */ - typedef int ordered_address; + struct ordered_address { + /** + * Construct the ordered address of a dependency known to execute on a + * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL + * is provided), in which case the vector counter will be initialized + * with all components equal to INT_MIN (always satisfied) except for + * component IDX(p). + */ + ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) { + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) + jp[q] = (p == TGL_PIPE_NONE || (IDX(p) != q && p != TGL_PIPE_ALL) ? + INT_MIN : jp0); + } + + int jp[IDX(TGL_PIPE_ALL)]; + + friend bool + operator==(const ordered_address &jp0, const ordered_address &jp1) + { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) { + if (jp0.jp[p] != jp1.jp[p]) + return false; + } + + return true; + } + }; /** * Return the number of instructions in the program. @@ -184,12 +224,13 @@ namespace { ordered_inst_addresses(const fs_visitor *shader) { ordered_address *jps = new ordered_address[num_instructions(shader)]; - ordered_address jp = 0; + ordered_address jp(TGL_PIPE_ALL, 0); unsigned ip = 0; foreach_block_and_inst(block, fs_inst, inst, shader->cfg) { jps[ip] = jp; - jp += ordered_unit(inst); + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + jp.jp[p] += ordered_unit(shader->devinfo, inst, p); ip++; } @@ -347,7 +388,7 @@ namespace { /** * No dependency information. */ - dependency() : ordered(TGL_REGDIST_NULL), jp(INT_MIN), + dependency() : ordered(TGL_REGDIST_NULL), jp(), unordered(TGL_SBID_NULL), id(0), exec_all(false) {} @@ -355,7 +396,8 @@ namespace { * Construct a dependency on the in-order instruction with the provided * ordered_address instruction counter. */ - dependency(tgl_regdist_mode mode, ordered_address jp, bool exec_all) : + dependency(tgl_regdist_mode mode, const ordered_address &jp, + bool exec_all) : ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0), exec_all(exec_all) {} @@ -364,7 +406,7 @@ namespace { * specified synchronization token. */ dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) : - ordered(TGL_REGDIST_NULL), jp(INT_MIN), unordered(mode), id(id), + ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id), exec_all(exec_all) {} /** @@ -432,7 +474,8 @@ namespace { } }; - const dependency dependency::done = dependency(TGL_REGDIST_SRC, INT_MIN, false); + const dependency dependency::done = + dependency(TGL_REGDIST_SRC, ordered_address(), false); /** * Return whether \p dep contains any dependency information. @@ -458,7 +501,8 @@ namespace { if (dep0.ordered || dep1.ordered) { dep.ordered = dep0.ordered | dep1.ordered; - dep.jp = MAX2(dep0.jp, dep1.jp); + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]); } if (dep0.unordered || dep1.unordered) { @@ -492,10 +536,14 @@ namespace { * the end of the origin block. */ dependency - transport(dependency dep, int delta) + transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)]) { - if (dep.ordered && dep.jp > INT_MIN) - dep.jp += delta; + if (dep.ordered) { + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) { + if (dep.jp.jp[p] > INT_MIN) + dep.jp.jp[p] += delta[p]; + } + } return dep; } @@ -599,7 +647,7 @@ namespace { * object. \sa transport(). */ friend scoreboard - transport(const scoreboard &sb0, int delta) + transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)]) { scoreboard sb; @@ -736,7 +784,9 @@ namespace { continue; if (dep.ordered && deps[i].ordered) { - deps[i].jp = MAX2(deps[i].jp, dep.jp); + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]); + deps[i].ordered |= dep.ordered; deps[i].exec_all |= dep.exec_all; dep.ordered = TGL_REGDIST_NULL; @@ -770,11 +820,13 @@ namespace { for (unsigned i = 0; i < deps.size(); i++) { if (deps[i].ordered && exec_all >= deps[i].exec_all) { - const unsigned dist = jp - deps[i].jp; - const unsigned max_dist = 10; - assert(jp > deps[i].jp); - if (dist <= max_dist) - min_dist = MIN3(min_dist, dist, 7); + for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) { + const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]); + const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10); + assert(jp.jp[q] > deps[i].jp.jp[q]); + if (dist <= max_dist) + min_dist = MIN3(min_dist, dist, 7); + } } } @@ -861,6 +913,10 @@ namespace { const fs_inst *inst, unsigned ip, scoreboard &sb) { const bool exec_all = inst->force_writemask_all; + const struct gen_device_info *devinfo = shader->devinfo; + const tgl_pipe p = inferred_exec_pipe(devinfo, inst); + const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) : + ordered_address(); /* Track any source registers that may be fetched asynchronously by this * instruction, otherwise clear the dependency in order to avoid @@ -870,7 +926,8 @@ namespace { const dependency rd_dep = (inst->is_payload(i) || inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) : - ordered_unit(inst) ? dependency(TGL_REGDIST_SRC, jps[ip], exec_all) : + ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ? + dependency(TGL_REGDIST_SRC, jp, exec_all) : dependency::done; for (unsigned j = 0; j < regs_read(inst, i); j++) @@ -887,7 +944,8 @@ namespace { /* Track any destination registers of this instruction. */ const dependency wr_dep = is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) : - ordered_unit(inst) ? dependency(TGL_REGDIST_DST, jps[ip], exec_all) : + ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ? + dependency(TGL_REGDIST_DST, jp, exec_all) : dependency(); if (is_valid(wr_dep) && inst->dst.file != BAD_FILE && @@ -942,9 +1000,13 @@ namespace { foreach_list_typed(bblock_link, child_link, link, &block->children) { scoreboard &in_sb = in_sbs[child_link->block->num]; - const int delta = - jps[child_link->block->start_ip] - jps[block->end_ip] - - ordered_unit(static_cast(block->end())); + int delta[IDX(TGL_PIPE_ALL)]; + + for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) + delta[p] = jps[child_link->block->start_ip].jp[p] + - jps[block->end_ip].jp[p] + - ordered_unit(shader->devinfo, + static_cast(block->end()), p); in_sb = merge(eq, in_sb, transport(sb, delta)); }