intel/fs: Lower URB messages to SEND

Before rebasing on top of Ken's split-SEND optimization (see !17018),
this commit just caused some scheduling changes in various tessellation
and geometry shaders.  These changes were caused by the addition of real
latency information for the URB messages.

With the addition of the split-SEND optimization, the changes
are... staggering.  All of the shaders helped for spills and fills are
vertex shaders from Batman Arkham Origins.  What surprises me is that
these shaders account for such a high percentage of the spills and fills
in fossil-db.  85%?!?

v2: Use FIXED_GRF instead of BRW_GENERAL_REGISTER_FILE in an assertion.
Suggested by Ken.

shader-db results: Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20013625 -> 19954020 (-0.30%)
instructions in affected programs: 4007157 -> 3947552 (-1.49%)
helped: 31161
HURT: 0
helped stats (abs) min: 1 max: 400 x̄: 1.91 x̃: 2
helped stats (rel) min: 0.08% max: 59.70% x̄: 2.20% x̃: 1.83%
95% mean confidence interval for instructions value: -1.97 -1.86
95% mean confidence interval for instructions %-change: -2.22% -2.18%
Instructions are helped.

total cycles in shared programs: 859337569 -> 858636788 (-0.08%)
cycles in affected programs: 74168298 -> 73467517 (-0.94%)
helped: 13812
HURT: 16846
helped stats (abs) min: 1 max: 291078 x̄: 82.83 x̃: 4
helped stats (rel) min: <.01% max: 37.09% x̄: 3.47% x̃: 2.02%
HURT stats (abs)   min: 1 max: 1543 x̄: 26.31 x̃: 14
HURT stats (rel)   min: <.01% max: 77.97% x̄: 4.11% x̃: 2.58%
95% mean confidence interval for cycles value: -55.10 9.39
95% mean confidence interval for cycles %-change: 0.62% 0.77%
Inconclusive result (value mean confidence interval includes 0).

Broadwell
total cycles in shared programs: 904844939 -> 904832320 (<.01%)
cycles in affected programs: 525360 -> 512741 (-2.40%)
helped: 215
HURT: 4
helped stats (abs) min: 4 max: 1018 x̄: 60.16 x̃: 39
helped stats (rel) min: 0.14% max: 15.85% x̄: 2.16% x̃: 2.04%
HURT stats (abs)   min: 79 max: 79 x̄: 79.00 x̃: 79
HURT stats (rel)   min: 1.31% max: 1.57% x̄: 1.43% x̃: 1.43%
95% mean confidence interval for cycles value: -75.02 -40.22
95% mean confidence interval for cycles %-change: -2.37% -1.81%
Cycles are helped.

No shader-db changes on any older Intel platforms.

fossil-db results: Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 142622800 -> 141461114 (-0.8%)
Instructions helped: 197186

Cycles in all programs: 9101223846 -> 9099440025 (-0.0%)
Cycles helped: 37963
Cycles hurt: 151233

Spills in all programs: 98829 -> 13695 (-86.1%)
Spills helped: 2159

Fills in all programs: 128142 -> 18400 (-85.6%)
Fills helped: 2159

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17379>
This commit is contained in:
Ian Romanick 2022-06-27 15:34:01 -07:00 committed by Marge Bot
parent a477587b4a
commit bdc7668008
3 changed files with 74 additions and 10 deletions

View File

@ -1152,6 +1152,10 @@ namespace {
return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
10 /* XXX */, 0, 0, 0, 0, 0);
case BRW_SFID_URB:
return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
default:
abort();
}

View File

@ -31,15 +31,71 @@
using namespace brw;
/**
 * Rewrite a logical URB read instruction, in place, as a SHADER_OPCODE_SEND
 * addressed to the URB shared function (BRW_SFID_URB).
 *
 * \param bld               Builder; used here only to reach the shader's
 *                          devinfo.
 * \param inst              Instruction to rewrite; modified in place.
 * \param per_slot_present  Forwarded to brw_urb_desc() as its third argument.
 *
 * NOTE(review): this listing looks like a diff whose +/- markers were
 * stripped.  The signature line taking `opcode op` and the
 * `inst->opcode = op;` statement appear to be the removed pre-change lines,
 * superseded by the `per_slot_present` signature and the SHADER_OPCODE_SEND
 * assignment below -- confirm against the tree.
 */
static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst,
bool per_slot_present)
{
inst->opcode = op;
const intel_device_info *devinfo = bld.shader->devinfo;
/* Preconditions on the incoming instruction: the written size is a whole
 * multiple of REG_SIZE, and src[0] is a UD-typed FIXED_GRF or VGRF.
 */
assert(inst->size_written % REG_SIZE == 0);
assert(inst->src[0].type == BRW_REGISTER_TYPE_UD);
assert(inst->src[0].file == FIXED_GRF || inst->src[0].file == VGRF);
/* Turn the instruction into a SEND to the URB with header_size = 1. */
inst->opcode = SHADER_OPCODE_SEND;
inst->header_size = 1;
inst->sfid = BRW_SFID_URB;
/* The descriptor is a SIMD8 read built from per_slot_present and
 * inst->offset; the fourth brw_urb_desc() argument is fixed to false here.
 */
inst->desc = brw_urb_desc(devinfo,
GFX8_URB_OPCODE_SIMD8_READ,
per_slot_present,
false,
inst->offset);
/* No extended descriptor bits and no extended message length. */
inst->ex_desc = 0;
inst->ex_mlen = 0;
inst->send_is_volatile = true;
/* Save the original src[0] before the source array is resized and its
 * slots are reassigned.
 */
fs_reg tmp = inst->src[0];
/* Four-source layout: the desc/ex_desc slots are immediate zero (the
 * descriptor value was stored in inst->desc above), src[2] is the original
 * src[0], and src[3] is the null register.
 */
inst->resize_sources(4);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = brw_imm_ud(0); /* ex_desc */
inst->src[2] = tmp;
inst->src[3] = brw_null_reg();
}
/**
 * Rewrite a logical URB write instruction, in place, as a SHADER_OPCODE_SEND
 * addressed to the URB shared function (BRW_SFID_URB).
 *
 * \param bld                   Builder; used here only to reach the shader's
 *                              devinfo.
 * \param inst                  Instruction to rewrite; modified in place.
 * \param per_slot_present      Forwarded to brw_urb_desc() as its third
 *                              argument.
 * \param channel_mask_present  Forwarded to brw_urb_desc() as its fourth
 *                              argument.
 *
 * NOTE(review): this listing looks like a diff whose +/- markers were
 * stripped.  The signature line taking `opcode op` and the
 * `inst->opcode = op;` statement appear to be the removed pre-change lines,
 * superseded by the two-bool signature and the SHADER_OPCODE_SEND
 * assignment below -- confirm against the tree.
 */
static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
bool per_slot_present, bool channel_mask_present)
{
inst->opcode = op;
const intel_device_info *devinfo = bld.shader->devinfo;
/* The incoming instruction must not already have a header. */
assert(inst->header_size == 0);
/* Turn the instruction into a SEND to the URB with header_size = 1 and a
 * null destination.
 */
inst->opcode = SHADER_OPCODE_SEND;
inst->header_size = 1;
inst->dst = brw_null_reg();
inst->sfid = BRW_SFID_URB;
/* The descriptor is a SIMD8 write built from the two flags and
 * inst->offset.
 */
inst->desc = brw_urb_desc(devinfo,
GFX8_URB_OPCODE_SIMD8_WRITE,
per_slot_present,
channel_mask_present,
inst->offset);
/* No extended descriptor bits and no extended message length. */
inst->ex_desc = 0;
inst->ex_mlen = 0;
inst->send_has_side_effects = true;
/* Save the original src[0] before the source array is resized and its
 * slots are reassigned.
 */
fs_reg tmp = inst->src[0];
/* Four-source layout: the desc/ex_desc slots are immediate zero (the
 * descriptor value was stored in inst->desc above), src[2] is the original
 * src[0], and src[3] is the null register.
 */
inst->resize_sources(4);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = brw_imm_ud(0); /* ex_desc */
inst->src[2] = tmp;
inst->src[3] = brw_null_reg();
}
static void
@ -2642,23 +2698,23 @@ fs_visitor::lower_logical_sends()
break;
case SHADER_OPCODE_URB_READ_LOGICAL:
lower_urb_read_logical_send(ibld, inst, SHADER_OPCODE_URB_READ_SIMD8);
lower_urb_read_logical_send(ibld, inst, false);
break;
case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL:
lower_urb_read_logical_send(ibld, inst, SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT);
lower_urb_read_logical_send(ibld, inst, true);
break;
case SHADER_OPCODE_URB_WRITE_LOGICAL:
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8);
lower_urb_write_logical_send(ibld, inst, false, false);
break;
case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT);
lower_urb_write_logical_send(ibld, inst, true, false);
break;
case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED);
lower_urb_write_logical_send(ibld, inst, false, true);
break;
case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT);
lower_urb_write_logical_send(ibld, inst, true, true);
break;
default:

View File

@ -575,6 +575,10 @@ schedule_node::set_latency_gfx7(bool is_haswell)
latency = 200;
break;
case BRW_SFID_URB:
latency = 200;
break;
default:
unreachable("Unknown SFID");
}