intel/fs: Lower URB messages to SEND
Before rebasing on top of Ken's split-SEND optimization (see !17018), this commit just caused some scheduling changes in various tessellation and geometry shaders. These changes were caused by the addition of real latency information for the URB messages. With the addition of the split-SEND optimization, the changes are... staggering.

All of the shaders whose spills and fills were helped are vertex shaders from Batman Arkham Origins. What surprises me is that these shaders account for such a high percentage of the spills and fills in fossil-db. 85%?!?

v2: Use FIXED_GRF instead of BRW_GENERAL_REGISTER_FILE in an assertion. Suggested by Ken.

shader-db results:

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20013625 -> 19954020 (-0.30%)
instructions in affected programs: 4007157 -> 3947552 (-1.49%)
helped: 31161
HURT: 0
helped stats (abs) min: 1 max: 400 x̄: 1.91 x̃: 2
helped stats (rel) min: 0.08% max: 59.70% x̄: 2.20% x̃: 1.83%
95% mean confidence interval for instructions value: -1.97 -1.86
95% mean confidence interval for instructions %-change: -2.22% -2.18%
Instructions are helped.

total cycles in shared programs: 859337569 -> 858636788 (-0.08%)
cycles in affected programs: 74168298 -> 73467517 (-0.94%)
helped: 13812
HURT: 16846
helped stats (abs) min: 1 max: 291078 x̄: 82.83 x̃: 4
helped stats (rel) min: <.01% max: 37.09% x̄: 3.47% x̃: 2.02%
HURT stats (abs) min: 1 max: 1543 x̄: 26.31 x̃: 14
HURT stats (rel) min: <.01% max: 77.97% x̄: 4.11% x̃: 2.58%
95% mean confidence interval for cycles value: -55.10 9.39
95% mean confidence interval for cycles %-change: 0.62% 0.77%
Inconclusive result (value mean confidence interval includes 0).
Broadwell
total cycles in shared programs: 904844939 -> 904832320 (<.01%)
cycles in affected programs: 525360 -> 512741 (-2.40%)
helped: 215
HURT: 4
helped stats (abs) min: 4 max: 1018 x̄: 60.16 x̃: 39
helped stats (rel) min: 0.14% max: 15.85% x̄: 2.16% x̃: 2.04%
HURT stats (abs) min: 79 max: 79 x̄: 79.00 x̃: 79
HURT stats (rel) min: 1.31% max: 1.57% x̄: 1.43% x̃: 1.43%
95% mean confidence interval for cycles value: -75.02 -40.22
95% mean confidence interval for cycles %-change: -2.37% -1.81%
Cycles are helped.

No shader-db changes on any older Intel platforms.

fossil-db results:

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 142622800 -> 141461114 (-0.8%)
Instructions helped: 197186

Cycles in all programs: 9101223846 -> 9099440025 (-0.0%)
Cycles helped: 37963
Cycles hurt: 151233

Spills in all programs: 98829 -> 13695 (-86.1%)
Spills helped: 2159

Fills in all programs: 128142 -> 18400 (-85.6%)
Fills helped: 2159

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17379>
This commit is contained in:
parent
a477587b4a
commit
bdc7668008
|
@ -1152,6 +1152,10 @@ namespace {
|
|||
return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
|
||||
10 /* XXX */, 0, 0, 0, 0, 0);
|
||||
|
||||
case BRW_SFID_URB:
|
||||
return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
|
||||
32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
|
||||
|
||||
default:
|
||||
abort();
|
||||
}
|
||||
|
|
|
@ -31,15 +31,71 @@
|
|||
using namespace brw;
|
||||
|
||||
static void
|
||||
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
|
||||
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst,
|
||||
bool per_slot_present)
|
||||
{
|
||||
inst->opcode = op;
|
||||
const intel_device_info *devinfo = bld.shader->devinfo;
|
||||
|
||||
|
||||
assert(inst->size_written % REG_SIZE == 0);
|
||||
assert(inst->src[0].type == BRW_REGISTER_TYPE_UD);
|
||||
assert(inst->src[0].file == FIXED_GRF || inst->src[0].file == VGRF);
|
||||
|
||||
inst->opcode = SHADER_OPCODE_SEND;
|
||||
inst->header_size = 1;
|
||||
|
||||
inst->sfid = BRW_SFID_URB;
|
||||
inst->desc = brw_urb_desc(devinfo,
|
||||
GFX8_URB_OPCODE_SIMD8_READ,
|
||||
per_slot_present,
|
||||
false,
|
||||
inst->offset);
|
||||
|
||||
inst->ex_desc = 0;
|
||||
inst->ex_mlen = 0;
|
||||
inst->send_is_volatile = true;
|
||||
|
||||
fs_reg tmp = inst->src[0];
|
||||
|
||||
inst->resize_sources(4);
|
||||
|
||||
inst->src[0] = brw_imm_ud(0); /* desc */
|
||||
inst->src[1] = brw_imm_ud(0); /* ex_desc */
|
||||
inst->src[2] = tmp;
|
||||
inst->src[3] = brw_null_reg();
|
||||
}
|
||||
|
||||
static void
|
||||
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
|
||||
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
|
||||
bool per_slot_present, bool channel_mask_present)
|
||||
{
|
||||
inst->opcode = op;
|
||||
const intel_device_info *devinfo = bld.shader->devinfo;
|
||||
|
||||
assert(inst->header_size == 0);
|
||||
|
||||
inst->opcode = SHADER_OPCODE_SEND;
|
||||
inst->header_size = 1;
|
||||
inst->dst = brw_null_reg();
|
||||
|
||||
inst->sfid = BRW_SFID_URB;
|
||||
inst->desc = brw_urb_desc(devinfo,
|
||||
GFX8_URB_OPCODE_SIMD8_WRITE,
|
||||
per_slot_present,
|
||||
channel_mask_present,
|
||||
inst->offset);
|
||||
|
||||
inst->ex_desc = 0;
|
||||
inst->ex_mlen = 0;
|
||||
inst->send_has_side_effects = true;
|
||||
|
||||
fs_reg tmp = inst->src[0];
|
||||
|
||||
inst->resize_sources(4);
|
||||
|
||||
inst->src[0] = brw_imm_ud(0); /* desc */
|
||||
inst->src[1] = brw_imm_ud(0); /* ex_desc */
|
||||
inst->src[2] = tmp;
|
||||
inst->src[3] = brw_null_reg();
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -2642,23 +2698,23 @@ fs_visitor::lower_logical_sends()
|
|||
break;
|
||||
|
||||
case SHADER_OPCODE_URB_READ_LOGICAL:
|
||||
lower_urb_read_logical_send(ibld, inst, SHADER_OPCODE_URB_READ_SIMD8);
|
||||
lower_urb_read_logical_send(ibld, inst, false);
|
||||
break;
|
||||
case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL:
|
||||
lower_urb_read_logical_send(ibld, inst, SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT);
|
||||
lower_urb_read_logical_send(ibld, inst, true);
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_URB_WRITE_LOGICAL:
|
||||
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8);
|
||||
lower_urb_write_logical_send(ibld, inst, false, false);
|
||||
break;
|
||||
case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
|
||||
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT);
|
||||
lower_urb_write_logical_send(ibld, inst, true, false);
|
||||
break;
|
||||
case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
|
||||
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED);
|
||||
lower_urb_write_logical_send(ibld, inst, false, true);
|
||||
break;
|
||||
case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
|
||||
lower_urb_write_logical_send(ibld, inst, SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT);
|
||||
lower_urb_write_logical_send(ibld, inst, true, true);
|
||||
break;
|
||||
|
||||
default:
|
||||
|
|
|
@ -575,6 +575,10 @@ schedule_node::set_latency_gfx7(bool is_haswell)
|
|||
latency = 200;
|
||||
break;
|
||||
|
||||
case BRW_SFID_URB:
|
||||
latency = 200;
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("Unknown SFID");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue