intel/compiler: split is_partial_write() into two variants
This function is used in two different scenarios that for 32-bit instructions are the same, but for 16-bit instructions are not. One scenario is that in which we are working at a SIMD8 register level and we need to know if a register is fully defined or written. This is useful, for example, in the context of liveness analysis or register allocation, where we work with units of registers. The other scenario is that in which we want to know if an instruction is writing a full scalar component or just some subset of it. This is useful, for example, in the context of some optimization passes like copy propagation. For 32-bit instructions (or larger), a SIMD8 dispatch will always write at least a full SIMD8 register (32B) if the write is not partial. The function is_partial_write() checks this to determine if we have a partial write. However, when we deal with 16-bit instructions, that logic disables some optimizations that should be safe. For example, a SIMD8 16-bit MOV will only update half of a SIMD register, but it is still a complete write of the variable for a SIMD8 dispatch, so we should not prevent copy propagation in this scenario because we don't write all 32 bytes in the SIMD register or because the write starts at offset 16B (wehere we pack components Y or W of 16-bit vectors). This is a problem for SIMD8 executions (VS, TCS, TES, GS) of 16-bit instructions, which lose a number of optimizations because of this, most important of which is copy-propagation. This patch splits is_partial_write() into is_partial_reg_write(), which represents the current is_partial_write(), useful for things like liveness analysis, and is_partial_var_write(), which considers the dispatch size to check if we are writing a full variable (rather than a full register) to decide if the write is partial or not, which is what we really want in many optimization passes. Then the patch goes on and rewrites all uses of is_partial_write() to use one or the other version. Specifically, we use is_partial_var_write() in the following places: copy propagation, cmod propagation, common subexpression elimination, saturate propagation and sel peephole. Notice that the semantics of is_partial_var_write() exactly match the current implementation of is_partial_write() for anything that is 32-bit or larger, so no changes are expected for 32-bit instructions. Tested against ~5000 tests involving 16-bit instructions in CTS produced the following changes in instruction counts: Patched | Master | % | ================================================ SIMD8 | 621,900 | 706,721 | -12.00% | ================================================ SIMD16 | 93,252 | 93,252 | 0.00% | ================================================ As expected, the change only affects SIMD8 dispatches. Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
This commit is contained in:
parent
0986199b31
commit
40b3abb4d1
|
@ -734,14 +734,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
|
|||
* it.
|
||||
*/
|
||||
bool
|
||||
fs_inst::is_partial_write() const
|
||||
fs_inst::is_partial_reg_write() const
|
||||
{
|
||||
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
|
||||
(this->exec_size * type_sz(this->dst.type)) < 32 ||
|
||||
!this->dst.is_contiguous() ||
|
||||
(this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
|
||||
this->dst.offset % REG_SIZE != 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the instruction has a flag that means it won't
|
||||
* update an entire variable for the given dispatch width.
|
||||
*
|
||||
* This is only different from is_partial_reg_write() for SIMD8
|
||||
* dispatches of 16-bit (or smaller) instructions.
|
||||
*/
|
||||
bool
|
||||
fs_inst::is_partial_var_write(uint32_t dispatch_width) const
|
||||
{
|
||||
const uint32_t type_size = type_sz(this->dst.type);
|
||||
uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size);
|
||||
|
||||
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
|
||||
!this->dst.is_contiguous() ||
|
||||
(this->exec_size * type_sz(this->dst.type)) < var_size ||
|
||||
this->dst.offset % var_size != 0);
|
||||
}
|
||||
|
||||
unsigned
|
||||
fs_inst::components_read(unsigned i) const
|
||||
{
|
||||
|
@ -2956,7 +2975,7 @@ fs_visitor::opt_register_renaming()
|
|||
if (depth == 0 &&
|
||||
inst->dst.file == VGRF &&
|
||||
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
|
||||
!inst->is_partial_write()) {
|
||||
!inst->is_partial_reg_write()) {
|
||||
if (remap[dst] == ~0u) {
|
||||
remap[dst] = dst;
|
||||
} else {
|
||||
|
@ -3160,7 +3179,7 @@ fs_visitor::compute_to_mrf()
|
|||
next_ip++;
|
||||
|
||||
if (inst->opcode != BRW_OPCODE_MOV ||
|
||||
inst->is_partial_write() ||
|
||||
inst->is_partial_reg_write() ||
|
||||
inst->dst.file != MRF || inst->src[0].file != VGRF ||
|
||||
inst->dst.type != inst->src[0].type ||
|
||||
inst->src[0].abs || inst->src[0].negate ||
|
||||
|
@ -3193,7 +3212,7 @@ fs_visitor::compute_to_mrf()
|
|||
* that writes that reg, but it would require smarter
|
||||
* tracking.
|
||||
*/
|
||||
if (scan_inst->is_partial_write())
|
||||
if (scan_inst->is_partial_reg_write())
|
||||
break;
|
||||
|
||||
/* Handling things not fully contained in the source of the copy
|
||||
|
@ -3511,7 +3530,7 @@ fs_visitor::remove_duplicate_mrf_writes()
|
|||
if (inst->opcode == BRW_OPCODE_MOV &&
|
||||
inst->dst.file == MRF &&
|
||||
inst->src[0].file != ARF &&
|
||||
!inst->is_partial_write()) {
|
||||
!inst->is_partial_reg_write()) {
|
||||
last_mrf_move[inst->dst.nr] = inst;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -50,13 +50,13 @@
|
|||
|
||||
static bool
|
||||
cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
|
||||
fs_inst *inst)
|
||||
fs_inst *inst, unsigned dispatch_width)
|
||||
{
|
||||
bool read_flag = false;
|
||||
|
||||
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
||||
if (scan_inst->opcode == BRW_OPCODE_ADD &&
|
||||
!scan_inst->is_partial_write() &&
|
||||
!scan_inst->is_partial_var_write(dispatch_width) &&
|
||||
scan_inst->exec_size == inst->exec_size) {
|
||||
bool negate;
|
||||
|
||||
|
@ -126,7 +126,7 @@ cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
|
|||
*/
|
||||
static bool
|
||||
cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
|
||||
fs_inst *inst)
|
||||
fs_inst *inst, unsigned dispatch_width)
|
||||
{
|
||||
const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
|
||||
bool read_flag = false;
|
||||
|
@ -141,7 +141,7 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
|
|||
scan_inst->opcode != BRW_OPCODE_AND)
|
||||
break;
|
||||
|
||||
if (scan_inst->is_partial_write() ||
|
||||
if (scan_inst->is_partial_var_write(dispatch_width) ||
|
||||
scan_inst->dst.offset != inst->src[0].offset ||
|
||||
scan_inst->exec_size != inst->exec_size)
|
||||
break;
|
||||
|
@ -166,7 +166,9 @@ cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
|
|||
}
|
||||
|
||||
static bool
|
||||
opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
|
||||
opt_cmod_propagation_local(const gen_device_info *devinfo,
|
||||
bblock_t *block,
|
||||
unsigned dispatch_width)
|
||||
{
|
||||
bool progress = false;
|
||||
int ip = block->end_ip + 1;
|
||||
|
@ -219,14 +221,14 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
|
|||
*/
|
||||
if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
|
||||
if (brw_reg_type_is_floating_point(inst->src[0].type) &&
|
||||
cmod_propagate_cmp_to_add(devinfo, block, inst))
|
||||
cmod_propagate_cmp_to_add(devinfo, block, inst, dispatch_width))
|
||||
progress = true;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (inst->opcode == BRW_OPCODE_NOT) {
|
||||
progress = cmod_propagate_not(devinfo, block, inst) || progress;
|
||||
progress = cmod_propagate_not(devinfo, block, inst, dispatch_width) || progress;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -234,7 +236,7 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
|
|||
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
||||
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
|
||||
inst->src[0], inst->size_read(0))) {
|
||||
if (scan_inst->is_partial_write() ||
|
||||
if (scan_inst->is_partial_var_write(dispatch_width) ||
|
||||
scan_inst->dst.offset != inst->src[0].offset ||
|
||||
scan_inst->exec_size != inst->exec_size)
|
||||
break;
|
||||
|
@ -364,7 +366,7 @@ fs_visitor::opt_cmod_propagation()
|
|||
bool progress = false;
|
||||
|
||||
foreach_block_reverse(block, cfg) {
|
||||
progress = opt_cmod_propagation_local(devinfo, block) || progress;
|
||||
progress = opt_cmod_propagation_local(devinfo, block, dispatch_width) || progress;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
|
|
|
@ -515,7 +515,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
|
|||
/* Compute the first component of the copy that the instruction is
|
||||
* reading, and the base byte offset within that component.
|
||||
*/
|
||||
assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
|
||||
assert(entry->dst.stride == 1);
|
||||
const unsigned component = rel_offset / type_sz(entry->dst.type);
|
||||
const unsigned suboffset = rel_offset % type_sz(entry->dst.type);
|
||||
|
||||
|
@ -767,7 +767,7 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
|
|||
}
|
||||
|
||||
static bool
|
||||
can_propagate_from(fs_inst *inst)
|
||||
can_propagate_from(fs_inst *inst, unsigned dispatch_width)
|
||||
{
|
||||
return (inst->opcode == BRW_OPCODE_MOV &&
|
||||
inst->dst.file == VGRF &&
|
||||
|
@ -778,7 +778,7 @@ can_propagate_from(fs_inst *inst)
|
|||
inst->src[0].file == UNIFORM ||
|
||||
inst->src[0].file == IMM) &&
|
||||
inst->src[0].type == inst->dst.type &&
|
||||
!inst->is_partial_write());
|
||||
!inst->is_partial_var_write(dispatch_width));
|
||||
}
|
||||
|
||||
/* Walks a basic block and does copy propagation on it using the acp
|
||||
|
@ -830,7 +830,7 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
|
|||
/* If this instruction's source could potentially be folded into the
|
||||
* operand of another instruction, add it to the ACP.
|
||||
*/
|
||||
if (can_propagate_from(inst)) {
|
||||
if (can_propagate_from(inst, dispatch_width)) {
|
||||
acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
|
||||
entry->dst = inst->dst;
|
||||
entry->src = inst->src[0];
|
||||
|
|
|
@ -252,7 +252,8 @@ fs_visitor::opt_cse_local(bblock_t *block)
|
|||
int ip = block->start_ip;
|
||||
foreach_inst_in_block(fs_inst, inst, block) {
|
||||
/* Skip some cases. */
|
||||
if (is_expression(this, inst) && !inst->is_partial_write() &&
|
||||
if (is_expression(this, inst) &&
|
||||
!inst->is_partial_var_write(dispatch_width) &&
|
||||
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
|
||||
inst->dst.is_null()))
|
||||
{
|
||||
|
|
|
@ -107,7 +107,7 @@ fs_visitor::dead_code_eliminate()
|
|||
}
|
||||
|
||||
if (inst->dst.file == VGRF) {
|
||||
if (!inst->is_partial_write()) {
|
||||
if (!inst->is_partial_reg_write()) {
|
||||
int var = live_intervals->var_from_reg(inst->dst);
|
||||
for (unsigned i = 0; i < regs_written(inst); i++) {
|
||||
BITSET_CLEAR(live, var + i);
|
||||
|
|
|
@ -84,7 +84,7 @@ fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
|
|||
* screens off previous updates of that variable (VGRF channel).
|
||||
*/
|
||||
if (inst->dst.file == VGRF) {
|
||||
if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
|
||||
if (!inst->is_partial_reg_write() && !BITSET_TEST(bd->use, var))
|
||||
BITSET_SET(bd->def, var);
|
||||
|
||||
BITSET_SET(bd->defout, var);
|
||||
|
|
|
@ -1054,7 +1054,7 @@ fs_visitor::spill_reg(unsigned spill_reg)
|
|||
* write, there should be no need for the unspill since the
|
||||
* instruction will be overwriting the whole destination in any case.
|
||||
*/
|
||||
if (inst->is_partial_write() ||
|
||||
if (inst->is_partial_reg_write() ||
|
||||
(!inst->force_writemask_all && !per_channel))
|
||||
emit_unspill(ubld, spill_src, subset_spill_offset,
|
||||
regs_written(inst));
|
||||
|
|
|
@ -70,7 +70,7 @@ is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
|
|||
{
|
||||
if ((inst->opcode != BRW_OPCODE_MOV &&
|
||||
inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
|
||||
inst->is_partial_write() ||
|
||||
inst->is_partial_reg_write() ||
|
||||
inst->saturate ||
|
||||
inst->src[0].file != VGRF ||
|
||||
inst->src[0].negate ||
|
||||
|
|
|
@ -43,7 +43,8 @@
|
|||
*/
|
||||
|
||||
static bool
|
||||
opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
|
||||
opt_saturate_propagation_local(fs_visitor *v, bblock_t *block,
|
||||
unsigned dispatch_width)
|
||||
{
|
||||
bool progress = false;
|
||||
int ip = block->end_ip + 1;
|
||||
|
@ -66,7 +67,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
|
|||
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
||||
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
|
||||
inst->src[0], inst->size_read(0))) {
|
||||
if (scan_inst->is_partial_write() ||
|
||||
if (scan_inst->is_partial_var_write(dispatch_width) ||
|
||||
(scan_inst->dst.type != inst->dst.type &&
|
||||
!scan_inst->can_change_types()))
|
||||
break;
|
||||
|
@ -153,7 +154,7 @@ fs_visitor::opt_saturate_propagation()
|
|||
calculate_live_intervals();
|
||||
|
||||
foreach_block (block, cfg) {
|
||||
progress = opt_saturate_propagation_local(this, block) || progress;
|
||||
progress = opt_saturate_propagation_local(this, block, dispatch_width) || progress;
|
||||
}
|
||||
|
||||
/* Live intervals are still valid. */
|
||||
|
|
|
@ -167,8 +167,8 @@ fs_visitor::opt_peephole_sel()
|
|||
then_mov[i]->exec_size != else_mov[i]->exec_size ||
|
||||
then_mov[i]->group != else_mov[i]->group ||
|
||||
then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
|
||||
then_mov[i]->is_partial_write() ||
|
||||
else_mov[i]->is_partial_write() ||
|
||||
then_mov[i]->is_partial_var_write(dispatch_width) ||
|
||||
else_mov[i]->is_partial_var_write(dispatch_width) ||
|
||||
then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
|
||||
else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
|
||||
movs = i;
|
||||
|
|
|
@ -348,7 +348,8 @@ public:
|
|||
void resize_sources(uint8_t num_sources);
|
||||
|
||||
bool is_send_from_grf() const;
|
||||
bool is_partial_write() const;
|
||||
bool is_partial_reg_write() const;
|
||||
bool is_partial_var_write(unsigned dispatch_width) const;
|
||||
bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
|
||||
unsigned components_read(unsigned i) const;
|
||||
unsigned size_read(int arg) const;
|
||||
|
|
Loading…
Reference in New Issue