intel/fs/copy_prop: check stride constraints with actual final type

In some cases we will change the type of the destination register of
an instruction. This is the type we should use to verify that we're
allow to do the replacement.

Otherwise we can hit restrictions on CHV and upcoming Xe-Hp for
instance where the copy propagation transforms this :

send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD
mov(16) vgrf11:UW, vgrf10<2>:UW
mov(16) vgrf12:UW, vgrf10+0.2<2>:UW
mov(16) vgrf15:HF, |vgrf11|:HF
mov(16) vgrf16:HF, |vgrf12|:HF
mov(8) vgrf41<2>:UW, vgrf15+0.0:UW group0
mov(8) vgrf42<2>:UW, vgrf15+0.16:UW group8
mov(8) vgrf45<2>:UW, vgrf16+0.0:UW group0
mov(8) vgrf46<2>:UW, vgrf16+0.16:UW group8

into this :

send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD
mov(8) vgrf41<2>:HF, |vgrf10+0.0|<2>:HF group0
mov(8) vgrf42<2>:HF, |vgrf10+1.0|<2>:HF group8
mov(8) vgrf45<2>:HF, |vgrf10+0.2|<2>:HF group0
mov(8) vgrf46<2>:HF, |vgrf10+1.2|<2>:HF group8

Because of the floating point use, stride and offets should be the
same.

v2: Fix final destination type selection (Curro)

v3: constify (Curro)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9832>
This commit is contained in:
Lionel Landwerlin 2021-03-24 09:56:42 +02:00 committed by Marge Bot
parent 0b35987895
commit aa53665fda
2 changed files with 20 additions and 6 deletions

View File

@ -367,7 +367,8 @@ is_logic_op(enum opcode opcode)
}
static bool
can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
can_take_stride(fs_inst *inst, brw_reg_type dst_type,
unsigned arg, unsigned stride,
const gen_device_info *devinfo)
{
if (stride > 4)
@ -377,9 +378,9 @@ can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
* of the corresponding channel of the destination, and the provided stride
* would break this restriction.
*/
if (has_dst_aligned_region_restriction(devinfo, inst) &&
if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) &&
!(type_sz(inst->src[arg].type) * stride ==
type_sz(inst->dst.type) * inst->dst.stride ||
type_sz(dst_type) * inst->dst.stride ||
stride == 0))
return false;
@ -534,10 +535,15 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
if (instruction_requires_packed_data(inst) && entry_stride != 1)
return false;
const brw_reg_type dst_type = (has_source_modifiers &&
entry->dst.type != inst->src[arg].type) ?
entry->dst.type : inst->dst.type;
/* Bail if the result of composing both strides would exceed the
* hardware limit.
*/
if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride,
if (!can_take_stride(inst, dst_type, arg,
entry_stride * inst->src[arg].stride,
devinfo))
return false;

View File

@ -553,7 +553,8 @@ is_unordered(const fs_inst *inst)
*/
static inline bool
has_dst_aligned_region_restriction(const gen_device_info *devinfo,
const fs_inst *inst)
const fs_inst *inst,
brw_reg_type dst_type)
{
const brw_reg_type exec_type = get_exec_type(inst);
/* Even though the hardware spec claims that "integer DWord multiply"
@ -567,13 +568,20 @@ has_dst_aligned_region_restriction(const gen_device_info *devinfo,
(inst->opcode == BRW_OPCODE_MAD &&
MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
if (type_sz(inst->dst.type) > 4 || type_sz(exec_type) > 4 ||
if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
(type_sz(exec_type) == 4 && is_dword_multiply))
return devinfo->is_cherryview || gen_device_info_is_9lp(devinfo);
else
return false;
}
static inline bool
has_dst_aligned_region_restriction(const gen_device_info *devinfo,
const fs_inst *inst)
{
return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
}
/**
* Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
* the specified register file into a VGRF.