diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index 343bac23889..6be8d8d8693 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -367,7 +367,8 @@ is_logic_op(enum opcode opcode) } static bool -can_take_stride(fs_inst *inst, unsigned arg, unsigned stride, +can_take_stride(fs_inst *inst, brw_reg_type dst_type, + unsigned arg, unsigned stride, const gen_device_info *devinfo) { if (stride > 4) @@ -377,9 +378,9 @@ can_take_stride(fs_inst *inst, unsigned arg, unsigned stride, * of the corresponding channel of the destination, and the provided stride * would break this restriction. */ - if (has_dst_aligned_region_restriction(devinfo, inst) && + if (has_dst_aligned_region_restriction(devinfo, inst, dst_type) && !(type_sz(inst->src[arg].type) * stride == - type_sz(inst->dst.type) * inst->dst.stride || + type_sz(dst_type) * inst->dst.stride || stride == 0)) return false; @@ -534,10 +535,15 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (instruction_requires_packed_data(inst) && entry_stride != 1) return false; + const brw_reg_type dst_type = (has_source_modifiers && + entry->dst.type != inst->src[arg].type) ? + entry->dst.type : inst->dst.type; + /* Bail if the result of composing both strides would exceed the * hardware limit. */ - if (!can_take_stride(inst, arg, entry_stride * inst->src[arg].stride, + if (!can_take_stride(inst, dst_type, arg, + entry_stride * inst->src[arg].stride, devinfo)) return false; diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index 60f0c8bfa77..6becef748ce 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -553,7 +553,8 @@ is_unordered(const fs_inst *inst) */ static inline bool has_dst_aligned_region_restriction(const gen_device_info *devinfo, - const fs_inst *inst) + const fs_inst *inst, + brw_reg_type dst_type) { const brw_reg_type exec_type = get_exec_type(inst); /* Even though the hardware spec claims that "integer DWord multiply" @@ -567,13 +568,20 @@ has_dst_aligned_region_restriction(const gen_device_info *devinfo, (inst->opcode == BRW_OPCODE_MAD && MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4)); - if (type_sz(inst->dst.type) > 4 || type_sz(exec_type) > 4 || + if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 || (type_sz(exec_type) == 4 && is_dword_multiply)) return devinfo->is_cherryview || gen_device_info_is_9lp(devinfo); else return false; } +static inline bool +has_dst_aligned_region_restriction(const gen_device_info *devinfo, + const fs_inst *inst) +{ + return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type); +} + /** * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from * the specified register file into a VGRF.