intel/fs: Emit better code for u2u of extract
Emitting the instructions one by one results in two MOV instructions that won't be propagated. By handling both instructions at once, a single MOV is emitted.

For example, on Ice Lake this helps dEQP-VK.spirv_assembly.type.vec3.i8.bitwise_xor_frag:

SIMD8 shader: 49 instructions. 1 loops. 4044 cycles. 0:0 spills:fills, 5 sends
SIMD8 shader: 41 instructions. 1 loops. 3804 cycles. 0:0 spills:fills, 5 sends

Without "intel/fs: Allow copy propagation between MOVs of mixed sizes," the improvement is still 8 instructions, but there are more instructions to begin with:

SIMD8 shader: 52 instructions. 1 loops. 4164 cycles. 0:0 spills:fills, 5 sends
SIMD8 shader: 44 instructions. 1 loops. 3944 cycles. 0:0 spills:fills, 5 sends

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9025>
This commit is contained in:
parent
e3f502e007
commit
7c83aa0518
|
@ -1102,13 +1102,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
|
||||||
case nir_op_f2i32:
|
case nir_op_f2i32:
|
||||||
case nir_op_f2u32:
|
case nir_op_f2u32:
|
||||||
case nir_op_i2f16:
|
case nir_op_i2f16:
|
||||||
case nir_op_i2i16:
|
|
||||||
case nir_op_u2f16:
|
case nir_op_u2f16:
|
||||||
case nir_op_u2u16:
|
|
||||||
case nir_op_f2i16:
|
case nir_op_f2i16:
|
||||||
case nir_op_f2u16:
|
case nir_op_f2u16:
|
||||||
case nir_op_i2i8:
|
|
||||||
case nir_op_u2u8:
|
|
||||||
case nir_op_f2i8:
|
case nir_op_f2i8:
|
||||||
case nir_op_f2u8:
|
case nir_op_f2u8:
|
||||||
if (result.type == BRW_REGISTER_TYPE_B ||
|
if (result.type == BRW_REGISTER_TYPE_B ||
|
||||||
|
@ -1124,6 +1120,44 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
|
||||||
inst = bld.MOV(result, op[0]);
|
inst = bld.MOV(result, op[0]);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case nir_op_i2i8:
|
||||||
|
case nir_op_u2u8:
|
||||||
|
assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
|
||||||
|
FALLTHROUGH;
|
||||||
|
case nir_op_i2i16:
|
||||||
|
case nir_op_u2u16: {
|
||||||
|
/* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
|
||||||
|
* Emitting the instructions one by one results in two MOV instructions
|
||||||
|
* that won't be propagated. By handling both instructions here, a
|
||||||
|
* single MOV is emitted.
|
||||||
|
*/
|
||||||
|
nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
|
||||||
|
if (extract_instr != NULL) {
|
||||||
|
if (extract_instr->op == nir_op_extract_u8 ||
|
||||||
|
extract_instr->op == nir_op_extract_i8) {
|
||||||
|
prepare_alu_destination_and_sources(bld, extract_instr, op, false);
|
||||||
|
|
||||||
|
const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
|
||||||
|
const brw_reg_type type =
|
||||||
|
brw_int_type(1, extract_instr->op == nir_op_extract_i8);
|
||||||
|
|
||||||
|
op[0] = subscript(op[0], type, byte);
|
||||||
|
} else if (extract_instr->op == nir_op_extract_u16 ||
|
||||||
|
extract_instr->op == nir_op_extract_i16) {
|
||||||
|
prepare_alu_destination_and_sources(bld, extract_instr, op, false);
|
||||||
|
|
||||||
|
const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
|
||||||
|
const brw_reg_type type =
|
||||||
|
brw_int_type(2, extract_instr->op == nir_op_extract_i16);
|
||||||
|
|
||||||
|
op[0] = subscript(op[0], type, word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inst = bld.MOV(result, op[0]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case nir_op_fsat:
|
case nir_op_fsat:
|
||||||
inst = bld.MOV(result, op[0]);
|
inst = bld.MOV(result, op[0]);
|
||||||
inst->saturate = true;
|
inst->saturate = true;
|
||||||
|
|
Loading…
Reference in New Issue