aco: implement 8/16-bit instructions which can be trivially widened

When nir_lower_bit_size becomes more capable, we might want to revert some
of this.

fossil-db (parallel-rdp, Navi):
Totals from 217 (31.77% of 683) affected shaders:
SGPRs: 11320 -> 10200 (-9.89%)
VGPRs: 7156 -> 7364 (+2.91%)
CodeSize: 1453948 -> 1430136 (-1.64%); split: -1.66%, +0.02%
Instrs: 258530 -> 254840 (-1.43%); split: -1.44%, +0.01%
Cycles: 37334360 -> 37247936 (-0.23%); split: -0.26%, +0.03%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4791>
This commit is contained in:
Rhys Perry 2020-07-08 19:19:43 +01:00 committed by Marge Bot
parent ef95ba8cdd
commit 786828131a
2 changed files with 18 additions and 23 deletions

View File

@ -1224,7 +1224,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
/* Don't use s_andn2 here, this allows the optimizer to make a better decision */
Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
} else if (dst.regClass() == v1) {
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
} else if (dst.regClass() == v2) {
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
@ -1365,7 +1365,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
case nir_op_ior: {
if (instr->dest.dest.ssa.bit_size == 1) {
emit_boolean_logic(ctx, instr, Builder::s_or, dst);
} else if (dst.regClass() == v1) {
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
} else if (dst.regClass() == v2) {
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
@ -1381,7 +1381,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
case nir_op_iand: {
if (instr->dest.dest.ssa.bit_size == 1) {
emit_boolean_logic(ctx, instr, Builder::s_and, dst);
} else if (dst.regClass() == v1) {
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
} else if (dst.regClass() == v2) {
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
@ -1397,7 +1397,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
case nir_op_ixor: {
if (instr->dest.dest.ssa.bit_size == 1) {
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
} else if (dst.regClass() == v1) {
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
} else if (dst.regClass() == v2) {
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
@ -1527,17 +1527,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
break;
} else if (dst.regClass() == v2b && ctx->program->chip_class < GFX10) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
break;
} else if (dst.regClass() == v2b) {
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
break;
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
break;
}
Temp src0 = get_alu_src(ctx, instr->src[0]);
Temp src1 = get_alu_src(ctx, instr->src[1]);
if (dst.regClass() == v1) {
if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
break;
}
@ -1649,13 +1649,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
if (dst.regClass() == v1) {
bld.vsub32(Definition(dst), src0, src1);
break;
} else if (dst.regClass() == v2b) {
} else if (dst.bytes() <= 2) {
if (ctx->program->chip_class >= GFX10)
bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
else if (src1.type() == RegType::sgpr)
bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
else
else if (ctx->program->chip_class >= GFX8)
bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
else
bld.vsub32(Definition(dst), src0, src1);
break;
}
@ -1714,7 +1716,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
break;
}
case nir_op_imul: {
if (dst.regClass() == v1) {
if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
} else if (dst.type() == RegType::vgpr) {
uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
@ -1723,10 +1729,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
} else {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
}
} else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
} else if (dst.regClass() == v2b) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
} else {

View File

@ -2957,7 +2957,6 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_)
unsigned bit_size = alu->dest.dest.ssa.bit_size;
switch (alu->op) {
case nir_op_iabs:
case nir_op_iand:
case nir_op_bitfield_select:
case nir_op_udiv:
case nir_op_idiv:
@ -2966,11 +2965,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_)
case nir_op_imul_high:
case nir_op_umul_high:
case nir_op_ineg:
case nir_op_inot:
case nir_op_ior:
case nir_op_irem:
case nir_op_isign:
case nir_op_ixor:
return 32;
case nir_op_imax:
case nir_op_umax:
@ -2979,10 +2975,7 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_)
case nir_op_ishr:
case nir_op_ushr:
case nir_op_ishl:
case nir_op_iadd:
case nir_op_uadd_sat:
case nir_op_isub:
case nir_op_imul:
return (bit_size == 8 ||
!(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 : 0;
default: