aco: implement 8/16-bit instructions which can be trivially widened
When nir_lower_bit_size becomes more capable, we might want to revert some of this.

fossil-db (parallel-rdp, Navi):
Totals from 217 (31.77% of 683) affected shaders:
SGPRs: 11320 -> 10200 (-9.89%)
VGPRs: 7156 -> 7364 (+2.91%)
CodeSize: 1453948 -> 1430136 (-1.64%); split: -1.66%, +0.02%
Instrs: 258530 -> 254840 (-1.43%); split: -1.44%, +0.01%
Cycles: 37334360 -> 37247936 (-0.23%); split: -0.26%, +0.03%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4791>
This commit is contained in:
parent
ef95ba8cdd
commit
786828131a
|
@ -1224,7 +1224,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
/* Don't use s_andn2 here, this allows the optimizer to make a better decision */
|
||||
Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
|
||||
bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
|
||||
} else if (dst.regClass() == v1) {
|
||||
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
|
||||
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
|
||||
} else if (dst.regClass() == v2) {
|
||||
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
||||
|
@ -1365,7 +1365,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
case nir_op_ior: {
|
||||
if (instr->dest.dest.ssa.bit_size == 1) {
|
||||
emit_boolean_logic(ctx, instr, Builder::s_or, dst);
|
||||
} else if (dst.regClass() == v1) {
|
||||
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
|
||||
} else if (dst.regClass() == v2) {
|
||||
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
|
||||
|
@ -1381,7 +1381,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
case nir_op_iand: {
|
||||
if (instr->dest.dest.ssa.bit_size == 1) {
|
||||
emit_boolean_logic(ctx, instr, Builder::s_and, dst);
|
||||
} else if (dst.regClass() == v1) {
|
||||
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
|
||||
} else if (dst.regClass() == v2) {
|
||||
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
|
||||
|
@ -1397,7 +1397,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
case nir_op_ixor: {
|
||||
if (instr->dest.dest.ssa.bit_size == 1) {
|
||||
emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
|
||||
} else if (dst.regClass() == v1) {
|
||||
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
|
||||
} else if (dst.regClass() == v2) {
|
||||
emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
|
||||
|
@ -1527,17 +1527,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
if (dst.regClass() == s1) {
|
||||
emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
|
||||
break;
|
||||
} else if (dst.regClass() == v2b && ctx->program->chip_class < GFX10) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
|
||||
break;
|
||||
} else if (dst.regClass() == v2b) {
|
||||
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
|
||||
emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
|
||||
break;
|
||||
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
|
||||
break;
|
||||
}
|
||||
|
||||
Temp src0 = get_alu_src(ctx, instr->src[0]);
|
||||
Temp src1 = get_alu_src(ctx, instr->src[1]);
|
||||
if (dst.regClass() == v1) {
|
||||
if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
|
||||
bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
|
||||
break;
|
||||
}
|
||||
|
@ -1649,13 +1649,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
if (dst.regClass() == v1) {
|
||||
bld.vsub32(Definition(dst), src0, src1);
|
||||
break;
|
||||
} else if (dst.regClass() == v2b) {
|
||||
} else if (dst.bytes() <= 2) {
|
||||
if (ctx->program->chip_class >= GFX10)
|
||||
bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
|
||||
else if (src1.type() == RegType::sgpr)
|
||||
bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
|
||||
else
|
||||
else if (ctx->program->chip_class >= GFX8)
|
||||
bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
|
||||
else
|
||||
bld.vsub32(Definition(dst), src0, src1);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1714,7 +1716,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
break;
|
||||
}
|
||||
case nir_op_imul: {
|
||||
if (dst.regClass() == v1) {
|
||||
if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
|
||||
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
|
||||
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
|
||||
} else if (dst.type() == RegType::vgpr) {
|
||||
uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
|
||||
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
|
||||
|
||||
|
@ -1723,10 +1729,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
|
|||
} else {
|
||||
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
|
||||
}
|
||||
} else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
|
||||
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
|
||||
} else if (dst.regClass() == v2b) {
|
||||
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
|
||||
} else if (dst.regClass() == s1) {
|
||||
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
|
||||
} else {
|
||||
|
|
|
@ -2957,7 +2957,6 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_)
|
|||
unsigned bit_size = alu->dest.dest.ssa.bit_size;
|
||||
switch (alu->op) {
|
||||
case nir_op_iabs:
|
||||
case nir_op_iand:
|
||||
case nir_op_bitfield_select:
|
||||
case nir_op_udiv:
|
||||
case nir_op_idiv:
|
||||
|
@ -2966,11 +2965,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_)
|
|||
case nir_op_imul_high:
|
||||
case nir_op_umul_high:
|
||||
case nir_op_ineg:
|
||||
case nir_op_inot:
|
||||
case nir_op_ior:
|
||||
case nir_op_irem:
|
||||
case nir_op_isign:
|
||||
case nir_op_ixor:
|
||||
return 32;
|
||||
case nir_op_imax:
|
||||
case nir_op_umax:
|
||||
|
@ -2979,10 +2975,7 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_)
|
|||
case nir_op_ishr:
|
||||
case nir_op_ushr:
|
||||
case nir_op_ishl:
|
||||
case nir_op_iadd:
|
||||
case nir_op_uadd_sat:
|
||||
case nir_op_isub:
|
||||
case nir_op_imul:
|
||||
return (bit_size == 8 ||
|
||||
!(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 : 0;
|
||||
default:
|
||||
|
|
Loading…
Reference in New Issue