From 786828131a7c72ae1f9a21159255464ac7f4ae8b Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 8 Jul 2020 19:19:43 +0100 Subject: [PATCH] aco: implement 8/16-bit instructions which can be trivially widened MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When nir_lower_bit_size becomes more capable, we might want to revert some of this. fossil-db (parallel-rdp, Navi): Totals from 217 (31.77% of 683) affected shaders: SGPRs: 11320 -> 10200 (-9.89%) VGPRs: 7156 -> 7364 (+2.91%) CodeSize: 1453948 -> 1430136 (-1.64%); split: -1.66%, +0.02% Instrs: 258530 -> 254840 (-1.43%); split: -1.44%, +0.01% Cycles: 37334360 -> 37247936 (-0.23%); split: -0.26%, +0.03% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 34 ++++++++++--------- src/amd/vulkan/radv_pipeline.c | 7 ---- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index b2928c239b3..d1b7da5b5d0 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -1224,7 +1224,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) /* Don't use s_andn2 here, this allows the optimizer to make a better decision */ Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm)); - } else if (dst.regClass() == v1) { + } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); } else if (dst.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); @@ -1365,7 +1365,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ior: { if (instr->dest.dest.ssa.bit_size == 1) { emit_boolean_logic(ctx, instr, Builder::s_or, dst); - } else if (dst.regClass() == v1) { + } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); } else if (dst.regClass() == v2) { emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst); @@ -1381,7 +1381,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_iand: { if (instr->dest.dest.ssa.bit_size == 1) { emit_boolean_logic(ctx, instr, Builder::s_and, dst); - } else if (dst.regClass() == v1) { + } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); } else if (dst.regClass() == v2) { emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst); @@ -1397,7 +1397,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ixor: { if (instr->dest.dest.ssa.bit_size == 1) { emit_boolean_logic(ctx, instr, Builder::s_xor, dst); - } else if (dst.regClass() == v1) { + } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); } else if (dst.regClass() == v2) { emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst); @@ -1527,17 +1527,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true); break; - } else if (dst.regClass() == v2b && ctx->program->chip_class < GFX10) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true); - break; - } else if (dst.regClass() == v2b) { + } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst); break; + } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true); + break; } Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); - if (dst.regClass() == v1) { + if (dst.type() == RegType::vgpr && dst.bytes() <= 4) { bld.vadd32(Definition(dst), Operand(src0), Operand(src1)); break; } @@ -1649,13 +1649,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (dst.regClass() == v1) { bld.vsub32(Definition(dst), src0, src1); break; - } else if (dst.regClass() == v2b) { + } else if (dst.bytes() <= 2) { if (ctx->program->chip_class >= GFX10) bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1); else if (src1.type() == RegType::sgpr) bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0)); - else + else if (ctx->program->chip_class >= GFX8) bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1)); + else + bld.vsub32(Definition(dst), src0, src1); break; } @@ -1714,7 +1716,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_imul: { - if (dst.regClass() == v1) { + if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) { + emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst); + } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) { + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true); + } else if (dst.type() == RegType::vgpr) { uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); @@ -1723,10 +1729,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else { emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst); } - } else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { - emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst); - } else if (dst.regClass() == v2b) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true); } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false); } else { diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index f57c54c2a04..c917b11c643 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -2957,7 +2957,6 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_) unsigned bit_size = alu->dest.dest.ssa.bit_size; switch (alu->op) { case nir_op_iabs: - case nir_op_iand: case nir_op_bitfield_select: case nir_op_udiv: case nir_op_idiv: @@ -2966,11 +2965,8 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_) case nir_op_imul_high: case nir_op_umul_high: case nir_op_ineg: - case nir_op_inot: - case nir_op_ior: case nir_op_irem: case nir_op_isign: - case nir_op_ixor: return 32; case nir_op_imax: case nir_op_umax: @@ -2979,10 +2975,7 @@ lower_bit_size_callback(const nir_alu_instr *alu, void *_) case nir_op_ishr: case nir_op_ushr: case nir_op_ishl: - case nir_op_iadd: case nir_op_uadd_sat: - case nir_op_isub: - case nir_op_imul: return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 : 0; default: