diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 0746a7bd85c..fe591c7b75c 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -1468,7 +1468,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
       } else if (dst.regClass() == v2b) {
          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
       } else if (dst.regClass() == v1) {
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, false, 1);
       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 1deef785870..651fc272517 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -2682,7 +2682,8 @@ bool combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    return false;
 }
 
-/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c) */
+/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
+ * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c) */
 bool combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 {
    if (instr->usesModifiers())
@@ -2693,19 +2694,28 @@ bool combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (!op_instr)
          continue;
 
-      if (op_instr->opcode != aco_opcode::s_lshl_b32)
+      if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
+          op_instr->opcode != aco_opcode::v_lshlrev_b32)
          continue;
 
-      if (op_instr->operands[1].isConstant() &&
-          op_instr->operands[1].constantValue() <= 6 && /* no literals */
-          (op_instr->operands[0].is24bit() ||
-           op_instr->operands[0].is16bit())) {
-         uint32_t multiplier = 1 << op_instr->operands[1].constantValue();
+      if (op_instr->opcode == aco_opcode::v_lshlrev_b32 &&
+          op_instr->operands[1].isTemp() &&
+          op_instr->operands[1].getTemp().type() == RegType::sgpr &&
+          instr->operands[!i].isTemp() &&
+          instr->operands[!i].getTemp().type() == RegType::sgpr)
+         return false;
+
+      int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
+      if (op_instr->operands[shift_op_idx].isConstant() &&
+          op_instr->operands[shift_op_idx].constantValue() <= 6 && /* no literals */
+          (op_instr->operands[!shift_op_idx].is24bit() ||
+           op_instr->operands[!shift_op_idx].is16bit())) {
+         uint32_t multiplier = 1 << op_instr->operands[shift_op_idx].constantValue();
          ctx.uses[instr->operands[i].tempId()]--;
 
          aco_ptr<VOP3A_instruction> new_instr{create_instruction<VOP3A_instruction>(
            aco_opcode::v_mad_u32_u24, Format::VOP3A, 3, 1)};
-         new_instr->operands[0] = op_instr->operands[0];
+         new_instr->operands[0] = op_instr->operands[!shift_op_idx];
          new_instr->operands[1] = Operand(multiplier);
          new_instr->operands[2] = instr->operands[!i];
          new_instr->definitions[0] = instr->definitions[0];
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 0520da2580e..da00fce832f 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -795,3 +795,63 @@ BEGIN_TEST(optimize.mad_32_24)
    finish_opt_test();
 }
 END_TEST
+
+BEGIN_TEST(optimize.add_lshlrev)
+   for (unsigned i = GFX8; i <= GFX10; i++) {
+      //>> v1: %a, v1: %b, s1: %c, s2: %_:exec = p_startpgm
+      if (!setup_cs("v1 v1 s1", (chip_class)i))
+         continue;
+
+      Temp lshl;
+
+      //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
+      //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
+      //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
+      //! p_unit_test 0, %res0
+      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), Operand(inputs[0]));
+      writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
+
+      //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
+      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
+      //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
+      //! p_unit_test 1, %res1
+      Operand a_24bit = Operand(inputs[0]);
+      a_24bit.set24bit(true);
+      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), a_24bit);
+      writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
+
+      //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
+      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
+      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
+      //! p_unit_test 2, %res2
+      Operand b_24bit = Operand(inputs[1]);
+      b_24bit.set24bit(true);
+      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
+      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
+
+      //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
+      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
+      //! p_unit_test 3, %res3
+      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), a_24bit);
+      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
+
+      //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
+      //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
+      //! p_unit_test 4, %res4
+      Operand a_16bit = Operand(inputs[0]);
+      a_16bit.set16bit(true);
+      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(4u), a_16bit);
+      writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
+
+      //~gfx8! v1: %lshl5 = v_lshlrev_b32 4, (is24bit)%c
+      //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %c, %lshl5
+      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
+      //! p_unit_test 5, %res5
+      Operand c_24bit = Operand(inputs[2]);
+      c_24bit.set24bit(true);
+      lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(4u), c_24bit);
+      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));
+
+      finish_opt_test();
+   }
+END_TEST
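
Note (not part of the patch): the combine exercised above rests on the identity (b << a) + c == b * (1 << a) + c, which is what v_mad_u32_u24 computes when the multiplied operand fits in 24 bits; capping the shift at 6 keeps the multiplier 1 << a at or below 64, within the inline-constant range, so no literal operand is needed (hence the "/* no literals */" comment). Below is a minimal host-side C++ sketch of that identity under those assumptions; mad_u32_u24_model is an illustrative name, not an ACO helper.

#include <cassert>
#include <cstdint>

/* Host-side model of the rewrite: (b << shift) + c is replaced by
 * b * (1 << shift) + c.  Assumes b fits in 24 bits and shift <= 6,
 * so the multiplier stays at or below 64. */
static uint32_t mad_u32_u24_model(uint32_t b, uint32_t shift, uint32_t c)
{
   assert(shift <= 6);     /* multiplier must stay an inline constant */
   assert(b < (1u << 24)); /* operand must really be 24-bit */
   uint32_t multiplier = 1u << shift;
   return b * multiplier + c; /* wraps mod 2^32, same as the add */
}

int main()
{
   uint32_t b = 0x123456;    /* arbitrary 24-bit value */
   uint32_t c = 0xdeadbeefu; /* arbitrary addend */
   for (uint32_t shift = 0; shift <= 6; shift++)
      assert(((b << shift) + c) == mad_u32_u24_model(b, shift, c));
   return 0;
}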