From 33ddbd220f26391fd117f484f6b566d17d942091 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 30 Aug 2021 10:30:45 +0100 Subject: [PATCH] aco: remove DPP when applying constants/literals/sgprs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Timur Kristóf Part-of: --- src/amd/compiler/aco_ir.h | 6 ++++ src/amd/compiler/aco_optimizer.cpp | 13 ++++++-- src/amd/compiler/tests/test_optimizer.cpp | 39 ++++++++++++++++------- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 2b0f7d34550..c96dcce7892 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -290,6 +290,12 @@ asSDWA(Format format) return (Format)((uint32_t)Format::SDWA | (uint32_t)format); } +constexpr Format +withoutDPP(Format format) +{ + return (Format)((uint32_t)format & ~(uint32_t)Format::DPP); +} + enum class RegType { none = 0, sgpr, diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 634c0939ea1..27c993e83fc 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -560,10 +560,11 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, Temp temp, unsi return true; } +/* This expects the DPP modifier to be removed. 
*/ bool can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) { - if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP()) + if (instr->isSDWA() && ctx.program->chip_class < GFX9) return false; return instr->opcode != aco_opcode::v_readfirstlane_b32 && instr->opcode != aco_opcode::v_readlane_b32 && @@ -1010,6 +1011,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */ if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && instr->operands.size() == 1) { + instr->format = withoutDPP(instr->format); instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } @@ -1058,13 +1060,14 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) unsigned bits = get_operand_size(instr, i); if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) && - (!instr->isSDWA() || ctx.program->chip_class >= GFX9) && !instr->isDPP()) { + (!instr->isSDWA() || ctx.program->chip_class >= GFX9)) { Operand op = get_constant_op(ctx, info, bits); perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); if (i == 0 || instr->isSDWA() || instr->isVOP3P() || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { + instr->format = withoutDPP(instr->format); instr->operands[i] = op; continue; } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) { @@ -2740,6 +2743,9 @@ apply_sgprs(opt_ctx& ctx, aco_ptr& instr) if (new_sgpr && num_sgprs >= max_sgprs) continue; + if (sgpr_idx == 0) + instr->format = withoutDPP(instr->format); + if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() || info.is_extract()) { /* can_apply_extract() checks SGPR encoding restrictions */ @@ -3734,7 +3740,7 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) } } - if (instr->isSDWA() || instr->isDPP() || 
(instr->isVOP3() && ctx.program->chip_class < GFX10) || + if (instr->isSDWA() || (instr->isVOP3() && ctx.program->chip_class < GFX10) || (instr->isVOP3P() && ctx.program->chip_class < GFX10)) return; /* some encodings can't ever take literals */ @@ -3858,6 +3864,7 @@ apply_literals(opt_ctx& ctx, aco_ptr& instr) unsigned bits = get_operand_size(instr, i); if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) { Operand literal = Operand::c32(ctx.info[op.tempId()].val); + instr->format = withoutDPP(instr->format); if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P) to_VOP3(ctx, instr); instr->operands[i] = literal; diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index ce4e925b779..9609fea4f2b 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -1037,23 +1037,40 @@ BEGIN_TEST(optimize.dpp_prop) if (!setup_cs("v1 s1", GFX10)) return; - //! v1: %zero = p_parallelcopy 0 - //! v1: %res0 = v_mul_f32 %zero, %a row_shl:1 bound_ctrl:1 + //! v1: %one = p_parallelcopy 1 + //! v1: %res0 = v_mul_f32 1, %a //! p_unit_test 0, %res0 - Temp zero = bld.copy(bld.def(v1), Operand::zero()); - writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), zero, inputs[0], dpp_row_sl(1))); + Temp one = bld.copy(bld.def(v1), Operand::c32(1)); + writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1))); - //! v1: %literal = p_parallelcopy 0x12345678 - //! v1: %res1 = v_mul_f32 %literal, %a row_shl:1 bound_ctrl:1 + //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1 //! p_unit_test 1, %res1 - Temp literal = bld.copy(bld.def(v1), Operand::c32(0x12345678u)); - writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal, inputs[0], dpp_row_sl(1))); + writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1))); + + //! v1: %res2 = v_mul_f32 0x12345678, %a + //! 
p_unit_test 2, %res2 + Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u)); + writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1))); + + //! v1: %literal2 = p_parallelcopy 0x12345679 + //! v1: %res3 = v_mul_f32 %a, %literal2 row_shl:1 bound_ctrl:1 + //! p_unit_test 3, %res3 + Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u)); + writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1))); //! v1: %b_v = p_parallelcopy %b - //! v1: %res2 = v_mul_f32 %b_v, %a row_shl:1 bound_ctrl:1 - //! p_unit_test 2, %res2 + //! v1: %res4 = v_mul_f32 %b, %a + //! p_unit_test 4, %res4 Temp b_v = bld.copy(bld.def(v1), inputs[1]); - writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1))); + writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1))); + + //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1 + //! p_unit_test 5, %res5 + writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1))); + + //! v1: %res6 = v_rcp_f32 %b + //! p_unit_test 6, %res6 + writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1))); finish_opt_test(); END_TEST