From 15e25da3e577745f5269f941d34da79a60cd144e Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 20 Nov 2019 16:42:17 +0000 Subject: [PATCH] aco: take advantage of GFX10's constant bus limit and VOP3 literals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pipeline-db (Navi): Totals from affected shaders: SGPRS: 2397159 -> 2392494 (-0.19 %) VGPRS: 1756036 -> 1753920 (-0.12 %) Spilled SGPRs: 461 -> 470 (1.95 %) Spilled VGPRs: 0 -> 0 (0.00 %) Code Size: 110287304 -> 109946304 (-0.31 %) bytes Max Waves: 318341 -> 318475 (0.04 %) Instructions: 21019327 -> 20533618 (-2.31 %) pipeline-db (Vega): Totals from affected shaders: SGPRS: 0 -> 0 (0.00 %) VGPRS: 0 -> 0 (0.00 %) Spilled SGPRs: 0 -> 0 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Code Size: 0 -> 0 (0.00 %) bytes Max Waves: 0 -> 0 (0.00 %) Instructions: 0 -> 0 (0.00 %) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 64 +++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 88075fabfb2..c6c8931a426 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -414,12 +414,12 @@ bool can_swap_operands(aco_ptr& instr) } } -bool can_use_VOP3(aco_ptr& instr) +bool can_use_VOP3(opt_ctx& ctx, aco_ptr& instr) { if (instr->isVOP3()) return true; - if (instr->operands.size() && instr->operands[0].isLiteral()) + if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10) return false; if (instr->isDPP() || instr->isSDWA()) @@ -452,7 +452,6 @@ void to_VOP3(opt_ctx& ctx, aco_ptr& instr) if (instr->isVOP3()) return; - assert(!instr->operands[0].isLiteral()); aco_ptr tmp = std::move(instr); Format format = asVOP3(tmp->format); instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); @@ -508,7 +507,9 @@ bool valu_can_accept_vgpr(aco_ptr& instr, unsigned operand) /* check constant bus and literal limitations */ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) { - int limit = 1; + int limit = ctx.program->chip_class >= GFX10 ? 2 : 1; + Operand literal32(s1); + Operand literal64(s2); unsigned num_sgprs = 0; unsigned sgpr[] = {0, 0}; @@ -525,7 +526,26 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) return false; } } else if (op.isLiteral()) { - return false; + if (ctx.program->chip_class < GFX10) + return false; + + if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue()) + return false; + if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue()) + return false; + + /* Any number of 32-bit literals counts as only 1 to the limit. Same + * (but separately) for 64-bit literals. */ + if (op.size() == 1 && literal32.isUndefined()) { + limit--; + literal32 = op; + } else if (op.size() == 2 && literal64.isUndefined()) { + limit--; + literal64 = op; + } + + if (limit < 0) + return false; } } @@ -650,7 +670,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } - if (info.is_abs() && (can_use_VOP3(instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + if (info.is_abs() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { if (!instr->isDPP()) to_VOP3(ctx, instr); instr->operands[i] = Operand(info.temp); @@ -663,7 +683,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32; instr->operands[i].setTemp(info.temp); continue; - } else if (info.is_neg() && (can_use_VOP3(instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { + } else if (info.is_neg() && (can_use_VOP3(ctx, instr) || instr->isDPP()) && instr_info.can_use_input_modifiers[(int)instr->opcode]) { if (!instr->isDPP()) to_VOP3(ctx, instr); instr->operands[i].setTemp(info.temp); @@ -682,7 +702,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[i] = instr->operands[0]; instr->operands[0] = get_constant_op(ctx, info.val); continue; - } else if (can_use_VOP3(instr)) { + } else if (can_use_VOP3(ctx, instr)) { to_VOP3(ctx, instr); instr->operands[i] = get_constant_op(ctx, info.val); continue; @@ -1234,8 +1254,8 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) if (op[1].type() == RegType::sgpr) std::swap(op[0], op[1]); - //TODO: we can use two different SGPRs on GFX10 - if (op[0].type() == RegType::sgpr && op[1].type() == RegType::sgpr) + unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr); + if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1)) return false; ctx.uses[op[0].id()]++; @@ -1245,7 +1265,7 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) aco_opcode new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; Instruction *new_instr; - if (neg[0] || neg[1] || abs[0] || abs[1] || opsel) { + if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) { VOP3A_instruction *vop3 = create_instruction(new_op, asVOP3(Format::VOPC), 2, 1); for (unsigned i = 0; i < 2; i++) { vop3->neg[i] = neg[i]; @@ -1895,6 +1915,10 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) { + bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || + instr->opcode == aco_opcode::v_lshrrev_b64 || + instr->opcode == aco_opcode::v_ashrrev_i64; + /* find candidates and create the set of sgprs already read */ unsigned sgpr_ids[2] = {0, 0}; uint32_t operand_mask = 0; @@ -1913,6 +1937,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) operand_mask |= 1u << i; } unsigned max_sgprs = 1; + if (ctx.program->chip_class >= GFX10 && !is_shift64) + max_sgprs = 2; if (has_literal) max_sgprs--; @@ -1953,7 +1979,7 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) /* swap bits using a 4-entry LUT */ uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf; operand_mask = (operand_mask & ~0x3) | swapped; - } else if (can_use_VOP3(instr)) { + } else if (can_use_VOP3(ctx, instr)) { to_VOP3(ctx, instr); instr->operands[sgpr_idx] = Operand(sgpr); } else { @@ -2041,7 +2067,7 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 && - can_use_VOP3(instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) { + can_use_VOP3(ctx, instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) { ssa_info& def_info = ctx.info[instr->definitions[0].tempId()]; if (can_use_omod && def_info.is_omod2() && ctx.uses[def_info.temp.id()]) { to_VOP3(ctx, instr); @@ -2376,7 +2402,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) if (!instr->isSALU() && !instr->isVALU()) return; - if (instr->isSDWA() || instr->isDPP() || instr->isVOP3()) + if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10)) return; /* some encodings can't ever take literals */ /* we do not apply the literals yet as we don't know if it is profitable */ @@ -2385,7 +2411,9 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) unsigned literal_id = 0; unsigned literal_uses = UINT32_MAX; Operand literal(s1); - unsigned num_operands = instr->isSALU() ? instr->operands.size() : 1; + unsigned num_operands = 1; + if (instr->isSALU() || (ctx.program->chip_class >= GFX10 && can_use_VOP3(ctx, instr))) + num_operands = instr->operands.size(); unsigned sgpr_ids[2] = {0, 0}; bool is_literal_sgpr = false; @@ -2420,7 +2448,13 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) /* don't go over the constant bus limit */ + bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 || + instr->opcode == aco_opcode::v_lshrrev_b64 || + instr->opcode == aco_opcode::v_ashrrev_i64; unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX; + if (ctx.program->chip_class >= GFX10 && !is_shift64) + const_bus_limit = 2; + unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1]; if (num_sgprs == const_bus_limit && !is_literal_sgpr) return;