aco: remove DPP when applying constants/literals/sgprs

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12601>
This commit is contained in:
Rhys Perry 2021-08-30 10:30:45 +01:00 committed by Marge Bot
parent 7d95f7510f
commit 33ddbd220f
3 changed files with 44 additions and 14 deletions

View File

@ -290,6 +290,12 @@ asSDWA(Format format)
return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
}
/* Returns `format` with all bits of Format::DPP cleared; every other bit of
 * the encoding is left untouched. */
constexpr Format
withoutDPP(Format format)
{
return static_cast<Format>(static_cast<uint32_t>(format) &
                           ~static_cast<uint32_t>(Format::DPP));
}
enum class RegType {
none = 0,
sgpr,

View File

@ -560,10 +560,11 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsi
return true;
}
/* This does not check for DPP: the caller is expected to remove the DPP modifier when it applies SGPRs. */
bool
can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP())
if (instr->isSDWA() && ctx.program->chip_class < GFX9)
return false;
return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
instr->opcode != aco_opcode::v_readlane_b32 &&
@ -1010,6 +1011,7 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
instr->operands.size() == 1) {
instr->format = withoutDPP(instr->format);
instr->operands[i].setTemp(info.temp);
info = ctx.info[info.temp.id()];
}
@ -1058,13 +1060,14 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
unsigned bits = get_operand_size(instr, i);
if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&
(!instr->isSDWA() || ctx.program->chip_class >= GFX9) && !instr->isDPP()) {
(!instr->isSDWA() || ctx.program->chip_class >= GFX9)) {
Operand op = get_constant_op(ctx, info, bits);
perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
"v_cndmask_b32 with a constant selector", instr.get());
if (i == 0 || instr->isSDWA() || instr->isVOP3P() ||
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32) {
instr->format = withoutDPP(instr->format);
instr->operands[i] = op;
continue;
} else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
@ -2740,6 +2743,9 @@ apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (new_sgpr && num_sgprs >= max_sgprs)
continue;
if (sgpr_idx == 0)
instr->format = withoutDPP(instr->format);
if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
info.is_extract()) {
/* can_apply_extract() checks SGPR encoding restrictions */
@ -3734,7 +3740,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
if (instr->isSDWA() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
(instr->isVOP3P() && ctx.program->chip_class < GFX10))
return; /* some encodings can't ever take literals */
@ -3858,6 +3864,7 @@ apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
unsigned bits = get_operand_size(instr, i);
if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
Operand literal = Operand::c32(ctx.info[op.tempId()].val);
instr->format = withoutDPP(instr->format);
if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
to_VOP3(ctx, instr);
instr->operands[i] = literal;

View File

@ -1037,23 +1037,40 @@ BEGIN_TEST(optimize.dpp_prop)
if (!setup_cs("v1 s1", GFX10))
return;
//! v1: %zero = p_parallelcopy 0
//! v1: %res0 = v_mul_f32 %zero, %a row_shl:1 bound_ctrl:1
//! v1: %one = p_parallelcopy 1
//! v1: %res0 = v_mul_f32 1, %a
//! p_unit_test 0, %res0
Temp zero = bld.copy(bld.def(v1), Operand::zero());
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), zero, inputs[0], dpp_row_sl(1)));
Temp one = bld.copy(bld.def(v1), Operand::c32(1));
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
//! v1: %literal = p_parallelcopy 0x12345678
//! v1: %res1 = v_mul_f32 %literal, %a row_shl:1 bound_ctrl:1
//! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
//! p_unit_test 1, %res1
Temp literal = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal, inputs[0], dpp_row_sl(1)));
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
//! v1: %res2 = v_mul_f32 0x12345678, %a
//! p_unit_test 2, %res2
Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
//! v1: %literal2 = p_parallelcopy 0x12345679
//! v1: %res3 = v_mul_f32 %a, %literal2 row_shl:1 bound_ctrl:1
//! p_unit_test 3, %res3
Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
//! v1: %b_v = p_parallelcopy %b
//! v1: %res2 = v_mul_f32 %b_v, %a row_shl:1 bound_ctrl:1
//! p_unit_test 2, %res2
//! v1: %res4 = v_mul_f32 %b, %a
//! p_unit_test 4, %res4
Temp b_v = bld.copy(bld.def(v1), inputs[1]);
writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
//! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
//! p_unit_test 5, %res5
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
//! v1: %res6 = v_rcp_f32 %b
//! p_unit_test 6, %res6
writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
finish_opt_test();
END_TEST