aco: remove DPP when applying constants/literals/sgprs
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12601>
This commit is contained in:
parent
7d95f7510f
commit
33ddbd220f
|
@@ -290,6 +290,12 @@ asSDWA(Format format)
|
|||
return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
|
||||
}
|
||||
|
||||
/* Returns `format` with every bit of Format::DPP cleared; all other bits of
 * `format` are passed through unchanged. */
constexpr Format
|
||||
withoutDPP(Format format)
|
||||
{
|
||||
return (Format)((uint32_t)format & ~(uint32_t)Format::DPP);
|
||||
}
|
||||
|
||||
enum class RegType {
|
||||
none = 0,
|
||||
sgpr,
|
||||
|
|
|
@@ -560,10 +560,11 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsi
|
|||
return true;
|
||||
}
|
||||
|
||||
/* This expects the DPP modifier to be removed. */
|
||||
bool
|
||||
can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP())
|
||||
if (instr->isSDWA() && ctx.program->chip_class < GFX9)
|
||||
return false;
|
||||
return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
||||
instr->opcode != aco_opcode::v_readlane_b32 &&
|
||||
|
@@ -1010,6 +1011,7 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
/* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
|
||||
if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
|
||||
instr->operands.size() == 1) {
|
||||
instr->format = withoutDPP(instr->format);
|
||||
instr->operands[i].setTemp(info.temp);
|
||||
info = ctx.info[info.temp.id()];
|
||||
}
|
||||
|
@@ -1058,13 +1060,14 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
|
||||
unsigned bits = get_operand_size(instr, i);
|
||||
if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&
|
||||
(!instr->isSDWA() || ctx.program->chip_class >= GFX9) && !instr->isDPP()) {
|
||||
(!instr->isSDWA() || ctx.program->chip_class >= GFX9)) {
|
||||
Operand op = get_constant_op(ctx, info, bits);
|
||||
perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
|
||||
"v_cndmask_b32 with a constant selector", instr.get());
|
||||
if (i == 0 || instr->isSDWA() || instr->isVOP3P() ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32) {
|
||||
instr->format = withoutDPP(instr->format);
|
||||
instr->operands[i] = op;
|
||||
continue;
|
||||
} else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
|
||||
|
@@ -2740,6 +2743,9 @@ apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
if (new_sgpr && num_sgprs >= max_sgprs)
|
||||
continue;
|
||||
|
||||
if (sgpr_idx == 0)
|
||||
instr->format = withoutDPP(instr->format);
|
||||
|
||||
if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
|
||||
info.is_extract()) {
|
||||
/* can_apply_extract() checks SGPR encoding restrictions */
|
||||
|
@@ -3734,7 +3740,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
}
|
||||
}
|
||||
|
||||
if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
|
||||
if (instr->isSDWA() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
|
||||
(instr->isVOP3P() && ctx.program->chip_class < GFX10))
|
||||
return; /* some encodings can't ever take literals */
|
||||
|
||||
|
@@ -3858,6 +3864,7 @@ apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
unsigned bits = get_operand_size(instr, i);
|
||||
if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
|
||||
Operand literal = Operand::c32(ctx.info[op.tempId()].val);
|
||||
instr->format = withoutDPP(instr->format);
|
||||
if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
|
||||
to_VOP3(ctx, instr);
|
||||
instr->operands[i] = literal;
|
||||
|
|
|
@@ -1037,23 +1037,40 @@ BEGIN_TEST(optimize.dpp_prop)
|
|||
if (!setup_cs("v1 s1", GFX10))
|
||||
return;
|
||||
|
||||
//! v1: %zero = p_parallelcopy 0
|
||||
//! v1: %res0 = v_mul_f32 %zero, %a row_shl:1 bound_ctrl:1
|
||||
//! v1: %one = p_parallelcopy 1
|
||||
//! v1: %res0 = v_mul_f32 1, %a
|
||||
//! p_unit_test 0, %res0
|
||||
Temp zero = bld.copy(bld.def(v1), Operand::zero());
|
||||
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), zero, inputs[0], dpp_row_sl(1)));
|
||||
Temp one = bld.copy(bld.def(v1), Operand::c32(1));
|
||||
writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
|
||||
|
||||
//! v1: %literal = p_parallelcopy 0x12345678
|
||||
//! v1: %res1 = v_mul_f32 %literal, %a row_shl:1 bound_ctrl:1
|
||||
//! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
|
||||
//! p_unit_test 1, %res1
|
||||
Temp literal = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
|
||||
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal, inputs[0], dpp_row_sl(1)));
|
||||
writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
|
||||
|
||||
//! v1: %res2 = v_mul_f32 0x12345678, %a
|
||||
//! p_unit_test 2, %res2
|
||||
Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
|
||||
writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
|
||||
|
||||
//! v1: %literal2 = p_parallelcopy 0x12345679
|
||||
//! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
|
||||
//! p_unit_test 3, %res3
|
||||
Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
|
||||
writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
|
||||
|
||||
//! v1: %b_v = p_parallelcopy %b
|
||||
//! v1: %res2 = v_mul_f32 %b_v, %a row_shl:1 bound_ctrl:1
|
||||
//! p_unit_test 2, %res2
|
||||
//! v1: %res4 = v_mul_f32 %b, %a
|
||||
//! p_unit_test 4, %res4
|
||||
Temp b_v = bld.copy(bld.def(v1), inputs[1]);
|
||||
writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
|
||||
writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
|
||||
|
||||
//! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
|
||||
//! p_unit_test 5, %res5
|
||||
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
|
||||
|
||||
//! v1: %res6 = v_rcp_f32 %b
|
||||
//! p_unit_test 6, %res6
|
||||
writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
|
||||
|
||||
finish_opt_test();
|
||||
END_TEST
|
||||
|
|
Loading…
Reference in New Issue