aco: always use p_parallelcopy for pre-RA copies

Most fossil-db changes are because literals are applied earlier
(in label_instruction), so use counts are more accurate and more literals
are applied.

fossil-db (Navi):
Totals from 79551 (57.89% of 137413) affected shaders:
SGPRs: 4549610 -> 4542802 (-0.15%); split: -0.19%, +0.04%
VGPRs: 3326764 -> 3324172 (-0.08%); split: -0.10%, +0.03%
SpillSGPRs: 38886 -> 34562 (-11.12%); split: -11.14%, +0.02%
CodeSize: 240143456 -> 240001008 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 1078919 -> 1079281 (+0.03%); split: +0.04%, -0.01%
Instrs: 46627073 -> 46528490 (-0.21%); split: -0.22%, +0.01%

fossil-db (Polaris):
Totals from 98463 (70.90% of 138881) affected shaders:
SGPRs: 5164689 -> 5164353 (-0.01%); split: -0.02%, +0.01%
VGPRs: 3920936 -> 3921856 (+0.02%); split: -0.00%, +0.03%
SpillSGPRs: 56298 -> 52259 (-7.17%); split: -7.22%, +0.04%
CodeSize: 258680092 -> 258692712 (+0.00%); split: -0.02%, +0.03%
MaxWaves: 620863 -> 620823 (-0.01%); split: +0.00%, -0.01%
Instrs: 50776289 -> 50757577 (-0.04%); split: -0.04%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7216>
This commit is contained in:
Rhys Perry 2020-10-14 13:50:24 +01:00 committed by Marge Bot
parent 6db5fbf9f2
commit e54c111c45
6 changed files with 13 additions and 84 deletions

View File

@ -85,8 +85,6 @@ ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
aco_ptr<Instruction> create_s_mov(Definition dst, Operand src);
extern uint8_t int8_mul_table[512];
enum sendmsg {
sendmsg_none = 0,
_sendmsg_gs = 2,
@ -386,82 +384,8 @@ public:
return v_mul_imm(dst, tmp, imm, true);
}
Result copy(Definition dst, Op op_) {
Operand op = op_.op;
assert(op.bytes() == dst.bytes());
if (dst.regClass() == s1 && op.size() == 1 && op.isLiteral()) {
uint32_t imm = op.constantValue();
if (imm == 0x3e22f983) {
if (program->chip_class >= GFX8)
op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
} else if (imm >= 0xffff8000 || imm <= 0x7fff) {
return sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
} else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
uint32_t rev = util_bitreverse(imm);
return dst.regClass() == v1 ?
vop1(aco_opcode::v_bfrev_b32, dst, Operand(rev)) :
sop1(aco_opcode::s_brev_b32, dst, Operand(rev));
} else if (imm != 0) {
unsigned start = (ffs(imm) - 1) & 0x1f;
unsigned size = util_bitcount(imm) & 0x1f;
if ((((1u << size) - 1u) << start) == imm)
return sop2(aco_opcode::s_bfm_b32, dst, Operand(size), Operand(start));
}
}
if (dst.regClass() == s1) {
return sop1(aco_opcode::s_mov_b32, dst, op);
} else if (dst.regClass() == s2) {
return sop1(aco_opcode::s_mov_b64, dst, op);
} else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) {
return vop1(aco_opcode::v_mov_b32, dst, op);
} else if (op.bytes() > 2 || (op.isLiteral() && dst.regClass().is_subdword())) {
return pseudo(aco_opcode::p_create_vector, dst, op);
} else if (op.bytes() == 1 && op.isConstant()) {
uint8_t val = op.constantValue();
Operand op32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
aco_ptr<SDWA_instruction> sdwa;
if (op32.isLiteral()) {
sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mul_u32_u24, asSDWA(Format::VOP2), 2, 1));
uint32_t a = (uint32_t)int8_mul_table[val * 2];
uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
sdwa->operands[0] = Operand(a | (a & 0x80u ? 0xffffff00u : 0x0u));
sdwa->operands[1] = Operand(b | (b & 0x80u ? 0xffffff00u : 0x0u));
} else {
sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1));
sdwa->operands[0] = op32;
}
sdwa->definitions[0] = dst;
sdwa->sel[0] = sdwa_udword;
sdwa->sel[1] = sdwa_udword;
sdwa->dst_sel = sdwa_ubyte;
sdwa->dst_preserve = true;
return insert(std::move(sdwa));
} else if (op.bytes() == 2 && op.isConstant() && !op.isLiteral()) {
aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_add_f16, asSDWA(Format::VOP2), 2, 1)};
sdwa->operands[0] = op;
sdwa->operands[1] = Operand(0u);
sdwa->definitions[0] = dst;
sdwa->sel[0] = sdwa_uword;
sdwa->sel[1] = sdwa_udword;
sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
sdwa->dst_preserve = true;
return insert(std::move(sdwa));
} else if (dst.regClass().is_subdword()) {
if (program->chip_class >= GFX8) {
aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
sdwa->operands[0] = op;
sdwa->definitions[0] = dst;
sdwa->sel[0] = op.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
sdwa->dst_preserve = true;
return insert(std::move(sdwa));
} else {
return vop1(aco_opcode::v_mov_b32, dst, op);
}
} else {
unreachable("Unhandled case in bld.copy()");
}
/* Copy `op` into `dst` by building a single p_parallelcopy pseudo-instruction
 * through pseudo(). No special-casing of register class, operand size or
 * constant value is done at this point.
 * NOTE(review): presumably the pseudo-instruction is turned into real move
 * instructions by a later pass -- confirm against the lowering code. */
Result copy(Definition dst, Op op) {
return pseudo(aco_opcode::p_parallelcopy, dst, op);
}
Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op carry_in=Op(Operand(s2)), bool post_ra=false) {

View File

@ -109,6 +109,7 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
case aco_opcode::p_create_vector:
case aco_opcode::p_extract_vector:
case aco_opcode::p_split_vector:
case aco_opcode::p_parallelcopy:
for (Definition def : instr->definitions) {
if (def.getTemp().type() == RegType::vgpr)
return true;

View File

@ -393,6 +393,7 @@ void process_block(vn_ctx& ctx, Block& block)
instr->opcode == aco_opcode::s_mov_b32 ||
instr->opcode == aco_opcode::s_mov_b64 ||
instr->opcode == aco_opcode::v_mov_b32 ||
instr->opcode == aco_opcode::p_parallelcopy ||
(instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() &&
instr->operands[0].regClass() == instr->definitions[0].regClass() &&

View File

@ -853,7 +853,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
case aco_opcode::p_create_vector:
case aco_opcode::p_split_vector:
case aco_opcode::p_extract_vector:
case aco_opcode::p_phi: {
case aco_opcode::p_phi:
case aco_opcode::p_parallelcopy: {
const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(),
[] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;});
if (all_vgpr) {
@ -1212,6 +1213,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
case aco_opcode::s_mov_b32: /* propagate */
case aco_opcode::s_mov_b64:
case aco_opcode::v_mov_b32:
case aco_opcode::p_parallelcopy:
if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
instr->operands[0].regClass() != instr->definitions[0].regClass()) {
/* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so

View File

@ -245,8 +245,9 @@ bool should_rematerialize(aco_ptr<Instruction>& instr)
/* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK)
return false;
/* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */
if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector)
/* TODO: pseudo-instruction rematerialization is only supported for p_create_vector/p_parallelcopy */
if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector &&
instr->opcode != aco_opcode::p_parallelcopy)
return false;
if (instr->format == Format::SOPK && instr->opcode != aco_opcode::s_movk_i32)
return false;
@ -270,7 +271,7 @@ aco_ptr<Instruction> do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t
if (remat != ctx.remat.end()) {
Instruction *instr = remat->second.instr;
assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO || instr->format == Format::SOPK) && "unsupported");
assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector) && "unsupported");
assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector || instr->opcode == aco_opcode::p_parallelcopy) && "unsupported");
assert(instr->definitions.size() == 1 && "unsupported");
aco_ptr<Instruction> res;

View File

@ -67,9 +67,9 @@ BEGIN_TEST(isel.compute.simple)
uint res;
};
void main() {
//~gfx7>> v1: %data = v_mov_b32 42
//~gfx7>> v1: %data = p_parallelcopy 42
//~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation
//~gfx8>> s1: %data = s_mov_b32 42
//~gfx8>> s1: %data = p_parallelcopy 42
//~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer semantics: scope:invocation
res = 42;
}