aco: don't use a scalar temporary for reductions on GFX10

This patch also adds the scalar temporary for scans on SI/CI

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
This commit is contained in:
Daniel Schürmann 2019-11-20 18:57:23 +01:00
parent 8ad43d8838
commit 9254fb4fc7
2 changed files with 3 additions and 3 deletions

View File

@@ -481,8 +481,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
if (cluster_size == 64) {
for (unsigned i = 0; i < src.size(); i++)
-bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
-emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
+bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
}
} else if (cluster_size == 32) {
for (unsigned i = 0; i < src.size(); i++)

View File

@@ -153,7 +153,7 @@ void setup_reduce_temp(Program* program)
instr->definitions[1] = bld.def(s2);
/* scalar identity temporary */
-bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
+bool need_sitmp = (program->chip_class <= GFX7 || program->chip_class >= GFX10) && instr->opcode != aco_opcode::p_reduce;
if (instr->opcode == aco_opcode::p_exclusive_scan) {
need_sitmp |=
(op == imin32 || op == imin64 || op == imax32 || op == imax64 ||