aco: don't use a scalar temporary for reductions on GFX10
This patch also adds the scalar temporary for scans on SI/CI. Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
This commit is contained in:
parent
8ad43d8838
commit
9254fb4fc7
|
@@ -481,8 +481,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
|
||||
if (cluster_size == 64) {
|
||||
for (unsigned i = 0; i < src.size(); i++)
|
||||
bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
|
||||
bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
|
||||
}
|
||||
} else if (cluster_size == 32) {
|
||||
for (unsigned i = 0; i < src.size(); i++)
|
||||
|
|
|
@@ -153,7 +153,7 @@ void setup_reduce_temp(Program* program)
|
|||
instr->definitions[1] = bld.def(s2);
|
||||
|
||||
/* scalar identity temporary */
|
||||
bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
|
||||
bool need_sitmp = (program->chip_class <= GFX7 || program->chip_class >= GFX10) && instr->opcode != aco_opcode::p_reduce;
|
||||
if (instr->opcode == aco_opcode::p_exclusive_scan) {
|
||||
need_sitmp |=
|
||||
(op == imin32 || op == imin64 || op == imax32 || op == imax64 ||
|
||||
|
|
Loading…
Reference in New Issue