diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 031b88745ae..16bab05ea00 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1603,7 +1603,8 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) break; case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break; case aco_opcode::v_mul_f16: - case aco_opcode::v_mul_f32: { /* omod */ + case aco_opcode::v_mul_f32: + case aco_opcode::v_mul_legacy_f32: { /* omod */ ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); /* TODO: try to move the negate/abs modifier to the consumer instead */ @@ -1645,8 +1646,9 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */ ctx.info[instr->operands[i].tempId()].set_omod5(instr.get()); } else if (instr->operands[!i].constantValue() == 0u && - !(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64 - : ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */ + (!(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64 + : ctx.fp_mode.preserve_signed_zero_inf_nan32) || + instr->opcode == aco_opcode::v_mul_legacy_f32)) { /* 0.0 */ ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u); } else { continue; @@ -3496,6 +3498,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) return; if (mul_instr->isSDWA() || mul_instr->isDPP()) return; + if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32 && + ctx.fp_mode.preserve_signed_zero_inf_nan32) + return; /* convert to mul(neg(a), b) */ ctx.uses[mul_instr->definitions[0].tempId()]--; @@ -3554,6 +3559,10 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) if (info.instr->isVOP3() && (info.instr->vop3().clamp || info.instr->vop3().omod)) continue; + bool legacy = info.instr->opcode == aco_opcode::v_mul_legacy_f32; + if (legacy && need_fma && ctx.program->chip_class < GFX10_3) + continue; + Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]}; if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) || ctx.uses[instr->operands[i].tempId()] > uses) @@ -3619,13 +3628,17 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; - if (mad16) + if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32) { + assert(need_fma == (ctx.program->chip_class >= GFX10_3)); + mad_op = need_fma ? aco_opcode::v_fma_legacy_f32 : aco_opcode::v_mad_legacy_f32; + } else if (mad16) { mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) : (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16); - if (mad64) + } else if (mad64) { mad_op = aco_opcode::v_fma_f64; + } aco_ptr mad{ create_instruction(mad_op, Format::VOP3, 3, 1)}; @@ -3646,7 +3659,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) } } /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */ - else if (instr->opcode == aco_opcode::v_mul_f32 && !ctx.fp_mode.preserve_signed_zero_inf_nan32 && + else if (((instr->opcode == aco_opcode::v_mul_f32 && + !ctx.fp_mode.preserve_signed_zero_inf_nan32) || + instr->opcode == aco_opcode::v_mul_legacy_f32) && !instr->usesModifiers() && !ctx.fp_mode.must_flush_denorms32) { for (unsigned i = 0; i < 2; i++) { if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() && @@ -3904,7 +3919,9 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) mad_info = NULL; } /* check literals */ - else if (!instr->usesModifiers() && instr->opcode != aco_opcode::v_fma_f64) { + else if (!instr->usesModifiers() && instr->opcode != aco_opcode::v_fma_f64 && + instr->opcode != aco_opcode::v_mad_legacy_f32 && + instr->opcode != aco_opcode::v_fma_legacy_f32) { /* FMA can only take literals on GFX10+ */ if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) && ctx.program->chip_class < GFX10)