i965/fs: Lower the MULH virtual instruction.

Translate MULH into the MUL/MACH sequence.  This does roughly the same
thing that nir_emit_alu() used to do but we can now handle 16-wide by
taking advantage of the SIMD lowering pass.  The force_sechalf
workaround near the bottom is required because the SIMD lowering pass
will emit instructions with non-zero quarter control and we need to
make sure we avoid that on integer arithmetic instructions with
implicit accumulator access due to a known hardware bug on IVB.

Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
Francisco Jerez 2015-08-06 14:04:00 +03:00
parent 2e73126438
commit 3b48a0eeda
1 changed files with 55 additions and 0 deletions

View File

@ -3280,6 +3280,55 @@ fs_visitor::lower_integer_multiplication()
ibld.MOV(null, inst->dst));
}
}
} else if (inst->opcode == SHADER_OPCODE_MULH) {
/* Should have been lowered to 8-wide. */
assert(inst->exec_size <= 8);
const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
inst->dst.type);
fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
if (devinfo->gen >= 8) {
/* Until Gen8, integer multiplies read 32-bits from one source,
* and 16-bits from the other, and relying on the MACH instruction
* to generate the high bits of the result.
*
* On Gen8, the multiply instruction does a full 32x32-bit
* multiply, but in order to do a 64-bit multiply we can simulate
* the previous behavior and then use a MACH instruction.
*
* FINISHME: Don't use source modifiers on src1.
*/
assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
mul->src[1].type == BRW_REGISTER_TYPE_UD);
mul->src[1].type = (type_is_signed(mul->src[1].type) ?
BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
mul->src[1].stride *= 2;
} else if (devinfo->gen == 7 && !devinfo->is_haswell &&
inst->force_sechalf) {
/* Among other things the quarter control bits influence which
* accumulator register is used by the hardware for instructions
* that access the accumulator implicitly (e.g. MACH). A
* second-half instruction would normally map to acc1, which
* doesn't exist on Gen7 and up (the hardware does emulate it for
* floating-point instructions *only* by taking advantage of the
* extra precision of acc0 not normally used for floating point
* arithmetic).
*
* HSW and up are careful enough not to try to access an
* accumulator register that doesn't exist, but on earlier Gen7
* hardware we need to make sure that the quarter control bits are
* zero to avoid non-deterministic behaviour and emit an extra MOV
* to get the result masked correctly according to the current
* channel enables.
*/
mach->force_sechalf = false;
mach->force_writemask_all = true;
mach->dst = ibld.vgrf(inst->dst.type);
ibld.MOV(inst->dst, mach->dst);
}
} else {
continue;
}
@ -4083,6 +4132,12 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
const fs_inst *inst)
{
switch (inst->opcode) {
case SHADER_OPCODE_MULH:
/* MULH is lowered to the MUL/MACH sequence using the accumulator, which
* is 8-wide on Gen7+.
*/
return (devinfo->gen >= 7 ? 8 : inst->exec_size);
case FS_OPCODE_FB_WRITE_LOGICAL:
/* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
* here.