mesa/src/intel/compiler/brw_fs_lower_dpas.cpp

303 lines
10 KiB
C++

/*
* Copyright 2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_fs.h"
#include "brw_fs_builder.h"
using namespace brw;
static void
f16_using_mac(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_TYPE_HF);
assert(inst->src[2].type == BRW_TYPE_HF);
const brw_reg_type src0_type = inst->dst.type;
const brw_reg_type src1_type = BRW_TYPE_HF;
const brw_reg_type src2_type = BRW_TYPE_HF;
const fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride =
dest.type == BRW_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
fs_reg temp = bld.vgrf(BRW_TYPE_HF);
for (unsigned subword = 0; subword < 2; subword++) {
for (unsigned s = 0; s < inst->sdepth; s++) {
/* The first multiply of the dot-product operation has to
* explicitly write the accumulator register. The successive MAC
* instructions will implicitly read *and* write the
* accumulator. Those MAC instructions can also optionally
* explicitly write some other register.
*
* FINISHME: The accumulator can actually hold 16 HF values. On
* Gfx12 there are two accumulators. It should be possible to do
* this in SIMD16 or even SIMD32. I was unable to get this to work
* properly.
*/
if (s == 0 && subword == 0) {
const unsigned acc_width = 8;
fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_TYPE_UD),
inst->group % acc_width);
if (bld.shader->devinfo->verx10 >= 125) {
acc = subscript(acc, BRW_TYPE_HF, subword);
} else {
acc = retype(acc, BRW_TYPE_HF);
}
bld.MUL(acc,
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_TYPE_UD),
BRW_TYPE_HF, subword),
component(retype(byte_offset(src2, r * REG_SIZE),
BRW_TYPE_HF),
s * 2 + subword))
->writes_accumulator = true;
} else {
fs_reg result;
/* As mentioned above, the MAC had an optional, explicit
* destination register. Various optimization passes are not
* clever enough to understand the intricacies of this
* instruction, so only write the result register on the final
* MAC in the sequence.
*/
if ((s + 1) == inst->sdepth && subword == 1)
result = temp;
else
result = retype(bld.null_reg_ud(), BRW_TYPE_HF);
bld.MAC(result,
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_TYPE_UD),
BRW_TYPE_HF, subword),
component(retype(byte_offset(src2, r * REG_SIZE),
BRW_TYPE_HF),
s * 2 + subword))
->writes_accumulator = true;
}
}
}
if (!src0.is_null()) {
if (src0_type != BRW_TYPE_HF) {
fs_reg temp2 = bld.vgrf(src0_type);
bld.MOV(temp2, temp);
bld.ADD(byte_offset(dest, r * dest_stride),
temp2,
byte_offset(src0, r * dest_stride));
} else {
bld.ADD(byte_offset(dest, r * dest_stride),
temp,
byte_offset(src0, r * dest_stride));
}
} else {
bld.MOV(byte_offset(dest, r * dest_stride), temp);
}
}
}
static void
int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_TYPE_B ||
inst->src[1].type == BRW_TYPE_UB);
assert(inst->src[2].type == BRW_TYPE_B ||
inst->src[2].type == BRW_TYPE_UB);
const brw_reg_type src1_type = inst->src[1].type == BRW_TYPE_UB
? BRW_TYPE_UD : BRW_TYPE_D;
const brw_reg_type src2_type = inst->src[2].type == BRW_TYPE_UB
? BRW_TYPE_UD : BRW_TYPE_D;
fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride = REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
if (!src0.is_null()) {
bld.MOV(dest, src0);
src0 = byte_offset(src0, dest_stride);
} else {
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
}
for (unsigned s = 0; s < inst->sdepth; s++) {
bld.DP4A(dest,
dest,
byte_offset(src1, s * REG_SIZE),
component(byte_offset(src2, r * REG_SIZE), s))
->saturate = inst->saturate;
}
dest = byte_offset(dest, dest_stride);
}
}
static void
int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
{
/* We only intend to support configurations where the destination and
* accumulator have the same type.
*/
if (!inst->src[0].is_null())
assert(inst->dst.type == inst->src[0].type);
assert(inst->src[1].type == BRW_TYPE_B ||
inst->src[1].type == BRW_TYPE_UB);
assert(inst->src[2].type == BRW_TYPE_B ||
inst->src[2].type == BRW_TYPE_UB);
const brw_reg_type src0_type = inst->dst.type;
const brw_reg_type src1_type = inst->src[1].type == BRW_TYPE_UB
? BRW_TYPE_UD : BRW_TYPE_D;
const brw_reg_type src2_type = inst->src[2].type == BRW_TYPE_UB
? BRW_TYPE_UD : BRW_TYPE_D;
fs_reg dest = inst->dst;
fs_reg src0 = inst->src[0];
const fs_reg src1 = retype(inst->src[1], src1_type);
const fs_reg src2 = retype(inst->src[2], src2_type);
const unsigned dest_stride = REG_SIZE;
for (unsigned r = 0; r < inst->rcount; r++) {
if (!src0.is_null()) {
bld.MOV(dest, src0);
src0 = byte_offset(src0, dest_stride);
} else {
bld.MOV(dest, retype(brw_imm_d(0), dest.type));
}
for (unsigned s = 0; s < inst->sdepth; s++) {
fs_reg temp1 = bld.vgrf(BRW_TYPE_UD);
fs_reg temp2 = bld.vgrf(BRW_TYPE_UD);
fs_reg temp3 = bld.vgrf(BRW_TYPE_UD, 2);
const brw_reg_type temp_type =
(inst->src[1].type == BRW_TYPE_B ||
inst->src[2].type == BRW_TYPE_B)
? BRW_TYPE_W : BRW_TYPE_UW;
/* Expand 8 dwords of packed bytes into 16 dwords of packed
* words.
*
* FINISHME: Gfx9 should not need this work around. Gfx11
* may be able to use integer MAD. Both platforms may be
* able to use MAC.
*/
bld.group(32, 0).MOV(retype(temp3, temp_type),
retype(byte_offset(src2, r * REG_SIZE),
inst->src[2].type));
bld.MUL(subscript(temp1, temp_type, 0),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_TYPE_UD),
inst->src[1].type, 0),
subscript(component(retype(temp3, BRW_TYPE_UD),
s * 2),
temp_type, 0));
bld.MUL(subscript(temp1, temp_type, 1),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_TYPE_UD),
inst->src[1].type, 1),
subscript(component(retype(temp3, BRW_TYPE_UD),
s * 2),
temp_type, 1));
bld.MUL(subscript(temp2, temp_type, 0),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_TYPE_UD),
inst->src[1].type, 2),
subscript(component(retype(temp3, BRW_TYPE_UD),
s * 2 + 1),
temp_type, 0));
bld.MUL(subscript(temp2, temp_type, 1),
subscript(retype(byte_offset(src1, s * REG_SIZE),
BRW_TYPE_UD),
inst->src[1].type, 3),
subscript(component(retype(temp3, BRW_TYPE_UD),
s * 2 + 1),
temp_type, 1));
bld.ADD(subscript(temp1, src0_type, 0),
subscript(temp1, temp_type, 0),
subscript(temp1, temp_type, 1));
bld.ADD(subscript(temp2, src0_type, 0),
subscript(temp2, temp_type, 0),
subscript(temp2, temp_type, 1));
bld.ADD(retype(temp1, src0_type),
retype(temp1, src0_type),
retype(temp2, src0_type));
bld.ADD(dest, dest, retype(temp1, src0_type))
->saturate = inst->saturate;
}
dest = byte_offset(dest, dest_stride);
}
}
/**
 * Replace every DPAS instruction in the shader with an equivalent sequence
 * of simpler instructions.
 *
 * Float-typed destinations go through f16_using_mac().  Integer destinations
 * use int8_using_dp4a() when devinfo->ver >= 12 and int8_using_mul_add()
 * otherwise.  Replacement code is emitted at the original instruction using
 * an 8-wide, exec_all builder, after which the DPAS itself is removed.
 *
 * \param v  Shader visitor whose CFG is scanned and rewritten.
 * \return   true if at least one DPAS was lowered.
 */
bool
brw_fs_lower_dpas(fs_visitor &v)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
      if (inst->opcode == BRW_OPCODE_DPAS) {
         const fs_builder bld =
            fs_builder(&v, block, inst).group(8, 0).exec_all();

         if (brw_type_is_float(inst->dst.type))
            f16_using_mac(bld, inst);
         else if (v.devinfo->ver >= 12)
            int8_using_dp4a(bld, inst);
         else
            int8_using_mul_add(bld, inst);

         inst->remove(block);
         progress = true;
      }
   }

   if (!progress)
      return false;

   /* Instructions were added and removed, so dependent analyses are stale. */
   v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   return true;
}