/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
|
|
|
|
#include "brw_eu.h"
|
|
#include "brw_fs.h"
|
|
#include "brw_vec4.h"
|
|
#include "brw_cfg.h"
|
|
|
|
using namespace brw;
|
|
|
|
namespace {
|
|
/**
 * Enumeration representing the various asynchronous units that can run
 * computations in parallel on behalf of a shader thread.
 *
 * The enumerators preceding EU_NUM_UNITS are consecutive and zero-based;
 * EU_NUM_UNITS itself is used as the size of the per-unit bookkeeping
 * arrays of the performance model.
 */
enum intel_eu_unit {
   /** EU front-end. */
   EU_UNIT_FE,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   EU_UNIT_FPU,
   /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   EU_UNIT_EM,
   /** Sampler shared function. */
   EU_UNIT_SAMPLER,
   /** Pixel Interpolator shared function. */
   EU_UNIT_PI,
   /** Unified Return Buffer shared function. */
   EU_UNIT_URB,
   /** Data Port Data Cache shared function. */
   EU_UNIT_DP_DC,
   /** Data Port Render Cache shared function. */
   EU_UNIT_DP_RC,
   /** Data Port Constant Cache shared function. */
   EU_UNIT_DP_CC,
   /** Message Gateway shared function. */
   EU_UNIT_GATEWAY,
   /** Thread Spawner shared function. */
   EU_UNIT_SPAWNER,
   /* EU_UNIT_VME, */
   /* EU_UNIT_CRE, */
   /** Number of asynchronous units currently tracked. */
   EU_NUM_UNITS,
   /**
    * Dummy unit for instructions that don't consume runtime from the above.
    *
    * Deliberately aliases EU_NUM_UNITS, so it is NOT a valid index into an
    * array of EU_NUM_UNITS elements.
    */
   EU_UNIT_NULL = EU_NUM_UNITS
};
|
|
|
|
/**
|
|
* Enumeration representing a computation result another computation can
|
|
* potentially depend on.
|
|
*/
|
|
enum intel_eu_dependency_id {
|
|
/* Register part of the GRF. */
|
|
EU_DEPENDENCY_ID_GRF0 = 0,
|
|
/* Register part of the MRF. Only used on Gfx4-6. */
|
|
EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + BRW_MAX_GRF,
|
|
/* Address register part of the ARF. */
|
|
EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
|
|
/* Accumulator register part of the ARF. */
|
|
EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
|
|
/* Flag register part of the ARF. */
|
|
EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
|
|
/* SBID token write completion. Only used on Gfx12+. */
|
|
EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
|
|
/* SBID token read completion. Only used on Gfx12+. */
|
|
EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 16,
|
|
/* Number of computation dependencies currently tracked. */
|
|
EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 16
|
|
};
|
|
|
|
/**
|
|
* State of our modeling of the program execution.
|
|
*/
|
|
struct state {
|
|
state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
|
|
/**
|
|
* Time at which a given unit will be ready to execute the next
|
|
* computation, in clock units.
|
|
*/
|
|
unsigned unit_ready[EU_NUM_UNITS];
|
|
/**
|
|
* Time at which an instruction dependent on a given dependency ID will
|
|
* be ready to execute, in clock units.
|
|
*/
|
|
unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
|
|
/**
|
|
* Aggregated utilization of a given unit excluding idle cycles,
|
|
* in clock units.
|
|
*/
|
|
float unit_busy[EU_NUM_UNITS];
|
|
/**
|
|
* Factor of the overhead of a computation accounted for in the
|
|
* aggregated utilization calculation.
|
|
*/
|
|
float weight;
|
|
};
|
|
|
|
/**
|
|
* Information derived from an IR instruction used to compute performance
|
|
* estimates. Allows the timing calculation to work on both FS and VEC4
|
|
* instructions.
|
|
*/
|
|
struct instruction_info {
|
|
instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
|
|
isa(isa), devinfo(isa->devinfo), op(inst->opcode),
|
|
td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
|
|
tx(get_exec_type(inst)), sx(0), ss(0),
|
|
sc(has_bank_conflict(isa, inst) ? sd : 0),
|
|
desc(inst->desc), sfid(inst->sfid)
|
|
{
|
|
/* We typically want the maximum source size, except for split send
|
|
* messages which require the total size.
|
|
*/
|
|
if (inst->opcode == SHADER_OPCODE_SEND) {
|
|
ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
|
|
DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
|
|
} else {
|
|
for (unsigned i = 0; i < inst->sources; i++)
|
|
ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
|
|
}
|
|
|
|
/* Convert the execution size to GRF units. */
|
|
sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
|
|
|
|
/* 32x32 integer multiplication has half the usual ALU throughput.
|
|
* Treat it as double-precision.
|
|
*/
|
|
if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
|
|
!brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
|
|
type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
|
|
tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
|
|
}
|
|
|
|
instruction_info(const struct brw_isa_info *isa,
|
|
const vec4_instruction *inst) :
|
|
isa(isa), devinfo(isa->devinfo), op(inst->opcode),
|
|
td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
|
|
tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
|
|
desc(inst->desc), sfid(inst->sfid)
|
|
{
|
|
/* Compute the maximum source size. */
|
|
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
|
|
ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
|
|
|
|
/* Convert the execution size to GRF units. */
|
|
sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
|
|
|
|
/* 32x32 integer multiplication has half the usual ALU throughput.
|
|
* Treat it as double-precision.
|
|
*/
|
|
if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
|
|
!brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
|
|
type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
|
|
tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
|
|
}
|
|
|
|
/** ISA encoding information */
|
|
const struct brw_isa_info *isa;
|
|
/** Device information. */
|
|
const struct intel_device_info *devinfo;
|
|
/** Instruction opcode. */
|
|
opcode op;
|
|
/** Destination type. */
|
|
brw_reg_type td;
|
|
/** Destination size in GRF units. */
|
|
unsigned sd;
|
|
/** Execution type. */
|
|
brw_reg_type tx;
|
|
/** Execution size in GRF units. */
|
|
unsigned sx;
|
|
/** Source size. */
|
|
unsigned ss;
|
|
/** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
|
|
unsigned sc;
|
|
/** Send message descriptor. */
|
|
uint32_t desc;
|
|
/** Send message shared function ID. */
|
|
uint8_t sfid;
|
|
};
|
|
|
|
/**
|
|
* Timing information of an instruction used to estimate the performance of
|
|
* the program.
|
|
*/
|
|
struct perf_desc {
|
|
perf_desc(enum intel_eu_unit u, int df, int db,
|
|
int ls, int ld, int la, int lf) :
|
|
u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
|
|
|
|
/**
|
|
* Back-end unit its runtime shall be accounted to, in addition to the
|
|
* EU front-end which is always assumed to be involved.
|
|
*/
|
|
enum intel_eu_unit u;
|
|
/**
|
|
* Overhead cycles from the time that the EU front-end starts executing
|
|
* the instruction until it's ready to execute the next instruction.
|
|
*/
|
|
int df;
|
|
/**
|
|
* Overhead cycles from the time that the back-end starts executing the
|
|
* instruction until it's ready to execute the next instruction.
|
|
*/
|
|
int db;
|
|
/**
|
|
* Latency cycles from the time that the back-end starts executing the
|
|
* instruction until its sources have been read from the register file.
|
|
*/
|
|
int ls;
|
|
/**
|
|
* Latency cycles from the time that the back-end starts executing the
|
|
* instruction until its regular destination has been written to the
|
|
* register file.
|
|
*/
|
|
int ld;
|
|
/**
|
|
* Latency cycles from the time that the back-end starts executing the
|
|
* instruction until its accumulator destination has been written to the
|
|
* ARF file.
|
|
*
|
|
* Note that this is an approximation of the real behavior of
|
|
* accumulating instructions in the hardware: Instead of modeling a pair
|
|
* of back-to-back accumulating instructions as a first computation with
|
|
* latency equal to ld followed by another computation with a
|
|
* mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
|
|
* model the stall as if it occurred at the top of the pipeline, with
|
|
* the latency of the accumulator computation offset accordingly.
|
|
*/
|
|
int la;
|
|
/**
|
|
* Latency cycles from the time that the back-end starts executing the
|
|
* instruction until its flag destination has been written to the ARF
|
|
* file.
|
|
*/
|
|
int lf;
|
|
};
|
|
|
|
/**
|
|
* Compute the timing information of an instruction based on any relevant
|
|
* information from the IR and a number of parameters specifying a linear
|
|
* approximation: Parameter X_Y specifies the derivative of timing X
|
|
* relative to info field Y, while X_1 specifies the independent term of
|
|
* the approximation of timing X.
|
|
*/
|
|
perf_desc
|
|
calculate_desc(const instruction_info &info, enum intel_eu_unit u,
|
|
int df_1, int df_sd, int df_sc,
|
|
int db_1, int db_sx,
|
|
int ls_1, int ld_1, int la_1, int lf_1,
|
|
int l_ss, int l_sd)
|
|
{
|
|
return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
|
|
db_1 + db_sx * int(info.sx),
|
|
ls_1 + l_ss * int(info.ss),
|
|
ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
|
|
la_1, lf_1);
|
|
}
|
|
|
|
/**
|
|
* Compute the timing information of an instruction based on any relevant
|
|
* information from the IR and a number of linear approximation parameters
|
|
* hard-coded for each IR instruction.
|
|
*
|
|
* Most timing parameters are obtained from the multivariate linear
|
|
* regression of a sample of empirical timings measured using the tm0
|
|
* register (as can be done today by using the shader_time debugging
|
|
* option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
|
|
* "Shared Functions - Extended Math", Section 3.2 "Performance".
|
|
* Parameters marked XXX shall be considered low-quality, they're possibly
|
|
* high variance or completely guessed in cases where experimental data was
|
|
* unavailable.
|
|
*/
|
|
const perf_desc
|
|
instruction_desc(const instruction_info &info)
|
|
{
|
|
const struct intel_device_info *devinfo = info.devinfo;
|
|
|
|
switch (info.op) {
|
|
case BRW_OPCODE_SYNC:
|
|
case BRW_OPCODE_SEL:
|
|
case BRW_OPCODE_NOT:
|
|
case BRW_OPCODE_AND:
|
|
case BRW_OPCODE_OR:
|
|
case BRW_OPCODE_XOR:
|
|
case BRW_OPCODE_SHR:
|
|
case BRW_OPCODE_SHL:
|
|
case BRW_OPCODE_DIM:
|
|
case BRW_OPCODE_ASR:
|
|
case BRW_OPCODE_CMPN:
|
|
case BRW_OPCODE_F16TO32:
|
|
case BRW_OPCODE_BFREV:
|
|
case BRW_OPCODE_BFI1:
|
|
case BRW_OPCODE_AVG:
|
|
case BRW_OPCODE_FRC:
|
|
case BRW_OPCODE_RNDU:
|
|
case BRW_OPCODE_RNDD:
|
|
case BRW_OPCODE_RNDE:
|
|
case BRW_OPCODE_RNDZ:
|
|
case BRW_OPCODE_MAC:
|
|
case BRW_OPCODE_MACH:
|
|
case BRW_OPCODE_LZD:
|
|
case BRW_OPCODE_FBH:
|
|
case BRW_OPCODE_FBL:
|
|
case BRW_OPCODE_CBIT:
|
|
case BRW_OPCODE_ADDC:
|
|
case BRW_OPCODE_ROR:
|
|
case BRW_OPCODE_ROL:
|
|
case BRW_OPCODE_SUBB:
|
|
case BRW_OPCODE_SAD2:
|
|
case BRW_OPCODE_SADA2:
|
|
case BRW_OPCODE_LINE:
|
|
case BRW_OPCODE_NOP:
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
|
case SHADER_OPCODE_SCRATCH_HEADER:
|
|
case FS_OPCODE_DDX_COARSE:
|
|
case FS_OPCODE_DDX_FINE:
|
|
case FS_OPCODE_DDY_COARSE:
|
|
case FS_OPCODE_PIXEL_X:
|
|
case FS_OPCODE_PIXEL_Y:
|
|
case FS_OPCODE_SET_SAMPLE_ID:
|
|
case VEC4_OPCODE_MOV_BYTES:
|
|
case VEC4_OPCODE_UNPACK_UNIFORM:
|
|
case VEC4_OPCODE_DOUBLE_TO_F32:
|
|
case VEC4_OPCODE_DOUBLE_TO_D32:
|
|
case VEC4_OPCODE_DOUBLE_TO_U32:
|
|
case VEC4_OPCODE_TO_DOUBLE:
|
|
case VEC4_OPCODE_PICK_LOW_32BIT:
|
|
case VEC4_OPCODE_PICK_HIGH_32BIT:
|
|
case VEC4_OPCODE_SET_LOW_32BIT:
|
|
case VEC4_OPCODE_SET_HIGH_32BIT:
|
|
case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
|
|
case GS_OPCODE_SET_DWORD_2:
|
|
case GS_OPCODE_SET_WRITE_OFFSET:
|
|
case GS_OPCODE_SET_VERTEX_COUNT:
|
|
case GS_OPCODE_PREPARE_CHANNEL_MASKS:
|
|
case GS_OPCODE_SET_CHANNEL_MASKS:
|
|
case GS_OPCODE_GET_INSTANCE_ID:
|
|
case GS_OPCODE_SET_PRIMITIVE_ID:
|
|
case GS_OPCODE_SVB_SET_DST_INDEX:
|
|
case TCS_OPCODE_SRC0_010_IS_ZERO:
|
|
case TCS_OPCODE_GET_PRIMITIVE_ID:
|
|
case TES_OPCODE_GET_PRIMITIVE_ID:
|
|
case SHADER_OPCODE_READ_SR_REG:
|
|
if (devinfo->ver >= 11) {
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 10, 6 /* XXX */, 14, 0, 0);
|
|
} else if (devinfo->ver >= 8) {
|
|
if (type_sz(info.tx) > 4)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
|
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 8, 4, 12, 0, 0);
|
|
} else if (devinfo->verx10 >= 75) {
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 10, 6 /* XXX */, 16, 0, 0);
|
|
} else {
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 12, 8 /* XXX */, 18, 0, 0);
|
|
}
|
|
|
|
case BRW_OPCODE_MOV:
|
|
case BRW_OPCODE_CMP:
|
|
case BRW_OPCODE_ADD:
|
|
case BRW_OPCODE_ADD3:
|
|
case BRW_OPCODE_MUL:
|
|
case SHADER_OPCODE_MOV_RELOC_IMM:
|
|
case VEC4_OPCODE_MOV_FOR_SCRATCH:
|
|
if (devinfo->ver >= 11) {
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 10, 6, 14, 0, 0);
|
|
} else if (devinfo->ver >= 8) {
|
|
if (type_sz(info.tx) > 4)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
|
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 8, 4, 12, 0, 0);
|
|
} else if (devinfo->verx10 >= 75) {
|
|
if (info.tx == BRW_REGISTER_TYPE_F)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 12, 8 /* XXX */, 18, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 10, 6 /* XXX */, 16, 0, 0);
|
|
} else if (devinfo->ver >= 7) {
|
|
if (info.tx == BRW_REGISTER_TYPE_F)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 14, 10 /* XXX */, 20, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 12, 8 /* XXX */, 18, 0, 0);
|
|
} else {
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
|
|
0, 2 /* XXX */,
|
|
0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
|
|
0, 0);
|
|
}
|
|
|
|
case BRW_OPCODE_BFE:
|
|
case BRW_OPCODE_BFI2:
|
|
case BRW_OPCODE_CSEL:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case BRW_OPCODE_MAD:
|
|
if (devinfo->ver >= 11) {
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
|
} else if (devinfo->ver >= 8) {
|
|
if (type_sz(info.tx) > 4)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
|
|
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
|
} else if (devinfo->verx10 >= 75) {
|
|
if (info.tx == BRW_REGISTER_TYPE_F)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 12, 8 /* XXX */, 18, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 10, 6 /* XXX */, 16, 0, 0);
|
|
} else if (devinfo->ver >= 7) {
|
|
if (info.tx == BRW_REGISTER_TYPE_F)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 14, 10 /* XXX */, 20, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 12, 8 /* XXX */, 18, 0, 0);
|
|
} else if (devinfo->ver >= 6) {
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
|
|
0, 2 /* XXX */,
|
|
0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
|
|
0, 0);
|
|
} else {
|
|
abort();
|
|
}
|
|
|
|
case BRW_OPCODE_F32TO16:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
|
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
|
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
|
0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case BRW_OPCODE_DP4:
|
|
case BRW_OPCODE_DPH:
|
|
case BRW_OPCODE_DP3:
|
|
case BRW_OPCODE_DP2:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
|
|
case BRW_OPCODE_DP4A:
|
|
if (devinfo->ver >= 12)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_RCP:
|
|
case SHADER_OPCODE_RSQ:
|
|
case SHADER_OPCODE_SQRT:
|
|
case SHADER_OPCODE_EXP2:
|
|
case SHADER_OPCODE_LOG2:
|
|
case SHADER_OPCODE_SIN:
|
|
case SHADER_OPCODE_COS:
|
|
case SHADER_OPCODE_POW:
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
|
if (devinfo->ver >= 6) {
|
|
switch (info.op) {
|
|
case SHADER_OPCODE_RCP:
|
|
case SHADER_OPCODE_RSQ:
|
|
case SHADER_OPCODE_SQRT:
|
|
case SHADER_OPCODE_EXP2:
|
|
case SHADER_OPCODE_LOG2:
|
|
case SHADER_OPCODE_SIN:
|
|
case SHADER_OPCODE_COS:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
|
|
0, 16, 0, 0, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
|
|
0, 12, 0, 0, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
|
|
0, 14, 0, 0, 0, 0);
|
|
|
|
case SHADER_OPCODE_POW:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
|
|
0, 24, 0, 0, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
|
|
0, 20, 0, 0, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
|
|
0, 22, 0, 0, 0, 0);
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
|
return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
|
|
0, 28 /* XXX */, 0, 0, 0, 0);
|
|
|
|
default:
|
|
abort();
|
|
}
|
|
} else {
|
|
switch (info.op) {
|
|
case SHADER_OPCODE_RCP:
|
|
return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
|
|
0, 22, 0, 0, 0, 8);
|
|
|
|
case SHADER_OPCODE_RSQ:
|
|
return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
|
|
0, 44, 0, 0, 0, 8);
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
case SHADER_OPCODE_SQRT:
|
|
case SHADER_OPCODE_LOG2:
|
|
return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
|
|
0, 66, 0, 0, 0, 8);
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
|
case SHADER_OPCODE_EXP2:
|
|
return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
|
|
0, 88, 0, 0, 0, 8);
|
|
|
|
case SHADER_OPCODE_SIN:
|
|
case SHADER_OPCODE_COS:
|
|
return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
|
|
0, 132, 0, 0, 0, 8);
|
|
|
|
case SHADER_OPCODE_POW:
|
|
return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
|
|
0, 176, 0, 0, 0, 8);
|
|
|
|
default:
|
|
abort();
|
|
}
|
|
}
|
|
|
|
case BRW_OPCODE_DO:
|
|
if (devinfo->ver >= 6)
|
|
return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
|
|
case BRW_OPCODE_IF:
|
|
case BRW_OPCODE_ELSE:
|
|
case BRW_OPCODE_ENDIF:
|
|
case BRW_OPCODE_WHILE:
|
|
case BRW_OPCODE_BREAK:
|
|
case BRW_OPCODE_CONTINUE:
|
|
case BRW_OPCODE_HALT:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
|
|
case FS_OPCODE_LINTERP:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
|
|
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
|
|
case BRW_OPCODE_LRP:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
|
|
0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 6)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
|
|
0, 8 /* XXX */, 4 /* XXX */,
|
|
12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
16 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
|
|
0, 12 /* XXX */, 8 /* XXX */,
|
|
18 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
|
|
0, 8 /* XXX */, 4 /* XXX */,
|
|
12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
|
|
0, 12 /* XXX */, 8 /* XXX */,
|
|
18 /* XXX */, 0, 0);
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
|
|
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
|
|
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
|
|
0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
|
case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
|
|
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
|
|
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
|
|
0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_RND_MODE:
|
|
case SHADER_OPCODE_FLOAT_CONTROL_MODE:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else if (devinfo->ver >= 6)
|
|
return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
|
|
44 /* XXX */, 0,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
|
|
42 /* XXX */, 0,
|
|
0, 8 /* XXX */, 4 /* XXX */,
|
|
12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
|
|
0, 44 /* XXX */,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
16 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 6)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
|
|
0, 46 /* XXX */,
|
|
0, 12 /* XXX */, 8 /* XXX */,
|
|
18 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_SEL_EXEC:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
|
|
0, 4 /* XXX */,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
|
|
0, 4 /* XXX */,
|
|
0, 8 /* XXX */, 4 /* XXX */,
|
|
12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
|
|
0, 4 /* XXX */,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
|
|
0, 4 /* XXX */,
|
|
0, 12 /* XXX */, 8 /* XXX */,
|
|
18 /* XXX */, 0, 0);
|
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
|
|
0, 8 /* XXX */,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
|
|
0, 8 /* XXX */,
|
|
0, 8 /* XXX */, 4 /* XXX */,
|
|
12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
|
|
0, 8 /* XXX */,
|
|
0, 10 /* XXX */, 6 /* XXX */,
|
|
16 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
|
|
0, 8 /* XXX */,
|
|
0, 12 /* XXX */, 8 /* XXX */,
|
|
18 /* XXX */, 0, 0);
|
|
|
|
case FS_OPCODE_DDY_FINE:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
|
|
0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
|
|
0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
|
|
|
|
case FS_OPCODE_LOAD_LIVE_CHANNELS:
|
|
if (devinfo->ver >= 11)
|
|
return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
|
|
2 /* XXX */, 0,
|
|
0, 0, 0, 10 /* XXX */, 0, 0);
|
|
else if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
|
|
0, 2 /* XXX */,
|
|
0, 0, 0, 8 /* XXX */, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case VEC4_OPCODE_PACK_BYTES:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
|
|
0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
|
|
0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
|
|
0, 0);
|
|
|
|
case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
|
|
case TCS_OPCODE_GET_INSTANCE_ID:
|
|
case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
|
|
case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
|
|
case TES_OPCODE_CREATE_INPUT_READ_HEADER:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
|
|
6 /* XXX */, 0,
|
|
0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
|
|
0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
|
|
6 /* XXX */, 0,
|
|
0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
|
|
0, 0);
|
|
else
|
|
return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
|
|
6 /* XXX */, 0,
|
|
0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
|
|
0, 0);
|
|
|
|
case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
|
|
case TCS_OPCODE_CREATE_BARRIER_HEADER:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
|
|
8 /* XXX */, 0,
|
|
0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
|
|
0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
|
|
8 /* XXX */, 0,
|
|
0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
|
|
0, 0);
|
|
else if (devinfo->ver >= 6)
|
|
return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
|
|
8 /* XXX */, 0,
|
|
0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
|
|
0, 0);
|
|
else
|
|
abort();
|
|
|
|
case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
|
|
if (devinfo->ver >= 8)
|
|
return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
|
|
0, 0);
|
|
else if (devinfo->verx10 >= 75)
|
|
return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
|
|
0, 0);
|
|
else if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
|
|
4 /* XXX */, 0,
|
|
0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
|
|
0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_TEX:
|
|
case FS_OPCODE_TXB:
|
|
case SHADER_OPCODE_TXD:
|
|
case SHADER_OPCODE_TXF:
|
|
case SHADER_OPCODE_TXF_LZ:
|
|
case SHADER_OPCODE_TXL:
|
|
case SHADER_OPCODE_TXL_LZ:
|
|
case SHADER_OPCODE_TXF_CMS:
|
|
case SHADER_OPCODE_TXF_CMS_W:
|
|
case SHADER_OPCODE_TXF_UMS:
|
|
case SHADER_OPCODE_TXF_MCS:
|
|
case SHADER_OPCODE_TXS:
|
|
case SHADER_OPCODE_LOD:
|
|
case SHADER_OPCODE_GET_BUFFER_SIZE:
|
|
case SHADER_OPCODE_TG4:
|
|
case SHADER_OPCODE_TG4_OFFSET:
|
|
case SHADER_OPCODE_SAMPLEINFO:
|
|
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
|
|
return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
|
|
8 /* XXX */, 750 /* XXX */, 0, 0,
|
|
2 /* XXX */, 0);
|
|
|
|
case VEC4_OPCODE_URB_READ:
|
|
case VEC4_VS_OPCODE_URB_WRITE:
|
|
case VEC4_GS_OPCODE_URB_WRITE:
|
|
case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
|
|
case GS_OPCODE_THREAD_END:
|
|
case GS_OPCODE_FF_SYNC:
|
|
case VEC4_TCS_OPCODE_URB_WRITE:
|
|
case TCS_OPCODE_RELEASE_INPUT:
|
|
case TCS_OPCODE_THREAD_END:
|
|
return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
|
|
32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
|
|
|
|
case SHADER_OPCODE_MEMORY_FENCE:
|
|
case SHADER_OPCODE_INTERLOCK:
|
|
switch (info.sfid) {
|
|
case GFX6_SFID_DATAPORT_RENDER_CACHE:
|
|
if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
|
|
10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case BRW_SFID_URB:
|
|
case GFX7_SFID_DATAPORT_DATA_CACHE:
|
|
case GFX12_SFID_SLM:
|
|
case GFX12_SFID_TGM:
|
|
case GFX12_SFID_UGM:
|
|
case HSW_SFID_DATAPORT_DATA_CACHE_1:
|
|
if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
default:
|
|
abort();
|
|
}
|
|
|
|
case SHADER_OPCODE_GFX4_SCRATCH_READ:
|
|
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
|
|
case SHADER_OPCODE_GFX7_SCRATCH_READ:
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
|
|
|
|
case VEC4_OPCODE_UNTYPED_ATOMIC:
|
|
if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
30 /* XXX */, 400 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0,
|
|
0, 400 /* XXX */);
|
|
else
|
|
abort();
|
|
|
|
case VEC4_OPCODE_UNTYPED_SURFACE_READ:
|
|
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
|
|
if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
0, 20 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0,
|
|
0, 0);
|
|
else
|
|
abort();
|
|
|
|
case FS_OPCODE_FB_WRITE:
|
|
case FS_OPCODE_FB_READ:
|
|
case FS_OPCODE_REP_FB_WRITE:
|
|
return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
|
|
10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
|
|
|
|
case GS_OPCODE_SVB_WRITE:
|
|
if (devinfo->ver >= 6)
|
|
return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
|
|
0, 450 /* XXX */,
|
|
10 /* XXX */, 300 /* XXX */, 0, 0,
|
|
0, 0);
|
|
else
|
|
abort();
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
|
|
return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
|
|
|
|
case VS_OPCODE_PULL_CONSTANT_LOAD:
|
|
case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
|
|
return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
|
|
8, 750, 0, 0, 2, 0);
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
|
|
0, 90 /* XXX */, 0, 0, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_BARRIER:
|
|
if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
|
|
0 /* XXX */, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case CS_OPCODE_CS_TERMINATE:
|
|
if (devinfo->ver >= 7)
|
|
return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
|
|
10 /* XXX */, 0, 0, 0, 0, 0);
|
|
else
|
|
abort();
|
|
|
|
case SHADER_OPCODE_SEND:
|
|
switch (info.sfid) {
|
|
case GFX6_SFID_DATAPORT_RENDER_CACHE:
|
|
if (devinfo->ver >= 7) {
|
|
switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
|
|
case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
|
|
return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
|
|
30 /* XXX */, 450 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */,
|
|
0, 0, 0, 400 /* XXX */);
|
|
default:
|
|
return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
|
|
0, 450 /* XXX */,
|
|
10 /* XXX */, 300 /* XXX */, 0, 0,
|
|
0, 0);
|
|
}
|
|
} else if (devinfo->ver >= 6) {
|
|
return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
|
|
0, 450 /* XXX */,
|
|
10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
|
|
} else {
|
|
abort();
|
|
}
|
|
case BRW_SFID_SAMPLER: {
|
|
if (devinfo->ver >= 6)
|
|
return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
|
|
8, 750, 0, 0, 2, 0);
|
|
else
|
|
abort();
|
|
}
|
|
case GFX7_SFID_DATAPORT_DATA_CACHE:
|
|
case HSW_SFID_DATAPORT_DATA_CACHE_1:
|
|
if (devinfo->verx10 >= 75) {
|
|
switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
|
|
case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
|
|
case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
|
|
case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
|
|
case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
30 /* XXX */, 400 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0,
|
|
0, 400 /* XXX */);
|
|
|
|
default:
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
0, 20 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0,
|
|
0, 0);
|
|
}
|
|
} else if (devinfo->ver >= 7) {
|
|
switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
|
|
case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
30 /* XXX */, 400 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */,
|
|
0, 0, 0, 400 /* XXX */);
|
|
default:
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
0, 20 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0,
|
|
0, 0);
|
|
}
|
|
} else {
|
|
abort();
|
|
}
|
|
|
|
case GFX12_SFID_UGM:
|
|
case GFX12_SFID_TGM:
|
|
case GFX12_SFID_SLM:
|
|
switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
|
|
case LSC_OP_LOAD:
|
|
case LSC_OP_STORE:
|
|
case LSC_OP_LOAD_CMASK:
|
|
case LSC_OP_STORE_CMASK:
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
0, 20 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0,
|
|
0, 0);
|
|
|
|
case LSC_OP_FENCE:
|
|
case LSC_OP_ATOMIC_INC:
|
|
case LSC_OP_ATOMIC_DEC:
|
|
case LSC_OP_ATOMIC_LOAD:
|
|
case LSC_OP_ATOMIC_STORE:
|
|
case LSC_OP_ATOMIC_ADD:
|
|
case LSC_OP_ATOMIC_SUB:
|
|
case LSC_OP_ATOMIC_MIN:
|
|
case LSC_OP_ATOMIC_MAX:
|
|
case LSC_OP_ATOMIC_UMIN:
|
|
case LSC_OP_ATOMIC_UMAX:
|
|
case LSC_OP_ATOMIC_CMPXCHG:
|
|
case LSC_OP_ATOMIC_FADD:
|
|
case LSC_OP_ATOMIC_FSUB:
|
|
case LSC_OP_ATOMIC_FMIN:
|
|
case LSC_OP_ATOMIC_FMAX:
|
|
case LSC_OP_ATOMIC_FCMPXCHG:
|
|
case LSC_OP_ATOMIC_AND:
|
|
case LSC_OP_ATOMIC_OR:
|
|
case LSC_OP_ATOMIC_XOR:
|
|
return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
|
|
30 /* XXX */, 400 /* XXX */,
|
|
10 /* XXX */, 100 /* XXX */, 0, 0,
|
|
0, 400 /* XXX */);
|
|
default:
|
|
abort();
|
|
}
|
|
|
|
case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
|
|
case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
|
|
return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
|
|
10 /* XXX */, 0, 0, 0, 0, 0);
|
|
|
|
case BRW_SFID_URB:
|
|
return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
|
|
32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
|
|
|
|
default:
|
|
abort();
|
|
}
|
|
|
|
case SHADER_OPCODE_UNDEF:
|
|
case SHADER_OPCODE_HALT_TARGET:
|
|
case FS_OPCODE_SCHEDULING_FENCE:
|
|
return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0);
|
|
|
|
default:
|
|
abort();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Model the performance behavior of a stall on the specified dependency
|
|
* ID.
|
|
*/
|
|
void
|
|
stall_on_dependency(state &st, enum intel_eu_dependency_id id)
|
|
{
|
|
if (id < ARRAY_SIZE(st.dep_ready))
|
|
st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
|
|
st.dep_ready[id]);
|
|
}
|
|
|
|
/**
|
|
* Model the performance behavior of the front-end and back-end while
|
|
* executing an instruction with the specified timing information, assuming
|
|
* all dependencies are already clear.
|
|
*/
|
|
void
|
|
execute_instruction(state &st, const perf_desc &perf)
|
|
{
|
|
/* Compute the time at which the front-end will be ready to execute the
|
|
* next instruction.
|
|
*/
|
|
st.unit_ready[EU_UNIT_FE] += perf.df;
|
|
|
|
if (perf.u < EU_NUM_UNITS) {
|
|
/* Wait for the back-end to be ready to execute this instruction. */
|
|
st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
|
|
st.unit_ready[perf.u]);
|
|
|
|
/* Compute the time at which the back-end will be ready to execute
|
|
* the next instruction, and update the back-end utilization.
|
|
*/
|
|
st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
|
|
st.unit_busy[perf.u] += perf.db * st.weight;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Model the performance behavior of a read dependency provided by an
|
|
* instruction.
|
|
*/
|
|
void
|
|
mark_read_dependency(state &st, const perf_desc &perf,
|
|
enum intel_eu_dependency_id id)
|
|
{
|
|
if (id < ARRAY_SIZE(st.dep_ready))
|
|
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
|
|
}
|
|
|
|
/**
|
|
* Model the performance behavior of a write dependency provided by an
|
|
* instruction.
|
|
*/
|
|
void
|
|
mark_write_dependency(state &st, const perf_desc &perf,
|
|
enum intel_eu_dependency_id id)
|
|
{
|
|
if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
|
|
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
|
|
else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
|
|
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
|
|
else if (id < ARRAY_SIZE(st.dep_ready))
|
|
st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
|
|
}
|
|
|
|
/**
|
|
* Return the dependency ID of a backend_reg, offset by \p delta GRFs.
|
|
*/
|
|
enum intel_eu_dependency_id
|
|
reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
|
|
const int delta)
|
|
{
|
|
if (r.file == VGRF) {
|
|
const unsigned i = r.nr + r.offset / REG_SIZE + delta;
|
|
assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
|
|
|
|
} else if (r.file == FIXED_GRF) {
|
|
const unsigned i = r.nr + delta;
|
|
assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
|
|
|
|
} else if (r.file == MRF && devinfo->ver >= 7) {
|
|
const unsigned i = GFX7_MRF_HACK_START +
|
|
r.nr + r.offset / REG_SIZE + delta;
|
|
assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
|
|
|
|
} else if (r.file == MRF && devinfo->ver < 7) {
|
|
const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
|
|
r.offset / REG_SIZE + delta;
|
|
assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
|
|
|
|
} else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
|
|
r.nr < BRW_ARF_ACCUMULATOR) {
|
|
assert(delta == 0);
|
|
return EU_DEPENDENCY_ID_ADDR0;
|
|
|
|
} else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
|
|
r.nr < BRW_ARF_FLAG) {
|
|
const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
|
|
assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
|
|
|
|
} else {
|
|
return EU_NUM_DEPENDENCY_IDS;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Return the dependency ID of flag register starting at offset \p i.
|
|
*/
|
|
enum intel_eu_dependency_id
|
|
flag_dependency_id(unsigned i)
|
|
{
|
|
assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
|
|
}
|
|
|
|
/**
|
|
* Return the dependency ID corresponding to the SBID read completion
|
|
* condition of a Gfx12+ SWSB.
|
|
*/
|
|
enum intel_eu_dependency_id
|
|
tgl_swsb_rd_dependency_id(tgl_swsb swsb)
|
|
{
|
|
if (swsb.mode) {
|
|
assert(swsb.sbid <
|
|
EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
|
|
} else {
|
|
return EU_NUM_DEPENDENCY_IDS;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Return the dependency ID corresponding to the SBID write completion
|
|
* condition of a Gfx12+ SWSB.
|
|
*/
|
|
enum intel_eu_dependency_id
|
|
tgl_swsb_wr_dependency_id(tgl_swsb swsb)
|
|
{
|
|
if (swsb.mode) {
|
|
assert(swsb.sbid <
|
|
EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
|
|
return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
|
|
} else {
|
|
return EU_NUM_DEPENDENCY_IDS;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Return the implicit accumulator register accessed by channel \p i of the
|
|
* instruction.
|
|
*/
|
|
unsigned
|
|
accum_reg_of_channel(const intel_device_info *devinfo,
|
|
const backend_instruction *inst,
|
|
brw_reg_type tx, unsigned i)
|
|
{
|
|
assert(inst->reads_accumulator_implicitly() ||
|
|
inst->writes_accumulator_implicitly(devinfo));
|
|
const unsigned offset = (inst->group + i) * type_sz(tx) *
|
|
(devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
|
|
return offset / REG_SIZE % 2;
|
|
}
|
|
|
|
/**
|
|
* Model the performance behavior of an FS back-end instruction.
|
|
*/
|
|
void
|
|
issue_fs_inst(state &st, const struct brw_isa_info *isa,
|
|
const backend_instruction *be_inst)
|
|
{
|
|
const struct intel_device_info *devinfo = isa->devinfo;
|
|
const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
|
|
const instruction_info info(isa, inst);
|
|
const perf_desc perf = instruction_desc(info);
|
|
|
|
/* Stall on any source dependencies. */
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
|
for (unsigned j = 0; j < regs_read(inst, i); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, inst->src[i], j));
|
|
}
|
|
|
|
if (inst->reads_accumulator_implicitly()) {
|
|
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
|
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
|
inst->exec_size - 1); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
|
}
|
|
|
|
if (is_send(inst) && inst->base_mrf != -1) {
|
|
for (unsigned j = 0; j < inst->mlen; j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(
|
|
devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
|
|
}
|
|
|
|
if (const unsigned mask = inst->flags_read(devinfo)) {
|
|
for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
|
|
if (mask & (1 << i))
|
|
stall_on_dependency(st, flag_dependency_id(i));
|
|
}
|
|
}
|
|
|
|
/* Stall on any write dependencies. */
|
|
if (!inst->no_dd_check) {
|
|
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
|
|
for (unsigned j = 0; j < regs_written(inst); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, inst->dst, j));
|
|
}
|
|
|
|
if (inst->writes_accumulator_implicitly(devinfo)) {
|
|
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
|
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
|
inst->exec_size - 1); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
|
}
|
|
|
|
if (const unsigned mask = inst->flags_written(devinfo)) {
|
|
for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
|
|
if (mask & (1 << i))
|
|
stall_on_dependency(st, flag_dependency_id(i));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Stall on any SBID dependencies. */
|
|
if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
|
|
stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
|
|
else if (inst->sched.mode & TGL_SBID_SRC)
|
|
stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
|
|
|
|
/* Execute the instruction. */
|
|
execute_instruction(st, perf);
|
|
|
|
/* Mark any source dependencies. */
|
|
if (inst->is_send_from_grf()) {
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
|
if (inst->is_payload(i)) {
|
|
for (unsigned j = 0; j < regs_read(inst, i); j++)
|
|
mark_read_dependency(
|
|
st, perf, reg_dependency_id(devinfo, inst->src[i], j));
|
|
}
|
|
}
|
|
}
|
|
|
|
if (is_send(inst) && inst->base_mrf != -1) {
|
|
for (unsigned j = 0; j < inst->mlen; j++)
|
|
mark_read_dependency(st, perf,
|
|
reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
|
|
}
|
|
|
|
/* Mark any destination dependencies. */
|
|
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
|
|
for (unsigned j = 0; j < regs_written(inst); j++) {
|
|
mark_write_dependency(st, perf,
|
|
reg_dependency_id(devinfo, inst->dst, j));
|
|
}
|
|
}
|
|
|
|
if (inst->writes_accumulator_implicitly(devinfo)) {
|
|
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
|
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
|
inst->exec_size - 1); j++)
|
|
mark_write_dependency(st, perf,
|
|
reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
|
}
|
|
|
|
if (const unsigned mask = inst->flags_written(devinfo)) {
|
|
for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
|
|
if (mask & (1 << i))
|
|
mark_write_dependency(st, perf, flag_dependency_id(i));
|
|
}
|
|
}
|
|
|
|
/* Mark any SBID dependencies. */
|
|
if (inst->sched.mode & TGL_SBID_SET) {
|
|
mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
|
|
mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Model the performance behavior of a VEC4 back-end instruction.
|
|
*/
|
|
void
|
|
issue_vec4_instruction(state &st, const struct brw_isa_info *isa,
|
|
const backend_instruction *be_inst)
|
|
{
|
|
const struct intel_device_info *devinfo = isa->devinfo;
|
|
const vec4_instruction *inst =
|
|
static_cast<const vec4_instruction *>(be_inst);
|
|
const instruction_info info(isa, inst);
|
|
const perf_desc perf = instruction_desc(info);
|
|
|
|
/* Stall on any source dependencies. */
|
|
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
|
|
for (unsigned j = 0; j < regs_read(inst, i); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, inst->src[i], j));
|
|
}
|
|
|
|
if (inst->reads_accumulator_implicitly()) {
|
|
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
|
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
|
inst->exec_size - 1); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
|
}
|
|
|
|
if (inst->base_mrf != -1) {
|
|
for (unsigned j = 0; j < inst->mlen; j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(
|
|
devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
|
|
}
|
|
|
|
if (inst->reads_flag())
|
|
stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
|
|
|
|
/* Stall on any write dependencies. */
|
|
if (!inst->no_dd_check) {
|
|
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
|
|
for (unsigned j = 0; j < regs_written(inst); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, inst->dst, j));
|
|
}
|
|
|
|
if (inst->writes_accumulator_implicitly(devinfo)) {
|
|
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
|
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
|
inst->exec_size - 1); j++)
|
|
stall_on_dependency(
|
|
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
|
}
|
|
|
|
if (inst->writes_flag(devinfo))
|
|
stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
|
|
}
|
|
|
|
/* Execute the instruction. */
|
|
execute_instruction(st, perf);
|
|
|
|
/* Mark any source dependencies. */
|
|
if (inst->is_send_from_grf()) {
|
|
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
|
|
for (unsigned j = 0; j < regs_read(inst, i); j++)
|
|
mark_read_dependency(
|
|
st, perf, reg_dependency_id(devinfo, inst->src[i], j));
|
|
}
|
|
}
|
|
|
|
if (inst->base_mrf != -1) {
|
|
for (unsigned j = 0; j < inst->mlen; j++)
|
|
mark_read_dependency(st, perf,
|
|
reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
|
|
}
|
|
|
|
/* Mark any destination dependencies. */
|
|
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
|
|
for (unsigned j = 0; j < regs_written(inst); j++) {
|
|
mark_write_dependency(st, perf,
|
|
reg_dependency_id(devinfo, inst->dst, j));
|
|
}
|
|
}
|
|
|
|
if (inst->writes_accumulator_implicitly(devinfo)) {
|
|
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
|
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
|
inst->exec_size - 1); j++)
|
|
mark_write_dependency(st, perf,
|
|
reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
|
}
|
|
|
|
if (inst->writes_flag(devinfo))
|
|
mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
|
|
}
|
|
|
|
/**
|
|
* Calculate the maximum possible throughput of the program compatible with
|
|
* the cycle-count utilization estimated for each asynchronous unit, in
|
|
* threads-per-cycle units.
|
|
*/
|
|
float
|
|
calculate_thread_throughput(const state &st, float busy)
|
|
{
|
|
for (unsigned i = 0; i < EU_NUM_UNITS; i++)
|
|
busy = MAX2(busy, st.unit_busy[i]);
|
|
|
|
return 1.0 / busy;
|
|
}
|
|
|
|
/**
|
|
* Estimate the performance of the specified shader.
|
|
*/
|
|
void
|
|
calculate_performance(performance &p, const backend_shader *s,
|
|
void (*issue_instruction)(
|
|
state &, const struct brw_isa_info *,
|
|
const backend_instruction *),
|
|
unsigned dispatch_width)
|
|
{
|
|
/* XXX - Note that the previous version of this code used worst-case
|
|
* scenario estimation of branching divergence for SIMD32 shaders,
|
|
* but this heuristic was removed to improve performance in common
|
|
* scenarios. Wider shader variants are less optimal when divergence
|
|
* is high, e.g. when application renders complex scene on a small
|
|
* surface. It is assumed that such renders are short, so their
|
|
* time doesn't matter and when it comes to the overall performance,
|
|
* they are dominated by more optimal larger renders.
|
|
*
|
|
* It's possible that we could do better with divergence analysis
|
|
* by isolating branches which are 100% uniform.
|
|
*
|
|
* Plumbing the trip counts from NIR loop analysis would allow us
|
|
* to do a better job regarding the loop weights.
|
|
*
|
|
* In the meantime use values that roughly match the control flow
|
|
* weights used elsewhere in the compiler back-end.
|
|
*
|
|
* Note that we provide slightly more pessimistic weights on
|
|
* Gfx12+ for SIMD32, since the effective warp size on that
|
|
* platform is 2x the SIMD width due to EU fusion, which increases
|
|
* the likelihood of divergent control flow in comparison to
|
|
* previous generations, giving narrower SIMD modes a performance
|
|
* advantage in several test-cases with non-uniform discard jumps.
|
|
*/
|
|
const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
|
|
1.0 : 0.5);
|
|
const float loop_weight = 10;
|
|
unsigned halt_count = 0;
|
|
unsigned elapsed = 0;
|
|
state st;
|
|
|
|
foreach_block(block, s->cfg) {
|
|
const unsigned elapsed0 = elapsed;
|
|
|
|
foreach_inst_in_block(backend_instruction, inst, block) {
|
|
const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
|
|
|
|
issue_instruction(st, &s->compiler->isa, inst);
|
|
|
|
if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
|
|
st.weight /= discard_weight;
|
|
|
|
elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
|
|
|
|
if (inst->opcode == BRW_OPCODE_DO)
|
|
st.weight *= loop_weight;
|
|
else if (inst->opcode == BRW_OPCODE_WHILE)
|
|
st.weight /= loop_weight;
|
|
else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
|
|
st.weight *= discard_weight;
|
|
}
|
|
|
|
p.block_latency[block->num] = elapsed - elapsed0;
|
|
}
|
|
|
|
p.latency = elapsed;
|
|
p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
|
|
}
|
|
}
|
|
|
|
/**
 * Estimate the performance of the program held by the fs_visitor \p v.
 *
 * One block_latency entry is allocated per block of the visitor's CFG, and
 * the estimate uses the visitor's own dispatch width.
 */
brw::performance::performance(const fs_visitor *v)
   : block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
|
|
|
|
/**
 * Estimate the performance of the program held by the vec4_visitor \p v.
 *
 * One block_latency entry is allocated per block of the visitor's CFG, and
 * the estimate always uses a dispatch width of 8.
 */
brw::performance::performance(const vec4_visitor *v)
   : block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
|
|
|
|
/**
 * Release the per-block latency array allocated by the constructors.
 */
brw::performance::~performance()
{
   delete[] block_latency;
}
|