/* Source path: mesa/src/panfrost/bifrost/compiler.h */

/*
* Copyright (C) 2020 Collabora Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors (Collabora):
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
*/
#ifndef __BIFROST_COMPILER_H
#define __BIFROST_COMPILER_H
#include "bifrost.h"
#include "bi_opcodes.h"
#include "compiler/nir/nir.h"
#include "panfrost/util/pan_ir.h"
#include "util/u_math.h"
#include "util/half_float.h"
#include "util/u_worklist.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Swizzles across the bytes of a 32-bit word. Expresses the XML's swz
 * directly.
 *
 * A widen is written as the corresponding replicated form: H01 is the
 * identity (widen = none), H00 stands for widen = h0 and B1111 for
 * widen = b1. A lane is also written in replicated form; its interpretation
 * is governed by the opcode. For 8-bit lanes with two channels, use the
 * replicated forms (TODO: what about others?). For 8-bit lanes with four
 * channels, use the matching form (TODO: what about others?).
 */
enum bi_swizzle {
        /* 16-bit swizzles. The ordering is deliberate for fast compute:
         * the value is (first half << 1) | second half, which bi_swz_16()
         * depends on. */
        BI_SWIZZLE_H00 = 0,   /* = B0101 */
        BI_SWIZZLE_H01 = 1,   /* = B0123 = W0 */
        BI_SWIZZLE_H10 = 2,   /* = B2301 */
        BI_SWIZZLE_H11 = 3,   /* = B2323 */

        /* Single byte replicated to all four lanes. The order must be kept
         * for fast compute: bi_byte() forms these as B0000 + lane. */
        BI_SWIZZLE_B0000 = 4,
        BI_SWIZZLE_B1111 = 5,
        BI_SWIZZLE_B2222 = 6,
        BI_SWIZZLE_B3333 = 7,

        /* Totally special, for explicit pattern matching */
        BI_SWIZZLE_B0011 = 8,   /* +SWZ.v4i8 */
        BI_SWIZZLE_B2233 = 9,   /* +SWZ.v4i8 */
        BI_SWIZZLE_B1032 = 10,  /* +SWZ.v4i8 */
        BI_SWIZZLE_B3210 = 11,  /* +SWZ.v4i8 */

        BI_SWIZZLE_B0022 = 12,  /* for b02 lanes */
};
/* Given a packed i16vec2/i8vec4 constant, apply a swizzle. Useful for constant
 * folding and Valhall constant optimization.
 *
 * Lanes are numbered from the least significant end of the word: half 0 is
 * bits [15:0] and byte 0 is bits [7:0]. Each lane is extracted with shifts on
 * the unsigned 32-bit value instead of by aliasing the word as an array, so:
 *
 *  - the result does not depend on host byte order,
 *  - no uint16_t access is made to a uint32_t object, and
 *  - no narrow lane is promoted to (signed) int before being shifted into
 *    bit 31, which would be undefined for lanes >= 0x80 / 0x8000.
 *
 * Returns the swizzled word. An out-of-range swz reaches unreachable(). */
static inline uint32_t
bi_apply_swizzle(uint32_t value, enum bi_swizzle swz)
{
/* Extract 16-bit half / 8-bit byte number i as an unsigned 32-bit quantity */
#define H_LANE(i) ((value >> (16u * (i))) & 0xffffu)
#define B_LANE(i) ((value >> (8u * (i))) & 0xffu)

/* Repack the selected lanes, first argument in the lowest position */
#define H(h0, h1) (H_LANE(h0) | (H_LANE(h1) << 16))
#define B(b0, b1, b2, b3) (B_LANE(b0) | (B_LANE(b1) << 8) | \
                           (B_LANE(b2) << 16) | (B_LANE(b3) << 24))

        /* No default case on purpose, so the compiler can still warn when a
         * new swizzle is added to the enum but not handled here. */
        switch (swz) {
        case BI_SWIZZLE_H00: return H(0, 0);
        case BI_SWIZZLE_H01: return H(0, 1);
        case BI_SWIZZLE_H10: return H(1, 0);
        case BI_SWIZZLE_H11: return H(1, 1);
        case BI_SWIZZLE_B0000: return B(0, 0, 0, 0);
        case BI_SWIZZLE_B1111: return B(1, 1, 1, 1);
        case BI_SWIZZLE_B2222: return B(2, 2, 2, 2);
        case BI_SWIZZLE_B3333: return B(3, 3, 3, 3);
        case BI_SWIZZLE_B0011: return B(0, 0, 1, 1);
        case BI_SWIZZLE_B2233: return B(2, 2, 3, 3);
        case BI_SWIZZLE_B1032: return B(1, 0, 3, 2);
        case BI_SWIZZLE_B3210: return B(3, 2, 1, 0);
        case BI_SWIZZLE_B0022: return B(0, 0, 2, 2);
        }

#undef B
#undef H
#undef B_LANE
#undef H_LANE

        unreachable("Invalid swizzle");
}
/* Discriminates what bi_index::value refers to. Stored in a 3-bit field of
 * bi_index, so values must stay below 8. */
enum bi_index_type {
        BI_INDEX_NULL     = 0,  /* no operand; see bi_null() */
        BI_INDEX_NORMAL   = 1,  /* built by bi_get_index() */
        BI_INDEX_REGISTER = 2,  /* register number; see bi_register() */
        BI_INDEX_CONSTANT = 3,  /* 32-bit immediate; see bi_imm_u32() */
        BI_INDEX_PASS     = 4,  /* bifrost_packed_src; see bi_passthrough() */
        BI_INDEX_FAU      = 5
};
/* A reference to an operand (source or destination) together with its
 * modifiers. The struct is exactly a 32-bit payload followed by 32 bits of
 * bitfields, and is meant to be hashable as one 64-bit quantity. */
typedef struct {
/* Payload whose meaning depends on `type`: the index given to
 * bi_get_index(), a register number, an immediate, or a
 * bifrost_packed_src value. */
uint32_t value;
/* modifiers, should only be set if applicable for a given instruction.
* For *IDP.v4i8, abs plays the role of sign. For bitwise ops where
* applicable, neg plays the role of not */
bool abs : 1;
bool neg : 1;
/* The last use of a value, should be purged from the register cache.
* Set by liveness analysis. */
bool discard : 1;
/* For a source, the swizzle. For a destination, acts a bit like a
* write mask. Identity for the full 32-bit, H00 for only caring about
* the lower half, other values unused. */
enum bi_swizzle swizzle : 4;
/* Offset of the accessed word; compared by bi_is_word_equiv() but ignored
 * by bi_is_equiv(). Only 3 bits wide. */
uint32_t offset : 3;
/* Set from the is_reg argument of bi_get_index(); bi_is_ssa() requires it
 * to be clear. */
bool reg : 1;
enum bi_index_type type : 3;
/* Must be zeroed so we can hash the whole 64-bits at a time.
 * 14 is the number of bitfield bits used above (1+1+1+4+3+1+3). */
unsigned padding : (32 - 14);
} bi_index;
static inline bi_index
bi_get_index(unsigned value, bool is_reg, unsigned offset)
{
return (bi_index) {
.value = value,
.swizzle = BI_SWIZZLE_H01,
.offset = offset,
.reg = is_reg,
.type = BI_INDEX_NORMAL,
};
}
static inline bi_index
bi_register(unsigned reg)
{
assert(reg < 64);
return (bi_index) {
.value = reg,
.swizzle = BI_SWIZZLE_H01,
.type = BI_INDEX_REGISTER,
};
}
static inline bi_index
bi_imm_u32(uint32_t imm)
{
return (bi_index) {
.value = imm,
.swizzle = BI_SWIZZLE_H01,
.type = BI_INDEX_CONSTANT,
};
}
/* Builds a 32-bit constant index from a float. The word stored is whatever
 * fui() returns for `imm` (presumably its raw bit pattern; fui() is not
 * defined in this header). */
static inline bi_index
bi_imm_f32(float imm)
{
        uint32_t word = fui(imm);

        return bi_imm_u32(word);
}
/* Returns the null index: type BI_INDEX_NULL with every other field zero.
 * Recognised by bi_is_null().
 *
 * Declared with (void) so that it is a real prototype; with an empty
 * parameter list a C11 compiler would not diagnose stray arguments. */
static inline bi_index
bi_null(void)
{
        return (bi_index) { .type = BI_INDEX_NULL };
}
/* Returns the 32-bit constant zero, i.e. bi_imm_u32(0).
 *
 * Declared with (void) so that it is a real prototype; with an empty
 * parameter list a C11 compiler would not diagnose stray arguments. */
static inline bi_index
bi_zero(void)
{
        return bi_imm_u32(0);
}
static inline bi_index
bi_passthrough(enum bifrost_packed_src value)
{
return (bi_index) {
.value = value,
.swizzle = BI_SWIZZLE_H01,
.type = BI_INDEX_PASS,
};
}
/* Helps construct 16-bit swizzles. `x` selects the half placed in the low
 * lane and `y` the half placed in the high lane (false = half 0, true =
 * half 1). The index must still carry the identity swizzle. */
static inline bi_index
bi_swz_16(bi_index idx, bool x, bool y)
{
        assert(idx.swizzle == BI_SWIZZLE_H01);

        /* H00..H11 are numbered so that the selector bits are the value */
        unsigned halves = ((unsigned) x << 1) | (unsigned) y;

        idx.swizzle = (enum bi_swizzle)(BI_SWIZZLE_H00 | halves);
        return idx;
}
/* Replicates one 16-bit half of `idx` to both lanes: the upper half if
 * `upper` is set (H11), otherwise the lower half (H00). */
static inline bi_index
bi_half(bi_index idx, bool upper)
{
        bi_index replicated = bi_swz_16(idx, upper, upper);

        return replicated;
}
/* Replicates byte `lane` (0..3) of `idx` to every byte lane. The index must
 * still carry the identity swizzle. */
static inline bi_index
bi_byte(bi_index idx, unsigned lane)
{
        assert(idx.swizzle == BI_SWIZZLE_H01);
        assert(lane < 4);

        /* B0000..B3333 are consecutive, so the lane is a plain offset */
        unsigned encoded = BI_SWIZZLE_B0000 + lane;

        idx.swizzle = (enum bi_swizzle) encoded;
        return idx;
}
/* Returns a copy of `idx` with the abs modifier set */
static inline bi_index
bi_abs(bi_index idx)
{
        bi_index with_abs = idx;

        with_abs.abs = true;
        return with_abs;
}
/* Returns a copy of `idx` with the neg modifier toggled, so negating twice
 * yields the original modifier state. */
static inline bi_index
bi_neg(bi_index idx)
{
        idx.neg = !idx.neg;
        return idx;
}
/* Returns a copy of `idx` flagged as discard (the last use of the value) */
static inline bi_index
bi_discard(bi_index idx)
{
        bi_index last_use = idx;

        last_use.discard = true;
        return last_use;
}
/* Additive identity in IEEE 754 arithmetic: the constant zero with the neg
 * modifier applied.
 *
 * Declared with (void) so that it is a real prototype; with an empty
 * parameter list a C11 compiler would not diagnose stray arguments. */
static inline bi_index
bi_negzero(void)
{
        return bi_neg(bi_zero());
}
/* Replaces an index, preserving any modifiers: the result refers to what
 * `replacement` refers to, but takes abs, neg and swizzle from `old`.
 * discard is always cleared, since it needs liveness analysis to set. */
static inline bi_index
bi_replace_index(bi_index old, bi_index replacement)
{
        bi_index merged = replacement;

        merged.swizzle = old.swizzle;
        merged.neg = old.neg;
        merged.abs = old.abs;
        merged.discard = false;

        return merged;
}
/* Remove any modifiers: clears abs and neg and resets the swizzle to the
 * identity. This has the property:
 *
 *     replace_index(x, strip_index(x)) = x
 *
 * up to the discard flag, which this function leaves untouched but
 * bi_replace_index() always clears. This ensures it is suitable to use when
 * lowering sources to moves. */
static inline bi_index
bi_strip_index(bi_index index)
{
        bi_index bare = index;

        bare.swizzle = BI_SWIZZLE_H01;
        bare.neg = false;
        bare.abs = false;

        return bare;
}
/* For bitwise instructions, where the neg modifier plays the role of "not".
 * This forwards to bi_neg(), which toggles the bit, so applying bi_not()
 * twice restores the original modifier. */
#define bi_not(x) bi_neg(x)
/* Builds an 8-bit immediate: `imm` is stored in a 32-bit constant and read
 * through the byte-0 replicating swizzle (B0000). */
static inline bi_index
bi_imm_u8(uint8_t imm)
{
        bi_index word = bi_imm_u32(imm);

        return bi_byte(word, 0);
}
/* Builds a 16-bit immediate: `imm` is stored in a 32-bit constant and read
 * through the lower-half replicating swizzle (H00). */
static inline bi_index
bi_imm_u16(uint16_t imm)
{
        bi_index word = bi_imm_u32(imm);

        return bi_half(word, false);
}
/* Builds an unsigned immediate of `sz` bits, where sz is 8, 16 or 32. For the
 * narrow sizes `imm` is implicitly truncated by the uint8_t / uint16_t
 * parameter of the constructor it is passed to. */
static inline bi_index
bi_imm_uintN(uint32_t imm, unsigned sz)
{
        assert(sz == 8 || sz == 16 || sz == 32);

        switch (sz) {
        case 8:
                return bi_imm_u8(imm);
        case 16:
                return bi_imm_u16(imm);
        default:
                return bi_imm_u32(imm);
        }
}
/* Builds a 16-bit immediate from a float, converted with
 * _mesa_float_to_half(). */
static inline bi_index
bi_imm_f16(float imm)
{
        uint16_t half_bits = _mesa_float_to_half(imm);

        return bi_imm_u16(half_bits);
}
/* True if `idx` is the null index (type BI_INDEX_NULL) */
static inline bool
bi_is_null(bi_index idx)
{
        const bool is_null = (idx.type == BI_INDEX_NULL);

        return is_null;
}
/* True if `idx` is a BI_INDEX_NORMAL index whose reg flag is clear */
static inline bool
bi_is_ssa(bi_index idx)
{
        if (idx.type != BI_INDEX_NORMAL)
                return false;

        return !idx.reg;
}
/* Compares equivalence as references: only type, reg flag and value are
 * compared. Offsets, swizzles and modifiers are ignored. In other words, this
 * forms bi_index equivalence classes by partitioning memory.
 * E.g. -abs(foo[1].yx) == foo.xy but foo != bar */
static inline bool
bi_is_equiv(bi_index left, bi_index right)
{
        if (left.type != right.type)
                return false;

        if (left.reg != right.reg)
                return false;

        return left.value == right.value;
}
/* A stronger equivalence relation than bi_is_equiv(): the indices must also
 * access the same offset. Useful for RA/scheduling to see what registers
 * will correspond to. */
static inline bool
bi_is_word_equiv(bi_index left, bi_index right)
{
        if (!bi_is_equiv(left, right))
                return false;

        return left.offset == right.offset;
}
/* History note (from the repository blame view): the definition below was
 * last changed by commit "pan/bi: Optimize replication" (2022-01-15), which
 * added a replication analysis used to turn useless swizzle instructions
 * into plain moves. Part-of:
 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14576 */
/* An even stronger equivalence that checks if indices correspond to the
 * right value when evaluated.
 *
 * Two constants match when their swizzled values agree and their abs/neg
 * modifiers agree, even if value and swizzle differ individually. Anything
 * else must match in value, abs, neg, swizzle, offset, reg and type. The
 * discard flag is never compared. */
static inline bool
bi_is_value_equiv(bi_index left, bi_index right)
{
        const bool both_constant = (left.type == BI_INDEX_CONSTANT) &&
                                   (right.type == BI_INDEX_CONSTANT);

        if (both_constant) {
                uint32_t lhs = bi_apply_swizzle(left.value, left.swizzle);
                uint32_t rhs = bi_apply_swizzle(right.value, right.swizzle);

                if (lhs != rhs)
                        return false;

                return (left.abs == right.abs) && (left.neg == right.neg);
        }

        if (left.value != right.value)
                return false;

        if (left.abs != right.abs || left.neg != right.neg)
                return false;

        if (left.swizzle != right.swizzle || left.offset != right.offset)
                return false;

        return (left.reg == right.reg) && (left.type == right.type);
}
/* Size limits. BI_MAX_DESTS and BI_MAX_SRCS are the capacities of the dest[]
 * and src[] arrays in bi_instr. BI_MAX_VEC is not used in this part of the
 * header. */
#define BI_MAX_VEC 8
#define BI_MAX_DESTS 4
#define BI_MAX_SRCS 6
typedef struct {
/* Must be first */
struct list_head link;
enum bi_opcode op;
uint8_t nr_srcs;
uint8_t nr_dests;
/* Data flow */
bi_index dest[BI_MAX_DESTS];
bi_index src[BI_MAX_SRCS];
/* For a branch */
struct bi_block *branch_target;
/* These don't fit neatly with anything else.. */
enum bi_register_format register_format;
enum bi_vecsize vecsize;
/* Flow control associated with a Valhall instruction */
uint8_t flow;
/* Slot associated with a message-passing instruction */
uint8_t slot;
/* Can we spill the value written here? Used to prevent
* useless double fills */
bool no_spill;
/* On Bifrost: A value of bi_table to override the table, inducing a
* DTSEL_IMM pair if nonzero.
*
* On Valhall: the table index to use for resource instructions.
*
* These two interpretations are equivalent if you squint a bit.
*/
unsigned table;
/* Everything after this MUST NOT be accessed directly, since
* interpretation depends on opcodes */
/* Destination modifiers */
union {
enum bi_clamp clamp;
bool saturate;
bool not_result;
pan/bi: Add a constant subexpression elimination pass ALU only. Intended to clean up the lowerings used with complex texturings. Ex: if a shader reads two cube maps at the same coordinates, this deduplicates the cube map transformation. This needs to happen in the backend since we do the cube map transformation with the backend builder, rather than special NIR ops. This is a tradeoff. Pass based on ir3's, which in turn is inspired by NIR's. total instructions in shared programs: 148799 -> 147348 (-0.98%) instructions in affected programs: 20509 -> 19058 (-7.07%) helped: 145 HURT: 0 helped stats (abs) min: 4.0 max: 30.0 x̄: 10.01 x̃: 8 helped stats (rel) min: 1.92% max: 54.55% x̄: 10.87% x̃: 7.41% 95% mean confidence interval for instructions value: -10.73 -9.28 95% mean confidence interval for instructions %-change: -12.81% -8.94% Instructions are helped. total tuples in shared programs: 129992 -> 128908 (-0.83%) tuples in affected programs: 17624 -> 16540 (-6.15%) helped: 145 HURT: 0 helped stats (abs) min: 2.0 max: 25.0 x̄: 7.48 x̃: 7 helped stats (rel) min: 0.74% max: 42.86% x̄: 9.16% x̃: 7.22% 95% mean confidence interval for tuples value: -7.96 -6.99 95% mean confidence interval for tuples %-change: -10.52% -7.79% Tuples are helped. total clauses in shared programs: 27632 -> 27582 (-0.18%) clauses in affected programs: 1077 -> 1027 (-4.64%) helped: 44 HURT: 0 helped stats (abs) min: 1.0 max: 3.0 x̄: 1.14 x̃: 1 helped stats (rel) min: 2.50% max: 16.67% x̄: 4.99% x̃: 4.45% 95% mean confidence interval for clauses value: -1.26 -1.01 95% mean confidence interval for clauses %-change: -5.70% -4.27% Clauses are helped. 
total cycles in shared programs: 12323 -> 12285.63 (-0.30%) cycles in affected programs: 618.25 -> 580.88 (-6.05%) helped: 120 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 0.5416680000000014 x̄: 0.31 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 7.60% x̃: 7.37% 95% mean confidence interval for cycles value: -0.33 -0.29 95% mean confidence interval for cycles %-change: -8.73% -6.47% Cycles are helped. total arith in shared programs: 4916.75 -> 4866.88 (-1.01%) arith in affected programs: 677.79 -> 627.92 (-7.36%) helped: 145 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 1.0833329999999997 x̄: 0.34 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 12.81% x̃: 7.87% 95% mean confidence interval for arith value: -0.37 -0.32 95% mean confidence interval for arith %-change: -15.33% -10.29% Arith are helped. total quadwords in shared programs: 118117 -> 117262 (-0.72%) quadwords in affected programs: 15283 -> 14428 (-5.59%) helped: 143 HURT: 0 helped stats (abs) min: 1.0 max: 23.0 x̄: 5.98 x̃: 5 helped stats (rel) min: 0.44% max: 25.71% x̄: 7.56% x̃: 5.56% 95% mean confidence interval for quadwords value: -6.46 -5.50 95% mean confidence interval for quadwords %-change: -8.59% -6.53% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11733>
2021-06-25 00:36:11 +01:00
unsigned dest_mod;
};
/* Immediates. All seen alone in an instruction, except for varying/texture
* which are specified jointly for VARTEX */
union {
uint32_t shift;
uint32_t fill;
uint32_t index;
uint32_t attribute_index;
struct {
uint32_t varying_index;
uint32_t sampler_index;
uint32_t texture_index;
};
/* TEXC, ATOM_CX: # of staging registers used */
struct {
uint32_t sr_count;
uint32_t sr_count_2;
union {
/* Atomics effectively require all three */
int32_t byte_offset;
/* BLEND requires all three */
int32_t branch_offset;
};
};
};
/* Modifiers specific to particular instructions are thrown in a union */
union {
enum bi_adj adj; /* FEXP_TABLE.u4 */
enum bi_atom_opc atom_opc; /* atomics */
enum bi_func func; /* FPOW_SC_DET */
enum bi_function function; /* LD_VAR_FLAT */
enum bi_mux mux; /* MUX */
enum bi_sem sem; /* FMAX, FMIN */
enum bi_source source; /* LD_GCLK */
bool scale; /* VN_ASST2, FSINCOS_OFFSET */
bool offset; /* FSIN_TABLE, FOCS_TABLE */
bool mask; /* CLZ */
bool threads; /* IMULD, IMOV_FMA */
bool combine; /* BRANCHC */
bool format; /* LEA_TEX */
struct {
enum bi_special special; /* FADD_RSCALE, FMA_RSCALE */
enum bi_round round; /* FMA, converts, FADD, _RSCALE, etc */
bool ftz; /* Flush-to-zero for F16_TO_F32 */
};
struct {
enum bi_result_type result_type; /* FCMP, ICMP */
enum bi_cmpf cmpf; /* CSEL, FCMP, ICMP, BRANCH */
};
struct {
enum bi_stack_mode stack_mode; /* JUMP_EX */
bool test_mode;
};
struct {
enum bi_seg seg; /* LOAD, STORE, SEG_ADD, SEG_SUB */
bool preserve_null; /* SEG_ADD, SEG_SUB */
enum bi_extend extend; /* LOAD, IMUL */
};
struct {
enum bi_sample sample; /* VAR_TEX, LD_VAR */
enum bi_update update; /* VAR_TEX, LD_VAR */
enum bi_varying_name varying_name; /* LD_VAR_SPECIAL */
bool skip; /* VAR_TEX, TEXS, TEXC */
bool lod_mode; /* VAR_TEX, TEXS, implicitly for TEXC */
enum bi_source_format source_format; /* LD_VAR_BUF */
/* Used for valhall texturing */
bool shadow;
bool texel_offset;
bool array_enable;
bool integer_coordinates;
enum bi_fetch_component fetch_component;
enum bi_va_lod_mode va_lod_mode;
enum bi_dimension dimension;
enum bi_write_mask write_mask;
};
pan/bi: Add a constant subexpression elimination pass ALU only. Intended to clean up the lowerings used with complex texturings. Ex: if a shader reads two cube maps at the same coordinates, this deduplicates the cube map transformation. This needs to happen in the backend since we do the cube map transformation with the backend builder, rather than special NIR ops. This is a tradeoff. Pass based on ir3's, which in turn is inspired by NIR's. total instructions in shared programs: 148799 -> 147348 (-0.98%) instructions in affected programs: 20509 -> 19058 (-7.07%) helped: 145 HURT: 0 helped stats (abs) min: 4.0 max: 30.0 x̄: 10.01 x̃: 8 helped stats (rel) min: 1.92% max: 54.55% x̄: 10.87% x̃: 7.41% 95% mean confidence interval for instructions value: -10.73 -9.28 95% mean confidence interval for instructions %-change: -12.81% -8.94% Instructions are helped. total tuples in shared programs: 129992 -> 128908 (-0.83%) tuples in affected programs: 17624 -> 16540 (-6.15%) helped: 145 HURT: 0 helped stats (abs) min: 2.0 max: 25.0 x̄: 7.48 x̃: 7 helped stats (rel) min: 0.74% max: 42.86% x̄: 9.16% x̃: 7.22% 95% mean confidence interval for tuples value: -7.96 -6.99 95% mean confidence interval for tuples %-change: -10.52% -7.79% Tuples are helped. total clauses in shared programs: 27632 -> 27582 (-0.18%) clauses in affected programs: 1077 -> 1027 (-4.64%) helped: 44 HURT: 0 helped stats (abs) min: 1.0 max: 3.0 x̄: 1.14 x̃: 1 helped stats (rel) min: 2.50% max: 16.67% x̄: 4.99% x̃: 4.45% 95% mean confidence interval for clauses value: -1.26 -1.01 95% mean confidence interval for clauses %-change: -5.70% -4.27% Clauses are helped. 
total cycles in shared programs: 12323 -> 12285.63 (-0.30%) cycles in affected programs: 618.25 -> 580.88 (-6.05%) helped: 120 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 0.5416680000000014 x̄: 0.31 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 7.60% x̃: 7.37% 95% mean confidence interval for cycles value: -0.33 -0.29 95% mean confidence interval for cycles %-change: -8.73% -6.47% Cycles are helped. total arith in shared programs: 4916.75 -> 4866.88 (-1.01%) arith in affected programs: 677.79 -> 627.92 (-7.36%) helped: 145 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 1.0833329999999997 x̄: 0.34 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 12.81% x̃: 7.87% 95% mean confidence interval for arith value: -0.37 -0.32 95% mean confidence interval for arith %-change: -15.33% -10.29% Arith are helped. total quadwords in shared programs: 118117 -> 117262 (-0.72%) quadwords in affected programs: 15283 -> 14428 (-5.59%) helped: 143 HURT: 0 helped stats (abs) min: 1.0 max: 23.0 x̄: 5.98 x̃: 5 helped stats (rel) min: 0.44% max: 25.71% x̄: 7.56% x̃: 5.56% 95% mean confidence interval for quadwords value: -6.46 -5.50 95% mean confidence interval for quadwords %-change: -8.59% -6.53% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11733>
2021-06-25 00:36:11 +01:00
/* Maximum size, for hashing */
unsigned flags[14];
pan/bi: Add a constant subexpression elimination pass ALU only. Intended to clean up the lowerings used with complex texturings. Ex: if a shader reads two cube maps at the same coordinates, this deduplicates the cube map transformation. This needs to happen in the backend since we do the cube map transformation with the backend builder, rather than special NIR ops. This is a tradeoff. Pass based on ir3's, which in turn is inspired by NIR's. total instructions in shared programs: 148799 -> 147348 (-0.98%) instructions in affected programs: 20509 -> 19058 (-7.07%) helped: 145 HURT: 0 helped stats (abs) min: 4.0 max: 30.0 x̄: 10.01 x̃: 8 helped stats (rel) min: 1.92% max: 54.55% x̄: 10.87% x̃: 7.41% 95% mean confidence interval for instructions value: -10.73 -9.28 95% mean confidence interval for instructions %-change: -12.81% -8.94% Instructions are helped. total tuples in shared programs: 129992 -> 128908 (-0.83%) tuples in affected programs: 17624 -> 16540 (-6.15%) helped: 145 HURT: 0 helped stats (abs) min: 2.0 max: 25.0 x̄: 7.48 x̃: 7 helped stats (rel) min: 0.74% max: 42.86% x̄: 9.16% x̃: 7.22% 95% mean confidence interval for tuples value: -7.96 -6.99 95% mean confidence interval for tuples %-change: -10.52% -7.79% Tuples are helped. total clauses in shared programs: 27632 -> 27582 (-0.18%) clauses in affected programs: 1077 -> 1027 (-4.64%) helped: 44 HURT: 0 helped stats (abs) min: 1.0 max: 3.0 x̄: 1.14 x̃: 1 helped stats (rel) min: 2.50% max: 16.67% x̄: 4.99% x̃: 4.45% 95% mean confidence interval for clauses value: -1.26 -1.01 95% mean confidence interval for clauses %-change: -5.70% -4.27% Clauses are helped. 
total cycles in shared programs: 12323 -> 12285.63 (-0.30%) cycles in affected programs: 618.25 -> 580.88 (-6.05%) helped: 120 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 0.5416680000000014 x̄: 0.31 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 7.60% x̃: 7.37% 95% mean confidence interval for cycles value: -0.33 -0.29 95% mean confidence interval for cycles %-change: -8.73% -6.47% Cycles are helped. total arith in shared programs: 4916.75 -> 4866.88 (-1.01%) arith in affected programs: 677.79 -> 627.92 (-7.36%) helped: 145 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 1.0833329999999997 x̄: 0.34 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 12.81% x̃: 7.87% 95% mean confidence interval for arith value: -0.37 -0.32 95% mean confidence interval for arith %-change: -15.33% -10.29% Arith are helped. total quadwords in shared programs: 118117 -> 117262 (-0.72%) quadwords in affected programs: 15283 -> 14428 (-5.59%) helped: 143 HURT: 0 helped stats (abs) min: 1.0 max: 23.0 x̄: 5.98 x̃: 5 helped stats (rel) min: 0.44% max: 25.71% x̄: 7.56% x̃: 5.56% 95% mean confidence interval for quadwords value: -6.46 -5.50 95% mean confidence interval for quadwords %-change: -8.59% -6.53% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11733>
2021-06-25 00:36:11 +01:00
struct {
enum bi_subgroup subgroup; /* WMASK, CLPER */
enum bi_inactive_result inactive_result; /* CLPER */
enum bi_lane_op lane_op; /* CLPER */
};
struct {
bool z; /* ZS_EMIT */
bool stencil; /* ZS_EMIT */
};
struct {
bool h; /* VN_ASST1.f16 */
bool l; /* VN_ASST1.f16 */
};
struct {
bool bytes2; /* RROT_DOUBLE, FRSHIFT_DOUBLE */
bool result_word;
bool arithmetic; /* ARSHIFT_OR */
};
struct {
bool sqrt; /* FREXPM */
bool log; /* FREXPM */
};
struct {
enum bi_mode mode; /* FLOG_TABLE */
enum bi_precision precision; /* FLOG_TABLE */
bool divzero; /* FRSQ_APPROX, FRSQ */
};
};
} bi_instr;
/* True if source `s` of instruction `I` is a staging-register read: only
 * sources 0 and 4 qualify, and only when the opcode's property table entry
 * has sr_read set. */
static inline bool
bi_is_staging_src(const bi_instr *I, unsigned s)
{
        const bool staging_position = (s == 0) || (s == 4);

        if (!staging_position)
                return false;

        return bi_opcode_props[I->op].sr_read;
}
/* Represents the assignment of slots for a given bi_tuple. There are four
 * slots; the two enable flags cover the read slots, and slots 2/3 are
 * configured through slot23. */
typedef struct {
/* Register to assign to each slot */
unsigned slot[4];
/* Read slots can be disabled */
bool enabled[2];
/* Configuration for slots 2/3 */
struct bifrost_reg_ctrl_23 slot23;
/* Fast-Access-Uniform RAM index */
uint8_t fau_idx;
/* Whether writes are actually for the last instruction */
bool first_instruction;
} bi_registers;
/* A bi_tuple contains two paired instruction pointers. If a slot is unfilled,
* leave it NULL; the emitter will fill in a nop. Instructions reference
* registers via slots which are assigned per tuple.
*
* The two instruction pointers are named `fma` and `add`; `regs` holds the
* per-tuple slot assignment. Note that fau_idx exists both here and inside
* bi_registers.
*/
typedef struct {
uint8_t fau_idx;
bi_registers regs;
bi_instr *fma;
bi_instr *add;
} bi_tuple;
/* Forward declaration: bi_clause points back at its block before bi_block is
 * defined further down. */
struct bi_block;
/* A clause: a group of up to eight tuples together with the constants they
 * read and the scoreboard, flow-control and message state recorded for the
 * group. Clauses are linked into a list through `link`. */
typedef struct {
struct list_head link;
/* Link back up for branch calculations */
struct bi_block *block;
/* Architectural limit of 8 tuples/clause */
unsigned tuple_count;
bi_tuple tuples[8];
/* For scoreboarding -- the clause ID (this is not globally unique!)
* and its dependencies in terms of other clauses, computed during
* scheduling and used when emitting code. Dependencies expressed as a
* bitfield matching the hardware, except shifted by a clause (the
* shift back to the ISA's off-by-one encoding is worked out when
* emitting clauses) */
unsigned scoreboard_id;
uint8_t dependencies;
/* See ISA header for description */
enum bifrost_flow flow_control;
/* Can we prefetch the next clause? Usually it makes sense, except for
* clauses ending in unconditional branches */
bool next_clause_prefetch;
/* Assigned data register */
unsigned staging_register;
/* Corresponds to the usual bit but shifted by a clause */
bool staging_barrier;
/* Constants read by this clause. ISA limit. Must satisfy:
*
* constant_count + tuple_count <= 13
*
* Also implicitly constant_count <= tuple_count since a tuple only
* reads a single constant.
*/
uint64_t constants[8];
unsigned constant_count;
/* Index of a constant to be PC-relative */
unsigned pcrel_idx;
/* Branches encode a constant offset relative to the program counter
* with some magic flags. By convention, if there is a branch, its
* constant will be last. Set this flag to indicate this is required.
*/
bool branch_constant;
/* Unique in a clause */
enum bifrost_message_type message_type;
bi_instr *message;
/* Discard helper threads */
bool td;
/* Should flush-to-zero mode be enabled for this clause? */
bool ftz;
} bi_clause;
/* Number of scoreboard slots modelled. It sizes the per-slot arrays below and
 * is also the bit width of the per-slot dependency masks. */
#define BI_NUM_SLOTS 8
/* A model for the state of the scoreboard */
struct bi_scoreboard_state {
/** Bitmap of registers read/written by a slot */
uint64_t read[BI_NUM_SLOTS];
uint64_t write[BI_NUM_SLOTS];
/* Nonregister dependencies present by a slot (one bit per slot) */
uint8_t varying : BI_NUM_SLOTS;
uint8_t memory : BI_NUM_SLOTS;
};
/* A basic block: a list of instructions (or of clauses once scheduled), its
 * control-flow edges, and per-block analysis results. */
typedef struct bi_block {
/* Link to next block. Must be first for mir_get_block */
struct list_head link;
/* List of instructions emitted for the current block */
struct list_head instructions;
/* Index of the block in source order */
unsigned index;
/* Control flow graph. At most two successors, with unused entries NULL;
 * predecessors holds bi_block * elements. Edges are added through
 * bi_block_add_successor(), which ignores new edges once
 * unconditional_jumps is set. */
struct bi_block *successors[2];
struct util_dynarray predecessors;
bool unconditional_jumps;
/* Per 32-bit word live masks for the block indexed by node */
uint8_t *live_in;
uint8_t *live_out;
/* If true, uses clauses; if false, uses instructions */
bool scheduled;
struct list_head clauses; /* list of bi_clause */
/* Post-RA liveness */
uint64_t reg_live_in, reg_live_out;
/* Scoreboard state at the start/end of block */
struct bi_scoreboard_state scoreboard_in, scoreboard_out;
/* On Valhall, indicates we need a terminal NOP to implement jumps to
* the end of the shader.
*/
bool needs_nop;
/* Flags available for pass-internal use */
uint8_t pass_flags;
} bi_block;
/* Number of blocks recorded in `block`'s predecessor array */
static inline unsigned
bi_num_predecessors(bi_block *block)
{
        unsigned count = util_dynarray_num_elements(&block->predecessors, bi_block *);

        return count;
}
/* Returns the first block on the list. It is asserted to have no
 * predecessors. */
static inline bi_block *
bi_start_block(struct list_head *blocks)
{
        bi_block *entry = list_first_entry(blocks, bi_block, link);

        assert(bi_num_predecessors(entry) == 0);

        return entry;
}
/* Returns the last block on the list. It is asserted to have no
 * successors. */
static inline bi_block *
bi_exit_block(struct list_head *blocks)
{
        bi_block *tail = list_last_entry(blocks, bi_block, link);

        assert(!tail->successors[0] && !tail->successors[1]);

        return tail;
}
/* Adds the control-flow edge block -> successor.
 *
 * The edge is stored in the first free successor slot, and `block` is
 * appended to the successor's predecessor array. Nothing happens if the edge
 * already exists, or if `block` is marked as ending in an unconditional jump.
 * Running out of slots is a bug and hits unreachable(). */
static inline void
bi_block_add_successor(bi_block *block, bi_block *successor)
{
        assert(block != NULL && successor != NULL);

        /* Cull impossible edges */
        if (block->unconditional_jumps)
                return;

        for (unsigned slot = 0; slot < ARRAY_SIZE(block->successors); ++slot) {
                bi_block *existing = block->successors[slot];

                if (existing == NULL) {
                        /* First free slot: record the edge in both directions */
                        block->successors[slot] = successor;
                        util_dynarray_append(&successor->predecessors, bi_block *, block);
                        return;
                }

                /* Edge already present, nothing to add */
                if (existing == successor)
                        return;
        }

        unreachable("Too many successors");
}
/* Subset of pan_shader_info needed per-variant, in order to support IDVS */
struct bi_shader_info {
        struct panfrost_ubo_push *push;
        struct bifrost_shader_info *bifrost;
        struct panfrost_sysvals *sysvals;
        unsigned tls_size;
        unsigned work_reg_count;

        /* Used by the pass that reorders pushed uniforms. Position and
         * varying shaders share FAU on Bifrost, so this offset ensures a
         * varying shader does not reorder uniforms already selected by the
         * preceding position shader. Introduced by "pan/bi: Reorder pushed
         * uniforms to avoid moves" (mesa merge request 14163). */
        unsigned push_offset;
};
/*
 * Index-driven vertex shading (IDVS) state of the shader being compiled.
 * The numeric values are 0, 1, 2 in declaration order.
 */
enum bi_idvs_mode {
        /* Not using IDVS at all */
        BI_IDVS_NONE = 0,

        /* Using IDVS; this compile produces the position shader */
        BI_IDVS_POSITION,

        /* Using IDVS; this compile produces the varying shader */
        BI_IDVS_VARYING,
};
/*
 * Compiler context: the state carried through compilation of a shader, from
 * the NIR input through instruction selection to the shader-db statistics.
 */
typedef struct {
/* Compile inputs and the NIR shader being translated */
const struct panfrost_compile_inputs *inputs;
nir_shader *nir;
struct bi_shader_info info;
gl_shader_stage stage;
struct list_head blocks; /* list of bi_block */
struct hash_table_u64 *sysval_to_id;
uint32_t quirks;
unsigned arch;
/* IDVS state for the current shader, see enum bi_idvs_mode */
enum bi_idvs_mode idvs;
unsigned num_blocks;
/* In any graphics shader, whether the "IDVS with memory
 * allocation" flow is used. This affects how varyings are loaded and
 * stored. Ignore for compute.
 */
bool malloc_idvs;
/* During NIR->BIR */
bi_block *current_block;
bi_block *after_block;
bi_block *break_block;
bi_block *continue_block;
bool emitted_atest;
/* During NIR->BIR, the coverage bitmap. If this is NULL, the default
 * coverage bitmap should be sourced from preloaded register r60. This is
 * written by ATEST and ZS_EMIT
 */
bi_index coverage;
/* During NIR->BIR, table of preloaded registers, or NULL if never
 * preloaded. (Entries are bi_index values, so "NULL" presumably means a
 * null index as tested by bi_is_null -- TODO confirm.)
 */
bi_index preloaded[64];
/* For creating temporaries: next unused SSA and register numbers,
 * post-incremented by bi_temp() and bi_temp_reg() respectively.
 */
unsigned ssa_alloc;
unsigned reg_alloc;
/* Mask of UBOs that need to be uploaded */
uint32_t ubo_mask;
/* During instruction selection, map from vector bi_index to its scalar
 * components, populated by a split.
 */
struct hash_table_u64 *allocated_vec;
/* Stats for shader-db */
unsigned instruction_count;
unsigned loop_count;
unsigned spills;
unsigned fills;
} bi_context;
/*
 * Unlink an instruction from the list it is linked on. Only the list link is
 * touched here (via list_del); nothing else is done to the instruction.
 */
static inline void
bi_remove_instruction(bi_instr *ins)
{
        struct list_head *node = &ins->link;

        list_del(node);
}
/*
 * Values carried by a bi_index of type BI_INDEX_FAU (see bi_fau()).
 * BIR_FAU_UNIFORM and BIR_FAU_IMMEDIATE are single-bit flags; va_lut() ORs a
 * table entry number into BIR_FAU_IMMEDIATE.
 */
enum bir_fau {
        BIR_FAU_ZERO             = 0x00,
        BIR_FAU_LANE_ID          = 0x01,
        BIR_FAU_WARP_ID          = 0x02,
        BIR_FAU_CORE_ID          = 0x03,
        BIR_FAU_FB_EXTENT        = 0x04,
        BIR_FAU_ATEST_PARAM      = 0x05,
        BIR_FAU_SAMPLE_POS_ARRAY = 0x06,
        BIR_FAU_BLEND_0          = 0x08,
        /* blend descs 1 - 7 */
        BIR_FAU_TYPE_MASK        = 0x0F,

        /* Valhall only */
        BIR_FAU_TLS_PTR          = 0x10,
        BIR_FAU_WLS_PTR          = 0x11,
        BIR_FAU_PROGRAM_COUNTER  = 0x12,

        BIR_FAU_UNIFORM          = 0x80,

        /* Look up table on Valhall */
        BIR_FAU_IMMEDIATE        = 0x100,
};
static inline bi_index
bi_fau(enum bir_fau value, bool hi)
{
return (bi_index) {
.value = value,
.swizzle = BI_SWIZZLE_H01,
.offset = hi ? 1u : 0u,
.type = BI_INDEX_FAU,
};
}
/*
* Builder for Valhall LUT entries. Generally, constants are modeled with
* BI_INDEX_IMMEDIATE in the intermediate representation. This helper is only
* necessary for passes running after lowering constants, as well as when
* lowering constants.
*
*/
static inline bi_index
va_lut(unsigned index)
{
return bi_fau((enum bir_fau) (BIR_FAU_IMMEDIATE | (index >> 1)),
index & 1);
}
/*
 * va_zero_lut is like bi_zero but only works on Valhall. It is intended for
 * use by late passes that run after constants are lowered, specifically
 * register allocation. bi_zero() is preferred where possible.
 *
 * Returns va_lut(0), i.e. LUT entry 0.
 */
static inline bi_index
va_zero_lut(void)
{
        return va_lut(0);
}
/*
 * Upper bound derived from the temporary allocators: the larger of reg_alloc
 * and ssa_alloc, plus two, doubled. The doubling presumably matches the node
 * numbering of bi_get_node(), which packs (value << 1) | reg -- TODO confirm.
 */
static inline unsigned
bi_max_temp(bi_context *ctx)
{
        unsigned highest = MAX2(ctx->reg_alloc, ctx->ssa_alloc);

        return (highest + 2) * 2;
}
/*
 * Allocate a fresh non-register temporary: consumes the next ssa_alloc number
 * (post-increment) and wraps it with bi_get_index(..., false, 0).
 */
static inline bi_index
bi_temp(bi_context *ctx)
{
        unsigned fresh = ctx->ssa_alloc++;

        return bi_get_index(fresh, false, 0);
}
/*
 * Allocate a fresh register temporary: consumes the next reg_alloc number
 * (post-increment) and wraps it with bi_get_index(..., true, 0).
 */
static inline bi_index
bi_temp_reg(bi_context *ctx)
{
        unsigned fresh = ctx->reg_alloc++;

        return bi_get_index(fresh, true, 0);
}
/*
 * Translate a NIR source into a bi_index.
 *
 * Constants of at most 32 bits are inlined automatically as immediates; they
 * will be lowered out by bi_lower_fau where a constant is not allowed.
 * load_const_to_scalar guarantees that this makes sense. Otherwise an SSA
 * source maps to a non-register index and a NIR register source (which must
 * not be indirect) maps to a register index.
 */
static inline bi_index
bi_src_index(nir_src *src)
{
        if (nir_src_is_const(*src) && nir_src_bit_size(*src) <= 32)
                return bi_imm_u32(nir_src_as_uint(*src));

        if (src->is_ssa)
                return bi_get_index(src->ssa->index, false, 0);

        assert(!src->reg.indirect);
        return bi_get_index(src->reg.reg->index, true, 0);
}
/*
 * Translate a NIR destination into a bi_index: an SSA destination maps to a
 * non-register index, a NIR register destination (which must not be
 * indirect) maps to a register index.
 */
static inline bi_index
bi_dest_index(nir_dest *dst)
{
        if (dst->is_ssa)
                return bi_get_index(dst->ssa.index, false, 0);

        assert(!dst->reg.indirect);
        return bi_get_index(dst->reg.reg->index, true, 0);
}
/*
 * Map an index to its node number: (value << 1) | reg for a non-null index
 * of type BI_INDEX_NORMAL, and ~0 for anything else.
 */
static inline unsigned
bi_get_node(bi_index index)
{
        if (!bi_is_null(index) && index.type == BI_INDEX_NORMAL)
                return (index.value << 1) | index.reg;

        return ~0;
}
/*
 * Inverse of the node packing used by bi_get_node(): the PAN_IS_REG bit of
 * node selects register vs. non-register, the bits above bit 0 are the value.
 * Asserts that node is in range and that node_count is not ~0u.
 */
static inline bi_index
bi_node_to_index(unsigned node, unsigned node_count)
{
        assert(node < node_count);
        assert(node_count < ~0u);

        unsigned value = node >> 1;

        return bi_get_index(value, node & PAN_IS_REG, 0);
}
/* Iterators for Bifrost IR */

/*
 * Block iterators over ctx->blocks, declaring `v` as the bi_block cursor.
 * The _rev forms walk backwards; the _from forms take a starting block
 * `from` (whether `from` itself is visited is up to the underlying
 * list_for_each_entry_from* macro). `ctx` is parenthesized so any pointer
 * expression may be passed.
 */
#define bi_foreach_block(ctx, v) \
        list_for_each_entry(bi_block, v, &(ctx)->blocks, link)

#define bi_foreach_block_rev(ctx, v) \
        list_for_each_entry_rev(bi_block, v, &(ctx)->blocks, link)

#define bi_foreach_block_from(ctx, from, v) \
        list_for_each_entry_from(bi_block, v, from, &(ctx)->blocks, link)

#define bi_foreach_block_from_rev(ctx, from, v) \
        list_for_each_entry_from_rev(bi_block, v, from, &(ctx)->blocks, link)
/*
 * Instruction iterators over (block)->instructions, with `v` as the bi_instr
 * cursor. Each one forwards to the list_for_each_entry* macro of the same
 * flavour: plain, _rev (backwards), _safe (presumably tolerates unlinking the
 * current instruction, e.g. with bi_remove_instruction -- confirm against the
 * list implementation), and _from (start at instruction `from`).
 */
#define bi_foreach_instr_in_block(block, v) \
list_for_each_entry(bi_instr, v, &(block)->instructions, link)
#define bi_foreach_instr_in_block_rev(block, v) \
list_for_each_entry_rev(bi_instr, v, &(block)->instructions, link)
#define bi_foreach_instr_in_block_safe(block, v) \
list_for_each_entry_safe(bi_instr, v, &(block)->instructions, link)
#define bi_foreach_instr_in_block_safe_rev(block, v) \
list_for_each_entry_safe_rev(bi_instr, v, &(block)->instructions, link)
#define bi_foreach_instr_in_block_from(block, v, from) \
list_for_each_entry_from(bi_instr, v, from, &(block)->instructions, link)
#define bi_foreach_instr_in_block_from_rev(block, v, from) \
list_for_each_entry_from_rev(bi_instr, v, from, &(block)->instructions, link)
/*
 * Clause iterators over (block)->clauses, with `v` as the bi_clause cursor.
 * Same flavours as the instruction iterators: plain, _rev, _safe, _from and
 * _from_rev, each forwarding to the matching list_for_each_entry* macro.
 */
#define bi_foreach_clause_in_block(block, v) \
list_for_each_entry(bi_clause, v, &(block)->clauses, link)
#define bi_foreach_clause_in_block_rev(block, v) \
list_for_each_entry_rev(bi_clause, v, &(block)->clauses, link)
#define bi_foreach_clause_in_block_safe(block, v) \
list_for_each_entry_safe(bi_clause, v, &(block)->clauses, link)
#define bi_foreach_clause_in_block_from(block, v, from) \
list_for_each_entry_from(bi_clause, v, from, &(block)->clauses, link)
#define bi_foreach_clause_in_block_from_rev(block, v, from) \
list_for_each_entry_from_rev(bi_clause, v, from, &(block)->clauses, link)
/*
 * Visit every instruction of every block of ctx, in forward order. The block
 * cursor is always named `v_block`, so that name is usable in the loop body
 * and must not clash with another `v_block` in the same scope.
 */
#define bi_foreach_instr_global(ctx, v) \
bi_foreach_block(ctx, v_block) \
bi_foreach_instr_in_block(v_block, v)
pan/bi: Propagate fabs/neg/sat Initial support for modifier propagation. Bifrost makes this unreasonably hard. total instructions in shared programs: 151604 -> 150761 (-0.56%) instructions in affected programs: 48773 -> 47930 (-1.73%) helped: 212 HURT: 0 helped stats (abs) min: 1 max: 28 x̄: 3.98 x̃: 1 helped stats (rel) min: 0.29% max: 12.70% x̄: 1.75% x̃: 1.26% 95% mean confidence interval for instructions value: -4.71 -3.25 95% mean confidence interval for instructions %-change: -1.97% -1.53% Instructions are helped. total tuples in shared programs: 131876 -> 131560 (-0.24%) tuples in affected programs: 25393 -> 25077 (-1.24%) helped: 104 HURT: 3 helped stats (abs) min: 1 max: 28 x̄: 3.08 x̃: 2 helped stats (rel) min: 0.34% max: 8.57% x̄: 1.55% x̃: 1.04% HURT stats (abs) min: 1 max: 2 x̄: 1.33 x̃: 1 HURT stats (rel) min: 0.51% max: 2.86% x̄: 1.30% x̃: 0.53% 95% mean confidence interval for tuples value: -3.63 -2.28 95% mean confidence interval for tuples %-change: -1.73% -1.21% Tuples are helped. total clauses in shared programs: 28122 -> 28032 (-0.32%) clauses in affected programs: 2720 -> 2630 (-3.31%) helped: 58 HURT: 1 helped stats (abs) min: 1 max: 6 x̄: 1.57 x̃: 1 helped stats (rel) min: 0.88% max: 14.29% x̄: 4.06% x̃: 3.67% HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 HURT stats (rel) min: 7.69% max: 7.69% x̄: 7.69% x̃: 7.69% 95% mean confidence interval for clauses value: -1.85 -1.20 95% mean confidence interval for clauses %-change: -4.60% -3.13% Clauses are helped. 
total quadwords in shared programs: 119778 -> 119509 (-0.22%) quadwords in affected programs: 20698 -> 20429 (-1.30%) helped: 95 HURT: 1 helped stats (abs) min: 1 max: 28 x̄: 2.85 x̃: 2 helped stats (rel) min: 0.38% max: 7.14% x̄: 1.50% x̃: 1.13% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 3.23% max: 3.23% x̄: 3.23% x̃: 3.23% 95% mean confidence interval for quadwords value: -3.49 -2.11 95% mean confidence interval for quadwords %-change: -1.71% -1.20% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11327>
2021-06-11 00:05:29 +01:00
/*
 * Visit every instruction of every block of ctx, walking both the block list
 * and each block's instruction list backwards. The block cursor is named
 * `v_block`.
 */
#define bi_foreach_instr_global_rev(ctx, v) \
bi_foreach_block_rev(ctx, v_block) \
bi_foreach_instr_in_block_rev(v_block, v)
pan/bi: Propagate fabs/neg/sat Initial support for modifier propagation. Bifrost makes this unreasonably hard. total instructions in shared programs: 151604 -> 150761 (-0.56%) instructions in affected programs: 48773 -> 47930 (-1.73%) helped: 212 HURT: 0 helped stats (abs) min: 1 max: 28 x̄: 3.98 x̃: 1 helped stats (rel) min: 0.29% max: 12.70% x̄: 1.75% x̃: 1.26% 95% mean confidence interval for instructions value: -4.71 -3.25 95% mean confidence interval for instructions %-change: -1.97% -1.53% Instructions are helped. total tuples in shared programs: 131876 -> 131560 (-0.24%) tuples in affected programs: 25393 -> 25077 (-1.24%) helped: 104 HURT: 3 helped stats (abs) min: 1 max: 28 x̄: 3.08 x̃: 2 helped stats (rel) min: 0.34% max: 8.57% x̄: 1.55% x̃: 1.04% HURT stats (abs) min: 1 max: 2 x̄: 1.33 x̃: 1 HURT stats (rel) min: 0.51% max: 2.86% x̄: 1.30% x̃: 0.53% 95% mean confidence interval for tuples value: -3.63 -2.28 95% mean confidence interval for tuples %-change: -1.73% -1.21% Tuples are helped. total clauses in shared programs: 28122 -> 28032 (-0.32%) clauses in affected programs: 2720 -> 2630 (-3.31%) helped: 58 HURT: 1 helped stats (abs) min: 1 max: 6 x̄: 1.57 x̃: 1 helped stats (rel) min: 0.88% max: 14.29% x̄: 4.06% x̃: 3.67% HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 HURT stats (rel) min: 7.69% max: 7.69% x̄: 7.69% x̃: 7.69% 95% mean confidence interval for clauses value: -1.85 -1.20 95% mean confidence interval for clauses %-change: -4.60% -3.13% Clauses are helped. 
total quadwords in shared programs: 119778 -> 119509 (-0.22%) quadwords in affected programs: 20698 -> 20429 (-1.30%) helped: 95 HURT: 1 helped stats (abs) min: 1 max: 28 x̄: 2.85 x̃: 2 helped stats (rel) min: 0.38% max: 7.14% x̄: 1.50% x̃: 1.13% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 3.23% max: 3.23% x̄: 3.23% x̃: 3.23% 95% mean confidence interval for quadwords value: -3.49 -2.11 95% mean confidence interval for quadwords %-change: -1.71% -1.20% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11327>
2021-06-11 00:05:29 +01:00
/*
 * Forward walk over every instruction of every block using the _safe
 * instruction iterator (presumably so the current instruction may be removed
 * inside the body -- confirm against list_for_each_entry_safe). The block
 * cursor is named `v_block`.
 */
#define bi_foreach_instr_global_safe(ctx, v) \
bi_foreach_block(ctx, v_block) \
bi_foreach_instr_in_block_safe(v_block, v)
/*
 * Backward walk over every instruction of every block using the reverse
 * _safe instruction iterator. The block cursor is named `v_block`.
 *
 * Note the per-block iterator is spelled bi_foreach_instr_in_block_safe_rev
 * ("safe" before "rev"); there is no ..._rev_safe variant.
 */
#define bi_foreach_instr_global_rev_safe(ctx, v) \
        bi_foreach_block_rev(ctx, v_block) \
                bi_foreach_instr_in_block_safe_rev(v_block, v)
/*
 * Iterate over the instructions of a tuple: first (tuple)->fma if it is
 * non-NULL, then (tuple)->add if it is non-NULL. `v` is declared by the
 * macro as the bi_instr cursor.
 *
 * Written with a standard conditional rather than the GNU `a ?: b`
 * extension so the header stays valid ISO C.
 */
#define bi_foreach_instr_in_tuple(tuple, v) \
        for (bi_instr *v = (tuple)->fma ? (tuple)->fma : (tuple)->add; \
             v != NULL; \
             v = (v == (tuple)->add) ? NULL : (tuple)->add)
/*
 * Iterate over the successors of blk, stopping at the first NULL entry or
 * after the two slots of blk->successors, whichever comes first.
 *
 * The macro declares both `v` (the bi_block cursor) and a private cursor
 * `_v` in the enclosing scope, so it can be used at most once per scope.
 *
 * The bounds check is performed before the slot is read, so successors[2]
 * (one past the end) is never dereferenced. The macro deliberately does not
 * end in a line continuation, which would splice the following line into it.
 */
#define bi_foreach_successor(blk, v) \
        bi_block *v; \
        bi_block **_v; \
        for (_v = &(blk)->successors[0]; \
             _v < &(blk)->successors[2] && (v = *_v) != NULL; \
             _v++)
/*
 * Iterate over the predecessors of blk, which are stored as bi_block *
 * elements of the (blk)->predecessors dynarray. Forwards to
 * util_dynarray_foreach, so `v` presumably points at each element (i.e. is a
 * bi_block **) -- confirm against util_dynarray_foreach.
 */
#define bi_foreach_predecessor(blk, v) \
util_dynarray_foreach(&(blk)->predecessors, bi_block *, v)
/*
 * Index loops over the source / destination arrays of an instruction. `v` is
 * declared as an unsigned index running from 0 to the array size; the loops
 * cover every slot of ins->src / ins->dest. `ins` is parenthesized so any
 * pointer expression may be passed.
 */
#define bi_foreach_src(ins, v) \
        for (unsigned v = 0; v < ARRAY_SIZE((ins)->src); ++v)

#define bi_foreach_dest(ins, v) \
        for (unsigned v = 0; v < ARRAY_SIZE((ins)->dest); ++v)
/*
 * Nested iteration: for each instruction `ins` of the tuple (see
 * bi_foreach_instr_in_tuple), for each source index `s` of that instruction
 * (see bi_foreach_src).
 */
#define bi_foreach_instr_and_src_in_tuple(tuple, ins, s) \
bi_foreach_instr_in_tuple(tuple, ins) \
bi_foreach_src(ins, s)
/*
 * Return the entry linked before ins, obtained by treating ins's own link as
 * a list head and taking its last entry. No check is made that ins is not
 * the first instruction of its list.
 */
static inline bi_instr *
bi_prev_op(bi_instr *ins)
{
        struct list_head *anchor = &ins->link;

        return list_last_entry(anchor, bi_instr, link);
}
/*
 * Return the entry linked after ins, obtained by treating ins's own link as
 * a list head and taking its first entry. No check is made that ins is not
 * the last instruction of its list.
 */
static inline bi_instr *
bi_next_op(bi_instr *ins)
{
        struct list_head *anchor = &ins->link;

        return list_first_entry(anchor, bi_instr, link);
}
/*
 * Return the block linked after the given one, obtained by treating the
 * block's own link as a list head and taking its first entry. No check is
 * made that block is not the last block of its list.
 */
static inline bi_block *
bi_next_block(bi_block *block)
{
        struct list_head *anchor = &block->link;

        return list_first_entry(anchor, bi_block, link);
}
/* Return the first block on ctx->blocks. */
static inline bi_block *
bi_entry_block(bi_context *ctx)
{
        struct list_head *all_blocks = &ctx->blocks;

        return list_first_entry(all_blocks, bi_block, link);
}
/* BIR manipulation */

/* Queries on a single instruction. The definitions live outside this
 * header; `src` / `dest` are presumably indices into ins->src / ins->dest. */
bool bi_has_arg(const bi_instr *ins, bi_index arg);
unsigned bi_count_read_registers(const bi_instr *ins, unsigned src);
unsigned bi_count_write_registers(const bi_instr *ins, unsigned dest);
bool bi_is_regfmt_16(enum bi_register_format fmt);
unsigned bi_writemask(const bi_instr *ins, unsigned dest);
bi_clause * bi_next_clause(bi_context *ctx, bi_block *block, bi_clause *clause);
bool bi_side_effects(const bi_instr *I);
bool bi_reconverge_branches(bi_block *block);
bool bi_can_replace_with_csel(bi_instr *I);
void bi_replace_mux_with_csel(bi_instr *I, bool must_sign);
/* Printers for each level of the IR; each takes the stdio stream `fp`,
 * presumably as the output destination. */
void bi_print_instr(const bi_instr *I, FILE *fp);
void bi_print_slots(bi_registers *regs, FILE *fp);
void bi_print_tuple(bi_tuple *tuple, FILE *fp);
void bi_print_clause(bi_clause *clause, FILE *fp);
void bi_print_block(bi_block *block, FILE *fp);
void bi_print_shader(bi_context *ctx, FILE *fp);
/* BIR passes */

/* Helper-invocation analysis entry points (definitions are outside this
 * header). */
bool bi_instr_uses_helpers(bi_instr *I);
bool bi_block_terminates_helpers(bi_block *block);
void bi_analyze_helper_terminate(bi_context *ctx);
void bi_mark_clauses_td(bi_context *ctx);
void bi_analyze_helper_requirements(bi_context *ctx);
/* Optimization pass operating on the whole shader context; by its name, copy
 * propagation. */
void bi_opt_copy_prop(bi_context *ctx);
pan/bi: Add a constant subexpression elimination pass ALU only. Intended to clean up the lowerings used with complex texturings. Ex: if a shader reads two cube maps at the same coordinates, this deduplicates the cube map transformation. This needs to happen in the backend since we do the cube map transformation with the backend builder, rather than special NIR ops. This is a tradeoff. Pass based on ir3's, which in turn is inspired by NIR's. total instructions in shared programs: 148799 -> 147348 (-0.98%) instructions in affected programs: 20509 -> 19058 (-7.07%) helped: 145 HURT: 0 helped stats (abs) min: 4.0 max: 30.0 x̄: 10.01 x̃: 8 helped stats (rel) min: 1.92% max: 54.55% x̄: 10.87% x̃: 7.41% 95% mean confidence interval for instructions value: -10.73 -9.28 95% mean confidence interval for instructions %-change: -12.81% -8.94% Instructions are helped. total tuples in shared programs: 129992 -> 128908 (-0.83%) tuples in affected programs: 17624 -> 16540 (-6.15%) helped: 145 HURT: 0 helped stats (abs) min: 2.0 max: 25.0 x̄: 7.48 x̃: 7 helped stats (rel) min: 0.74% max: 42.86% x̄: 9.16% x̃: 7.22% 95% mean confidence interval for tuples value: -7.96 -6.99 95% mean confidence interval for tuples %-change: -10.52% -7.79% Tuples are helped. total clauses in shared programs: 27632 -> 27582 (-0.18%) clauses in affected programs: 1077 -> 1027 (-4.64%) helped: 44 HURT: 0 helped stats (abs) min: 1.0 max: 3.0 x̄: 1.14 x̃: 1 helped stats (rel) min: 2.50% max: 16.67% x̄: 4.99% x̃: 4.45% 95% mean confidence interval for clauses value: -1.26 -1.01 95% mean confidence interval for clauses %-change: -5.70% -4.27% Clauses are helped. 
total cycles in shared programs: 12323 -> 12285.63 (-0.30%) cycles in affected programs: 618.25 -> 580.88 (-6.05%) helped: 120 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 0.5416680000000014 x̄: 0.31 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 7.60% x̃: 7.37% 95% mean confidence interval for cycles value: -0.33 -0.29 95% mean confidence interval for cycles %-change: -8.73% -6.47% Cycles are helped. total arith in shared programs: 4916.75 -> 4866.88 (-1.01%) arith in affected programs: 677.79 -> 627.92 (-7.36%) helped: 145 HURT: 0 helped stats (abs) min: 0.08333299999999966 max: 1.0833329999999997 x̄: 0.34 x̃: 0 helped stats (rel) min: 0.77% max: 66.67% x̄: 12.81% x̃: 7.87% 95% mean confidence interval for arith value: -0.37 -0.32 95% mean confidence interval for arith %-change: -15.33% -10.29% Arith are helped. total quadwords in shared programs: 118117 -> 117262 (-0.72%) quadwords in affected programs: 15283 -> 14428 (-5.59%) helped: 143 HURT: 0 helped stats (abs) min: 1.0 max: 23.0 x̄: 5.98 x̃: 5 helped stats (rel) min: 0.44% max: 25.71% x̄: 7.56% x̃: 5.56% 95% mean confidence interval for quadwords value: -6.46 -5.50 95% mean confidence interval for quadwords %-change: -8.59% -6.53% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11733>
2021-06-25 00:36:11 +01:00
/* Subexpression elimination pass. Per the commit that introduced it: ALU
 * only, intended to clean up the lowerings used with complex texturing (e.g.
 * two cube map reads at the same coordinates share one transformation). */
void bi_opt_cse(bi_context *ctx);
pan/bi: Propagate fabs/neg/sat Initial support for modifier propagation. Bifrost makes this unreasonably hard. total instructions in shared programs: 151604 -> 150761 (-0.56%) instructions in affected programs: 48773 -> 47930 (-1.73%) helped: 212 HURT: 0 helped stats (abs) min: 1 max: 28 x̄: 3.98 x̃: 1 helped stats (rel) min: 0.29% max: 12.70% x̄: 1.75% x̃: 1.26% 95% mean confidence interval for instructions value: -4.71 -3.25 95% mean confidence interval for instructions %-change: -1.97% -1.53% Instructions are helped. total tuples in shared programs: 131876 -> 131560 (-0.24%) tuples in affected programs: 25393 -> 25077 (-1.24%) helped: 104 HURT: 3 helped stats (abs) min: 1 max: 28 x̄: 3.08 x̃: 2 helped stats (rel) min: 0.34% max: 8.57% x̄: 1.55% x̃: 1.04% HURT stats (abs) min: 1 max: 2 x̄: 1.33 x̃: 1 HURT stats (rel) min: 0.51% max: 2.86% x̄: 1.30% x̃: 0.53% 95% mean confidence interval for tuples value: -3.63 -2.28 95% mean confidence interval for tuples %-change: -1.73% -1.21% Tuples are helped. total clauses in shared programs: 28122 -> 28032 (-0.32%) clauses in affected programs: 2720 -> 2630 (-3.31%) helped: 58 HURT: 1 helped stats (abs) min: 1 max: 6 x̄: 1.57 x̃: 1 helped stats (rel) min: 0.88% max: 14.29% x̄: 4.06% x̃: 3.67% HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 HURT stats (rel) min: 7.69% max: 7.69% x̄: 7.69% x̃: 7.69% 95% mean confidence interval for clauses value: -1.85 -1.20 95% mean confidence interval for clauses %-change: -4.60% -3.13% Clauses are helped. 
total quadwords in shared programs: 119778 -> 119509 (-0.22%) quadwords in affected programs: 20698 -> 20429 (-1.30%) helped: 95 HURT: 1 helped stats (abs) min: 1 max: 28 x̄: 2.85 x̃: 2 helped stats (rel) min: 0.38% max: 7.14% x̄: 1.50% x̃: 1.13% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 3.23% max: 3.23% x̄: 3.23% x̃: 3.23% 95% mean confidence interval for quadwords value: -3.49 -2.11 95% mean confidence interval for quadwords %-change: -1.71% -1.20% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11327>
2021-06-11 00:05:29 +01:00
/* Modifier propagation (fabs/neg/sat, per the commit that introduced it), in
 * forward and backward variants. */
void bi_opt_mod_prop_forward(bi_context *ctx);
void bi_opt_mod_prop_backward(bi_context *ctx);
/* Further optimization passes over the whole shader context; by name, dead
 * code elimination (regular and post-register-allocation) and dual texture
 * fusing. Definitions are outside this header. */
void bi_opt_dead_code_eliminate(bi_context *ctx);
void bi_opt_fuse_dual_texture(bi_context *ctx);
void bi_opt_dce_post_ra(bi_context *ctx);
pan/bi: Support message preloading Preload LD_VAR_IMM or VAR_TEX instructions in the first block of fragment shaders on v7. Preloaded messages write to fixed registers; when replacing instructions we insert moves from the registers at the start of the program and hope coalescing goes to town. (Admittedly we don't do any coalescing yet...) The extra moves hurts instruction count in some cases; the win for cycle count should cancel this out. When we get smarter copy prop or RA, those moves should go away anyway. This optimization may hurt register pressure by extending the lifetime of up to eight registers written in the first block. This is expected to be acceptable: on a large shader-db, there are no additional spills/fills, and only two shaders are hurt on thread count. This optimization only applies to v7, as the hardware was not introduced on v6 and was removed for Valhall. total instructions in shared programs: 2451624 -> 2454286 (0.11%) instructions in affected programs: 909046 -> 911708 (0.29%) helped: 4719 HURT: 3341 helped stats (abs) min: 1.0 max: 10.0 x̄: 1.49 x̃: 1 helped stats (rel) min: 0.08% max: 33.33% x̄: 6.79% x̃: 3.92% HURT stats (abs) min: 1.0 max: 50.0 x̄: 2.90 x̃: 2 HURT stats (rel) min: 0.12% max: 66.67% x̄: 6.39% x̃: 3.45% 95% mean confidence interval for instructions value: 0.27 0.39 95% mean confidence interval for instructions %-change: -1.55% -1.11% Inconclusive result (value mean confidence interval and %-change mean confidence interval disagree). 
total tuples in shared programs: 1969529 -> 1963429 (-0.31%) tuples in affected programs: 601327 -> 595227 (-1.01%) helped: 5907 HURT: 1297 helped stats (abs) min: 1.0 max: 8.0 x̄: 1.41 x̃: 1 helped stats (rel) min: 0.07% max: 33.33% x̄: 7.25% x̃: 5.26% HURT stats (abs) min: 1.0 max: 40.0 x̄: 1.73 x̃: 1 HURT stats (rel) min: 0.16% max: 31.75% x̄: 3.38% x̃: 2.02% 95% mean confidence interval for tuples value: -0.88 -0.81 95% mean confidence interval for tuples %-change: -5.52% -5.15% Tuples are helped. total clauses in shared programs: 401689 -> 387830 (-3.45%) clauses in affected programs: 136944 -> 123085 (-10.12%) helped: 8427 HURT: 4 helped stats (abs) min: 1.0 max: 4.0 x̄: 1.65 x̃: 2 helped stats (rel) min: 0.49% max: 50.00% x̄: 19.88% x̃: 18.18% HURT stats (abs) min: 1.0 max: 4.0 x̄: 2.50 x̃: 2 HURT stats (rel) min: 1.96% max: 19.05% x̄: 14.18% x̃: 17.86% 95% mean confidence interval for clauses value: -1.66 -1.63 95% mean confidence interval for clauses %-change: -20.15% -19.58% Clauses are helped. total cycles in shared programs: 202735.83 -> 201862.21 (-0.43%) cycles in affected programs: 16295.46 -> 15421.83 (-5.36%) helped: 3349 HURT: 1962 helped stats (abs) min: 0.041665999999999315 max: 1.0 x̄: 0.32 x̃: 0 helped stats (rel) min: 0.24% max: 100.00% x̄: 40.77% x̃: 33.33% HURT stats (abs) min: 0.041665999999999315 max: 1.5833329999999997 x̄: 0.10 x̃: 0 HURT stats (rel) min: 0.09% max: 31.40% x̄: 2.95% x̃: 1.94% 95% mean confidence interval for cycles value: -0.17 -0.16 95% mean confidence interval for cycles %-change: -25.48% -23.76% Cycles are helped. 
total arith in shared programs: 74665.50 -> 74920.00 (0.34%) arith in affected programs: 16059.92 -> 16314.42 (1.58%) helped: 860 HURT: 3409 helped stats (abs) min: 0.041665999999999315 max: 0.25 x̄: 0.06 x̃: 0 helped stats (rel) min: 0.24% max: 37.50% x̄: 4.73% x̃: 2.56% HURT stats (abs) min: 0.041665999999999315 max: 1.5833329999999997 x̄: 0.09 x̃: 0 HURT stats (rel) min: 0.09% max: 100.00% x̄: 8.99% x̃: 4.21% 95% mean confidence interval for arith value: 0.06 0.06 95% mean confidence interval for arith %-change: 5.83% 6.62% Arith are HURT. total texture in shared programs: 13083.50 -> 11877 (-9.22%) texture in affected programs: 1663 -> 456.50 (-72.55%) helped: 2377 HURT: 3 helped stats (abs) min: 0.5 max: 1.0 x̄: 0.51 x̃: 0 helped stats (rel) min: 6.25% max: 100.00% x̄: 87.12% x̃: 100.00% HURT stats (abs) min: 0.5 max: 0.5 x̄: 0.50 x̃: 0 HURT stats (rel) min: 0.00% max: 25.00% x̄: 16.67% x̃: 25.00% 95% mean confidence interval for texture value: -0.51 -0.50 95% mean confidence interval for texture %-change: -87.98% -86.00% Texture are helped. total vary in shared programs: 10220.62 -> 4183.88 (-59.06%) vary in affected programs: 10126.50 -> 4089.75 (-59.61%) helped: 8538 HURT: 0 helped stats (abs) min: 0.125 max: 1.0 x̄: 0.71 x̃: 0 helped stats (rel) min: 7.14% max: 100.00% x̄: 74.74% x̃: 87.50% 95% mean confidence interval for vary value: -0.71 -0.70 95% mean confidence interval for vary %-change: -75.32% -74.16% Vary are helped. total quadwords in shared programs: 1766717 -> 1757161 (-0.54%) quadwords in affected programs: 553801 -> 544245 (-1.73%) helped: 6760 HURT: 711 helped stats (abs) min: 1.0 max: 11.0 x̄: 1.58 x̃: 1 helped stats (rel) min: 0.09% max: 29.41% x̄: 5.31% x̃: 4.84% HURT stats (abs) min: 1.0 max: 33.0 x̄: 1.54 x̃: 1 HURT stats (rel) min: 0.10% max: 31.13% x̄: 2.53% x̃: 1.61% 95% mean confidence interval for quadwords value: -1.31 -1.25 95% mean confidence interval for quadwords %-change: -4.67% -4.46% Quadwords are helped. 
total threads in shared programs: 52899 -> 52897 (<.01%) threads in affected programs: 4 -> 2 (-50.00%) helped: 0 HURT: 2 total preloads in shared programs: 0 -> 116492 preloads in affected programs: 0 -> 116492 helped: 0 HURT: 8604 HURT stats (abs) min: 2.0 max: 24.0 x̄: 13.54 x̃: 14 HURT stats (rel) min: 0.00% max: 0.00% x̄: 0.00% x̃: 0.00% 95% mean confidence interval for preloads value: 13.45 13.63 95% mean confidence interval for preloads %-change: 0.00% 0.00% Preloads are HURT. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9438>
2022-02-23 18:50:54 +00:00
/* Message preloading. Per the commit that introduced it: preloads
 * LD_VAR_IMM or VAR_TEX instructions in the first block of fragment shaders,
 * inserting moves from the fixed preload registers; applies to v7 only. */
void bi_opt_message_preload(bi_context *ctx);
pan/bi: Push UBOs on Bifrost Based on the Midgard pass. Results look better since Midgard already had a basic UBO pushing pass to begin with. Particularly nice to see the dramatic reduction in spilling. total instructions in shared programs: 169141 -> 161215 (-4.69%) instructions in affected programs: 164102 -> 156176 (-4.83%) helped: 1269 HURT: 90 helped stats (abs) min: 1 max: 61 x̄: 6.50 x̃: 4 helped stats (rel) min: 0.15% max: 17.58% x̄: 6.31% x̃: 5.88% HURT stats (abs) min: 1 max: 170 x̄: 3.58 x̃: 1 HURT stats (rel) min: 0.08% max: 133.33% x̄: 16.65% x̃: 5.26% 95% mean confidence interval for instructions value: -6.28 -5.38 95% mean confidence interval for instructions %-change: -5.39% -4.18% Instructions are helped. total nops in shared programs: 121049 -> 120997 (-0.04%) nops in affected programs: 110024 -> 109972 (-0.05%) helped: 501 HURT: 758 helped stats (abs) min: 1 max: 45 x̄: 5.54 x̃: 2 helped stats (rel) min: 0.25% max: 47.06% x̄: 6.81% x̃: 4.55% HURT stats (abs) min: 1 max: 102 x̄: 3.59 x̃: 3 HURT stats (rel) min: 0.32% max: 50.00% x̄: 7.13% x̃: 6.06% 95% mean confidence interval for nops value: -0.45 0.37 95% mean confidence interval for nops %-change: 1.07% 2.09% Inconclusive result (value mean confidence interval includes 0). total clauses in shared programs: 40388 -> 31610 (-21.73%) clauses in affected programs: 38825 -> 30047 (-22.61%) helped: 1367 HURT: 2 helped stats (abs) min: 1 max: 58 x̄: 6.43 x̃: 5 helped stats (rel) min: 1.34% max: 55.56% x̄: 24.97% x̃: 25.00% HURT stats (abs) min: 2 max: 12 x̄: 7.00 x̃: 7 HURT stats (rel) min: 5.08% max: 6.67% x̄: 5.88% x̃: 5.88% 95% mean confidence interval for clauses value: -6.74 -6.08 95% mean confidence interval for clauses %-change: -25.50% -24.35% Clauses are helped. 
total quadwords in shared programs: 144937 -> 130686 (-9.83%) quadwords in affected programs: 140419 -> 126168 (-10.15%) helped: 1369 HURT: 13 helped stats (abs) min: 1 max: 112 x̄: 10.50 x̃: 7 helped stats (rel) min: 0.23% max: 31.82% x̄: 11.36% x̃: 10.78% HURT stats (abs) min: 1 max: 106 x̄: 10.00 x̃: 1 HURT stats (rel) min: 5.88% max: 10.24% x̄: 9.26% x̃: 10.00% 95% mean confidence interval for quadwords value: -10.96 -9.66 95% mean confidence interval for quadwords %-change: -11.52% -10.82% Quadwords are helped. total spills in shared programs: 1106 -> 705 (-36.26%) spills in affected programs: 1058 -> 657 (-37.90%) helped: 41 HURT: 0 total fills in shared programs: 2241 -> 1645 (-26.60%) fills in affected programs: 2219 -> 1623 (-26.86%) helped: 43 HURT: 2 Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8973>
2021-02-09 23:23:06 +00:00
/* UBO pushing pass (per the commit that introduced it, based on the Midgard
 * pass). */
void bi_opt_push_ubo(bi_context *ctx);
pan/bi: Reorder pushed uniforms to avoid moves On Bifrost and Valhall, push uniforms are loaded into Fast Access Uniform Random Access Memory (FAU-RAM). FAU-RAM is organized as an array of 64-bit slots. A given tuple (Bifrost) or instruction (Valhall) may access at most a single 64-bit slot. If an instruction requires uniforms from multiple 64-bit slots, a uniform-to-register move must be inserted to avoid the hazard. However, if an instruction requires a pair of 32-bit uniforms from the same 64-bit slot, no move is required. To reduce the number of moves we emit, this commit adds an optimization pass that reorders pushed uniforms, trying to group uniforms used by the same instruction. The pass works by creating a graph of pushed uniforms, where edges denote the "both 32-bit uniforms required by the same instruction" relationship. We perform depth-first search on this graph to find the connected components, where each connected component is a cluster of uniforms that are used together. We then select pairs of uniforms from each connected component. The remaining unpaired uniforms (from components of odd sizes) are paired together arbitrarily. In principle, we should weight the graph by number of occurences and choose pairs that maximize the total selected edge weight. This is left for future work, as it is nontrivial -- selecting these edges optimally appears to be NP-hard at first blush. Implementation note: As position and varying shaders share FAU on Bifrost, extra care is taken with a `push_offset` shader stage info parameter that ensures varying shaders do not reorder uniforms selected by the previous position shader. 
total instructions in shared programs: 2503343 -> 2451758 (-2.06%) instructions in affected programs: 1553309 -> 1501724 (-3.32%) helped: 14256 HURT: 8 helped stats (abs) min: 1.0 max: 80.0 x̄: 3.62 x̃: 3 helped stats (rel) min: 0.06% max: 36.36% x̄: 7.31% x̃: 6.67% HURT stats (abs) min: 1.0 max: 2.0 x̄: 1.38 x̃: 1 HURT stats (rel) min: 1.30% max: 12.50% x̄: 4.99% x̃: 3.85% 95% mean confidence interval for instructions value: -3.66 -3.58 95% mean confidence interval for instructions %-change: -7.41% -7.20% Instructions are helped. total tuples in shared programs: 2008399 -> 1969627 (-1.93%) tuples in affected programs: 1146344 -> 1107572 (-3.38%) helped: 12867 HURT: 147 helped stats (abs) min: 1.0 max: 61.0 x̄: 3.03 x̃: 2 helped stats (rel) min: 0.17% max: 42.86% x̄: 6.79% x̃: 4.65% HURT stats (abs) min: 1.0 max: 3.0 x̄: 1.20 x̃: 1 HURT stats (rel) min: 0.29% max: 20.00% x̄: 2.12% x̃: 1.19% 95% mean confidence interval for tuples value: -3.03 -2.93 95% mean confidence interval for tuples %-change: -6.82% -6.57% Tuples are helped. total clauses in shared programs: 408005 -> 401708 (-1.54%) clauses in affected programs: 90760 -> 84463 (-6.94%) helped: 6006 HURT: 164 helped stats (abs) min: 1.0 max: 9.0 x̄: 1.08 x̃: 1 helped stats (rel) min: 0.45% max: 33.33% x̄: 12.44% x̃: 14.29% HURT stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 HURT stats (rel) min: 1.64% max: 25.00% x̄: 9.81% x̃: 5.26% 95% mean confidence interval for clauses value: -1.03 -1.01 95% mean confidence interval for clauses %-change: -12.03% -11.66% Clauses are helped. 
total cycles in shared programs: 203308.37 -> 202737.83 (-0.28%) cycles in affected programs: 19264.71 -> 18694.17 (-2.96%) helped: 3024 HURT: 41 helped stats (abs) min: 0.041665999999999315 max: 2.5416680000000014 x̄: 0.19 x̃: 0 helped stats (rel) min: 0.17% max: 33.33% x̄: 3.83% x̃: 2.83% HURT stats (abs) min: 0.041665999999999315 max: 0.125 x̄: 0.06 x̃: 0 HURT stats (rel) min: 0.30% max: 5.88% x̄: 1.41% x̃: 0.93% 95% mean confidence interval for cycles value: -0.19 -0.18 95% mean confidence interval for cycles %-change: -3.89% -3.64% Cycles are helped. total arith in shared programs: 76265.67 -> 74669.25 (-2.09%) arith in affected programs: 45001.50 -> 43405.08 (-3.55%) helped: 12945 HURT: 97 helped stats (abs) min: 0.041665999999999315 max: 2.5416680000000014 x̄: 0.12 x̃: 0 helped stats (rel) min: 0.17% max: 50.00% x̄: 8.06% x̃: 4.88% HURT stats (abs) min: 0.041665999999999315 max: 0.125 x̄: 0.05 x̃: 0 HURT stats (rel) min: 0.21% max: 33.33% x̄: 2.16% x̃: 0.96% 95% mean confidence interval for arith value: -0.12 -0.12 95% mean confidence interval for arith %-change: -8.16% -7.81% Arith are helped. total quadwords in shared programs: 1796563 -> 1766803 (-1.66%) quadwords in affected programs: 948830 -> 919070 (-3.14%) helped: 12078 HURT: 219 helped stats (abs) min: 1.0 max: 42.0 x̄: 2.49 x̃: 2 helped stats (rel) min: 0.10% max: 33.33% x̄: 5.57% x̃: 5.26% HURT stats (abs) min: 1.0 max: 4.0 x̄: 1.21 x̃: 1 HURT stats (rel) min: 0.33% max: 6.67% x̄: 2.00% x̃: 1.14% 95% mean confidence interval for quadwords value: -2.46 -2.38 95% mean confidence interval for quadwords %-change: -5.52% -5.36% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14163>
2021-12-11 17:54:01 +00:00
/* Entry points declared for use across the compiler (declarations only).
 * Judging by their names these are optimization, lowering, scoreboard
 * assignment and register allocation passes -- confirm against the
 * definitions. All of them take the shader context and return nothing, except
 * bi_lower_opt_instruction, which takes a single instruction. */
void bi_opt_reorder_push(bi_context *ctx);
void bi_lower_swizzle(bi_context *ctx);
void bi_lower_fau(bi_context *ctx);
void bi_assign_scoreboard(bi_context *ctx);
void bi_register_allocate(bi_context *ctx);
void va_optimize(bi_context *ctx);
void va_lower_split_64bit(bi_context *ctx);
void bi_lower_opt_instruction(bi_instr *I);
pan/bi: Schedule for pressure pre-RA Add a bottom-up pre-RA list scheduler that aims to reduce register pressure, roughly the same as we use on Midgard to great effect. It uses a simple heuristic: greedily select instructions that reduce liveness. To avoid regressions, the algorithm throws away schedules that increase the maximum number of lives (used as an estimate of register pressure -- if we had SSA form, this would be exact). We might be better off using Sarkar. But for something I could type out in an afternoon, I'll happily accept a >50% reduction in spills. Instruction count is regressed due to extra moves around the blend shader ABI in some cases, at least on Bifrost this is mostly hidden by the clause scheduler. Thread count and spills/fills are both much improved here. There are numerous opportunities for future improvements to pre-RA scheduling: * Better heuristics? (Something more global than liveness alone) * Reducing false dependencies with memory access * Improve ILP for message-passing instructions? This is a tradeoff. * Simplify the code if we have SSA in the future. But for now, I think this is well worth it already. v2: Various clean-ups and memory leak fix (Icecream95). Reduce false dependencies to eliminate spilling in more shaders. shader-db stats on Mali-G52: total instructions in shared programs: 2438841 -> 2439698 (0.04%) instructions in affected programs: 1206421 -> 1207278 (0.07%) helped: 3113 HURT: 4011 helped stats (abs) min: 1.0 max: 50.0 x̄: 3.25 x̃: 2 helped stats (rel) min: 0.13% max: 44.83% x̄: 4.09% x̃: 2.11% HURT stats (abs) min: 1.0 max: 18.0 x̄: 2.73 x̃: 2 HURT stats (rel) min: 0.11% max: 57.14% x̄: 3.86% x̃: 2.07% 95% mean confidence interval for instructions value: 0.02 0.22 95% mean confidence interval for instructions %-change: 0.23% 0.54% Instructions are HURT. 
total tuples in shared programs: 1927077 -> 1946583 (1.01%) tuples in affected programs: 1118627 -> 1138133 (1.74%) helped: 2874 HURT: 6295 helped stats (abs) min: 1.0 max: 82.0 x̄: 3.51 x̃: 2 helped stats (rel) min: 0.17% max: 33.33% x̄: 4.60% x̃: 3.57% HURT stats (abs) min: 1.0 max: 47.0 x̄: 4.70 x̃: 3 HURT stats (rel) min: 0.20% max: 50.00% x̄: 5.16% x̃: 4.32% 95% mean confidence interval for tuples value: 2.00 2.25 95% mean confidence interval for tuples %-change: 1.97% 2.23% Tuples are HURT. total clauses in shared programs: 356053 -> 357793 (0.49%) clauses in affected programs: 151578 -> 153318 (1.15%) helped: 2196 HURT: 3813 helped stats (abs) min: 1.0 max: 49.0 x̄: 2.16 x̃: 1 helped stats (rel) min: 0.18% max: 69.01% x̄: 10.26% x̃: 8.33% HURT stats (abs) min: 1.0 max: 25.0 x̄: 1.70 x̃: 1 HURT stats (rel) min: 0.57% max: 66.67% x̄: 10.64% x̃: 8.33% 95% mean confidence interval for clauses value: 0.22 0.36 95% mean confidence interval for clauses %-change: 2.68% 3.33% Clauses are HURT. total cycles in shared programs: 167761.17 -> 167922.04 (0.10%) cycles in affected programs: 24494.21 -> 24655.08 (0.66%) helped: 862 HURT: 3054 helped stats (abs) min: 0.041665999999999315 max: 53.0 x̄: 0.69 x̃: 0 helped stats (rel) min: 0.28% max: 76.81% x̄: 5.65% x̃: 3.03% HURT stats (abs) min: 0.041665999999999315 max: 2.0416659999999993 x̄: 0.25 x̃: 0 HURT stats (rel) min: 0.26% max: 41.18% x̄: 4.91% x̃: 3.92% 95% mean confidence interval for cycles value: -0.04 0.12 95% mean confidence interval for cycles %-change: 2.36% 2.81% Inconclusive result (value mean confidence interval includes 0). 
total arith in shared programs: 73875.37 -> 74393.17 (0.70%) arith in affected programs: 43142.42 -> 43660.21 (1.20%) helped: 3632 HURT: 5443 helped stats (abs) min: 0.041665999999999315 max: 1.2083360000000027 x̄: 0.15 x̃: 0 helped stats (rel) min: 0.22% max: 100.00% x̄: 6.70% x̃: 4.76% HURT stats (abs) min: 0.041665999999999315 max: 2.0416659999999993 x̄: 0.19 x̃: 0 HURT stats (rel) min: 0.00% max: 166.67% x̄: 5.91% x̃: 4.08% 95% mean confidence interval for arith value: 0.05 0.06 95% mean confidence interval for arith %-change: 0.65% 1.07% Arith are HURT. total texture in shared programs: 11936 -> 11936 (0.00%) texture in affected programs: 0 -> 0 helped: 0 HURT: 0 total vary in shared programs: 4180.88 -> 4180.88 (0.00%) vary in affected programs: 0 -> 0 helped: 0 HURT: 0 total ldst in shared programs: 137551 -> 137028 (-0.38%) ldst in affected programs: 834 -> 311 (-62.71%) helped: 13 HURT: 0 helped stats (abs) min: 15.0 max: 53.0 x̄: 40.23 x̃: 53 helped stats (rel) min: 19.15% max: 100.00% x̄: 68.11% x̃: 76.81% 95% mean confidence interval for ldst value: -50.49 -29.98 95% mean confidence interval for ldst %-change: -84.37% -51.84% Ldst are helped. total quadwords in shared programs: 1684883 -> 1692021 (0.42%) quadwords in affected programs: 949463 -> 956601 (0.75%) helped: 3981 HURT: 5098 helped stats (abs) min: 1.0 max: 86.0 x̄: 3.53 x̃: 3 helped stats (rel) min: 0.18% max: 33.33% x̄: 5.82% x̃: 4.48% HURT stats (abs) min: 1.0 max: 50.0 x̄: 4.15 x̃: 3 HURT stats (rel) min: 0.17% max: 50.00% x̄: 5.11% x̃: 3.85% 95% mean confidence interval for quadwords value: 0.67 0.90 95% mean confidence interval for quadwords %-change: 0.17% 0.47% Quadwords are HURT. 
total threads in shared programs: 53276 -> 53653 (0.71%) threads in affected programs: 581 -> 958 (64.89%) helped: 445 HURT: 68 helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00% HURT stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 HURT stats (rel) min: 50.00% max: 50.00% x̄: 50.00% x̃: 50.00% 95% mean confidence interval for threads value: 0.68 0.79 95% mean confidence interval for threads %-change: 75.70% 84.53% Threads are helped. total preloads in shared programs: 116312 -> 116312 (0.00%) preloads in affected programs: 0 -> 0 helped: 0 HURT: 0 total loops in shared programs: 128 -> 128 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total spills in shared programs: 92 -> 37 (-59.78%) spills in affected programs: 55 -> 0 helped: 13 HURT: 0 total fills in shared programs: 658 -> 190 (-71.12%) fills in affected programs: 468 -> 0 helped: 13 HURT: 0 Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16378>
2022-05-06 18:49:30 +01:00
/* Scheduling entry points operating on the whole shader context. */
void bi_pressure_schedule(bi_context *ctx);
void bi_schedule(bi_context *ctx);
/* Yes/no queries about a single instruction. Presumably bi_can_fma and
 * bi_can_add report which execution unit an instruction may be placed on --
 * confirm against the definitions. */
bool bi_can_fma(bi_instr *ins);
bool bi_can_add(bi_instr *ins);
bool bi_must_message(bi_instr *ins);
bool bi_reads_zero(bi_instr *ins);
/* For the two queries below, `src` is presumably the index of the source
 * operand being asked about -- TODO confirm. */
bool bi_reads_temps(bi_instr *ins, unsigned src);
bool bi_reads_t(bi_instr *ins, unsigned src);
/* IR validation hooks. When NDEBUG is defined the real validators are not
 * declared; inline stubs take their place so callers need no conditionals:
 * the initialization check always reports success and bi_validate does
 * nothing. */
#ifdef NDEBUG
static inline bool
bi_validate_initialization(UNUSED bi_context *ctx)
{
        return true;
}

static inline void
bi_validate(UNUSED bi_context *ctx, UNUSED const char *after_str)
{
}
#else
bool bi_validate_initialization(bi_context *ctx);
void bi_validate(bi_context *ctx, const char *after_str);
#endif
/* Constant folding. bi_fold_constant returns a 32-bit value and reports
 * through the `unsupported` out-parameter; presumably *unsupported is set when
 * the instruction cannot be folded -- confirm against the definition. */
uint32_t bi_fold_constant(bi_instr *I, bool *unsupported);
bool bi_opt_constant_fold(bi_context *ctx);
/* Liveness */
void bi_compute_liveness(bi_context *ctx);
void bi_liveness_ins_update(uint8_t *live, bi_instr *ins, unsigned max);
void bi_postra_liveness(bi_context *ctx);
/* `live` is passed by value and the result is marked MUST_CHECK, so the
 * outcome of the update is only available through the return value. */
uint64_t MUST_CHECK bi_postra_liveness_ins(uint64_t live, bi_instr *ins);
/* Layout */
signed bi_block_offset(bi_context *ctx, bi_clause *start, bi_block *target);
bool bi_ec0_packed(unsigned tuple_count);
/* Returns true when no instructions remain from `block` onwards: either the
 * block does not exist, or it is empty and both of its successors are
 * themselves terminal. The recursion is needed because a shader may end with
 * several empty blocks in a row. */
static inline bool
bi_is_terminal_block(bi_block *block)
{
        if (block == NULL)
                return true;

        /* Any instruction in this block means there is still work ahead */
        if (!list_is_empty(&block->instructions))
                return false;

        if (!bi_is_terminal_block(block->successors[0]))
                return false;

        return bi_is_terminal_block(block->successors[1]);
}
/* Code emit */
/* Both entry points take the shader context and a dynarray named `emission`,
 * presumably the buffer the packed binary is appended to -- confirm against
 * the definitions. */
/* Returns the size of the final clause */
unsigned bi_pack(bi_context *ctx, struct util_dynarray *emission);
/* Valhall counterpart by name; returns nothing. */
void bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission);
/* A packed tuple held as two 64-bit words. Presumably `lo` carries the low
 * bits and `hi` the high bits of the encoding -- confirm against the packing
 * code that fills these in. */
struct bi_packed_tuple {
uint64_t lo;
uint64_t hi;
};
/* Clause packing helpers (declarations only). Several of them take an array
 * of packed tuples together with `tuple_count`; that parameter is annotated
 * ASSERTED, so presumably it is only consumed by assertions (bounds checks)
 * -- confirm against the definitions. */
uint8_t bi_pack_literal(enum bi_clause_subword literal);
uint8_t
bi_pack_upper(enum bi_clause_subword upper,
struct bi_packed_tuple *tuples,
ASSERTED unsigned tuple_count);
/* `offset` and `nbits` presumably select a bit range within the subword
 * named by `idx` -- TODO confirm. */
uint64_t
bi_pack_tuple_bits(enum bi_clause_subword idx,
struct bi_packed_tuple *tuples,
ASSERTED unsigned tuple_count,
unsigned offset, unsigned nbits);
uint8_t
bi_pack_sync(enum bi_clause_subword t1,
enum bi_clause_subword t2,
enum bi_clause_subword t3,
struct bi_packed_tuple *tuples,
ASSERTED unsigned tuple_count,
bool z);
/* Writes into `emission`; returns nothing. */
void
bi_pack_format(struct util_dynarray *emission,
unsigned index,
struct bi_packed_tuple *tuples,
ASSERTED unsigned tuple_count,
uint64_t header, uint64_t ec0,
unsigned m0, bool z);
/* Per-unit instruction packers: each takes the instruction plus up to four
 * already-encoded source selectors and returns the packed bits as an
 * unsigned. */
unsigned bi_pack_fma(bi_instr *I,
enum bifrost_packed_src src0,
enum bifrost_packed_src src1,
enum bifrost_packed_src src2,
enum bifrost_packed_src src3);
unsigned bi_pack_add(bi_instr *I,
enum bifrost_packed_src src0,
enum bifrost_packed_src src1,
enum bifrost_packed_src src2,
enum bifrost_packed_src src3);
/* Like in NIR, for use with the builder */
/* Each option says where bi_builder_insert links the next instruction
 * relative to the cursor's block or instruction. */
enum bi_cursor_option {
/* Append at the tail of the block's instruction list */
bi_cursor_after_block,
/* Insert immediately before the referenced instruction */
bi_cursor_before_instr,
/* Insert immediately after the referenced instruction */
bi_cursor_after_instr
};
/* A position in the IR at which instructions are inserted. `option` selects
 * which member of the anonymous union is meaningful: `block` for
 * bi_cursor_after_block, `instr` for the two instruction-relative options. */
typedef struct {
enum bi_cursor_option option;
union {
bi_block *block;
bi_instr *instr;
};
} bi_cursor;
/* Builds a cursor that places new instructions at the end of `block`. */
static inline bi_cursor
bi_after_block(bi_block *block)
{
        bi_cursor cursor = {
                .option = bi_cursor_after_block,
                .block = block,
        };

        return cursor;
}
/* Builds a cursor that places new instructions immediately before `instr`. */
static inline bi_cursor
bi_before_instr(bi_instr *instr)
{
        bi_cursor cursor = {
                .option = bi_cursor_before_instr,
                .instr = instr,
        };

        return cursor;
}
/* Builds a cursor that places new instructions immediately after `instr`. */
static inline bi_cursor
bi_after_instr(bi_instr *instr)
{
        bi_cursor cursor = {
                .option = bi_cursor_after_instr,
                .instr = instr,
        };

        return cursor;
}
/* Cursor positioned before the first instruction of a block that is known to
 * contain at least one instruction. The caller must guarantee the block is
 * nonempty; bi_before_block() handles the case where that is not known. */
static inline bi_cursor
bi_before_nonempty_block(bi_block *block)
{
        /* list_first_entry() never yields NULL: on an empty list it returns a
         * pointer computed from the list head itself, so the precondition has
         * to be checked on the list rather than on the entry. */
        assert(!list_is_empty(&block->instructions));

        bi_instr *I = list_first_entry(&block->instructions, bi_instr, link);
        return bi_before_instr(I);
}
/* Cursor at the start of `block`. An empty block has no first instruction to
 * anchor on, so the end-of-block cursor is used instead. */
static inline bi_cursor
bi_before_block(bi_block *block)
{
        return list_is_empty(&block->instructions) ?
                bi_after_block(block) :
                bi_before_nonempty_block(block);
}
/* Invariant: a tuple must be nonempty UNLESS it is the last tuple of a clause,
 * in which case there must exist a nonempty penultimate tuple */

/* Returns the tuple's FMA instruction if it has one, otherwise its ADD
 * instruction. The tuple must not be empty. */
ATTRIBUTE_RETURNS_NONNULL static inline bi_instr *
bi_first_instr_in_tuple(bi_tuple *tuple)
{
        bi_instr *first = tuple->fma;

        if (first == NULL)
                first = tuple->add;

        assert(first != NULL);
        return first;
}
/* The first instruction of a clause is the first instruction of its first
 * tuple. */
ATTRIBUTE_RETURNS_NONNULL static inline bi_instr *
bi_first_instr_in_clause(bi_clause *clause)
{
        bi_tuple *head = &clause->tuples[0];

        return bi_first_instr_in_tuple(head);
}
/* Returns the last instruction of a clause: the ADD instruction of the final
 * tuple if present, else its FMA instruction. If the final tuple is empty,
 * the same lookup is done on the penultimate tuple, which must then exist. */
ATTRIBUTE_RETURNS_NONNULL static inline bi_instr *
bi_last_instr_in_clause(bi_clause *clause)
{
        bi_tuple *last = &clause->tuples[clause->tuple_count - 1];
        bi_instr *instr = last->add ? last->add : last->fma;

        if (instr == NULL) {
                /* Empty final tuple: fall back to the one before it */
                assert(clause->tuple_count >= 2);

                bi_tuple *penultimate =
                        &clause->tuples[clause->tuple_count - 2];
                instr = penultimate->add ? penultimate->add : penultimate->fma;
        }

        assert(instr != NULL);
        return instr;
}
/* Implemented by expanding bi_foreach_instr_in_block_from(_rev) with the start
 * (end) of the clause and adding a condition for the clause boundary */
/* Both macros declare `pos` as a bi_instr pointer scoped to the loop. The
 * loop stops at the block's list head or one step past the clause boundary,
 * whichever comes first. Note that `clause` is expanded inside the loop
 * condition, so the boundary instruction is looked up again on every
 * iteration and `clause` should be free of side effects. */
#define bi_foreach_instr_in_clause(block, clause, pos) \
for (bi_instr *pos = list_entry(bi_first_instr_in_clause(clause), bi_instr, link); \
(&pos->link != &(block)->instructions) \
&& (pos != bi_next_op(bi_last_instr_in_clause(clause))); \
pos = list_entry(pos->link.next, bi_instr, link))
/* Reverse variant: starts at the clause's last instruction and walks
 * backwards through the list. */
#define bi_foreach_instr_in_clause_rev(block, clause, pos) \
for (bi_instr *pos = list_entry(bi_last_instr_in_clause(clause), bi_instr, link); \
(&pos->link != &(block)->instructions) \
&& pos != bi_prev_op(bi_first_instr_in_clause(clause)); \
pos = list_entry(pos->link.prev, bi_instr, link))
/* Cursor immediately before the first instruction of `clause`. */
static inline bi_cursor
bi_before_clause(bi_clause *clause)
{
        bi_instr *first = bi_first_instr_in_clause(clause);

        return bi_before_instr(first);
}
/* Cursor immediately before the first instruction of `tuple`. */
static inline bi_cursor
bi_before_tuple(bi_tuple *tuple)
{
        bi_instr *first = bi_first_instr_in_tuple(tuple);

        return bi_before_instr(first);
}
/* Cursor immediately after the last instruction of `clause`. */
static inline bi_cursor
bi_after_clause(bi_clause *clause)
{
        bi_instr *last = bi_last_instr_in_clause(clause);

        return bi_after_instr(last);
}
/* IR builder in terms of cursor infrastructure */
/* Pairs the shader context being built with the cursor giving the current
 * insertion point. */
typedef struct {
bi_context *shader;
bi_cursor cursor;
} bi_builder;
/* Creates a builder for `ctx` whose insertion point starts at `cursor`. */
static inline bi_builder
bi_init_builder(bi_context *ctx, bi_cursor cursor)
{
        bi_builder builder = {
                .shader = ctx,
                .cursor = cursor,
        };

        return builder;
}
/* Links I into the instruction list at the position the cursor describes,
 * then repoints the cursor to just after I, so that a following insertion
 * lands behind the instruction that was just added. */
static inline void
bi_builder_insert(bi_cursor *cursor, bi_instr *I)
{
        const enum bi_cursor_option where = cursor->option;

        if (where == bi_cursor_after_instr) {
                list_add(&I->link, &cursor->instr->link);
        } else if (where == bi_cursor_after_block) {
                list_addtail(&I->link, &cursor->block->instructions);
        } else if (where == bi_cursor_before_instr) {
                list_addtail(&I->link, &cursor->instr->link);
        } else {
                unreachable("Invalid cursor option");
        }

        /* Whatever the starting position, the cursor now trails I */
        cursor->option = bi_cursor_after_instr;
        cursor->instr = I;
}
/* Source operand for reads whose value does not matter ("power-efficient
 * garbage"): zero when the shader's arch is 9 or newer, otherwise the FAU
 * high passthrough. TODO maybe merge with null? */
static inline bi_index
bi_dontcare(bi_builder *b)
{
        return (b->shader->arch >= 9) ?
                bi_zero() :
                bi_passthrough(BIFROST_SRC_FAU_HI);
}
/* Wrappers over the generic u_worklist API that pass `bi_block` and `index`
 * as the block type and index member. In bi_worklist_init the context
 * argument is parenthesized before the member access so that any pointer
 * expression (a conditional, a dereference, pointer arithmetic) can be passed
 * as `ctx`, not just a plain identifier. */
#define bi_worklist_init(ctx, w)        u_worklist_init(w, (ctx)->num_blocks, ctx)
#define bi_worklist_push_head(w, block) u_worklist_push_head(w, block, index)
#define bi_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index)
#define bi_worklist_peek_head(w)        u_worklist_peek_head(w, bi_block, index)
#define bi_worklist_pop_head(w)         u_worklist_pop_head(w, bi_block, index)
#define bi_worklist_peek_tail(w)        u_worklist_peek_tail(w, bi_block, index)
#define bi_worklist_pop_tail(w)         u_worklist_pop_tail(w, bi_block, index)
/* NIR passes */
/* Operates on a nir_shader rather than on the backend IR. The bool result is
 * presumably the usual NIR "made progress" flag, and `lanes` presumably the
 * number of lanes to lower for -- confirm against the definition. */
bool bi_lower_divergent_indirects(nir_shader *shader, unsigned lanes);
#ifdef __cplusplus
} /* extern C */
#endif
#endif