348 lines
12 KiB
C
348 lines
12 KiB
C
/*
|
|
* Copyright (C) 2019 Collabora, Ltd.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*
|
|
* Authors (Collabora):
|
|
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
|
|
*/
|
|
|
|
#include "compiler.h"
|
|
#include "util/u_math.h"
|
|
#include "util/u_memory.h"
|
|
|
|
/* This pass promotes reads from UBOs to register-mapped uniforms. This saves
|
|
* both instructions and work register pressure, but it reduces the work
|
|
* registers available, requiring a balance.
|
|
*
|
|
* We use a heuristic to determine the ideal count, implemented by
|
|
* mir_work_heuristic, which returns the ideal number of work registers.
|
|
*/
|
|
|
|
static bool
|
|
mir_is_ubo(midgard_instruction *ins)
|
|
{
|
|
return (ins->type == TAG_LOAD_STORE_4) &&
|
|
(OP_IS_UBO_READ(ins->op));
|
|
}
|
|
|
|
static bool
|
|
mir_is_direct_aligned_ubo(midgard_instruction *ins)
|
|
{
|
|
return mir_is_ubo(ins) &&
|
|
!(ins->constants.u32[0] & 0xF) &&
|
|
(ins->src[1] == ~0) &&
|
|
(ins->src[2] == ~0);
|
|
}
|
|
|
|
/* Represents use data for a single UBO */
|
|
|
|
#define MAX_UBO_QWORDS (65536 / 16)
|
|
|
|
struct mir_ubo_block {
|
|
BITSET_DECLARE(uses, MAX_UBO_QWORDS);
|
|
BITSET_DECLARE(pushed, MAX_UBO_QWORDS);
|
|
};
|
|
|
|
struct mir_ubo_analysis {
|
|
/* Per block analysis */
|
|
unsigned nr_blocks;
|
|
struct mir_ubo_block *blocks;
|
|
};
|
|
|
|
static struct mir_ubo_analysis
|
|
mir_analyze_ranges(compiler_context *ctx)
|
|
{
|
|
struct mir_ubo_analysis res = {
|
|
.nr_blocks = ctx->nir->info.num_ubos + 1,
|
|
};
|
|
|
|
res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block));
|
|
|
|
mir_foreach_instr_global(ctx, ins) {
|
|
if (!mir_is_direct_aligned_ubo(ins)) continue;
|
|
|
|
unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
|
|
unsigned offset = ins->constants.u32[0] / 16;
|
|
|
|
assert(ubo < res.nr_blocks);
|
|
|
|
if (offset < MAX_UBO_QWORDS)
|
|
BITSET_SET(res.blocks[ubo].uses, offset);
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
/* Select UBO words to push. A sophisticated implementation would consider the
|
|
* number of uses and perhaps the control flow to estimate benefit. This is not
|
|
* sophisticated. Select from the last UBO first to prioritize sysvals. */
|
|
|
|
static void
|
|
mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords)
|
|
{
|
|
unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4);
|
|
|
|
for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
|
|
struct mir_ubo_block *block = &analysis->blocks[ubo];
|
|
|
|
unsigned vec4;
|
|
BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) {
|
|
/* Don't push more than possible */
|
|
if (push->count > max_words - 4)
|
|
return;
|
|
|
|
for (unsigned offs = 0; offs < 4; ++offs) {
|
|
struct panfrost_ubo_word word = {
|
|
.ubo = ubo,
|
|
.offset = (vec4 * 16) + (offs * 4)
|
|
};
|
|
|
|
push->words[push->count++] = word;
|
|
}
|
|
|
|
/* Mark it as pushed so we can rewrite */
|
|
BITSET_SET(block->pushed, vec4);
|
|
}
|
|
}
|
|
}
|
|
|
|
#if 0
|
|
static void
|
|
mir_dump_ubo_analysis(struct mir_ubo_analysis *res)
|
|
{
|
|
printf("%u blocks\n", res->nr_blocks);
|
|
|
|
for (unsigned i = 0; i < res->nr_blocks; ++i) {
|
|
BITSET_WORD *uses = res->blocks[i].uses;
|
|
BITSET_WORD *push = res->blocks[i].pushed;
|
|
|
|
unsigned last = BITSET_LAST_BIT_SIZED(uses, BITSET_WORDS(MAX_UBO_QWORDS));
|
|
|
|
printf("\t");
|
|
|
|
for (unsigned j = 0; j < last; ++j) {
|
|
bool used = BITSET_TEST(uses, j);
|
|
bool pushed = BITSET_TEST(push, j);
|
|
assert(used || !pushed);
|
|
|
|
putchar(pushed ? '*' : used ? '-' : '_');
|
|
}
|
|
|
|
printf("\n");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
static unsigned
|
|
mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis)
|
|
{
|
|
unsigned count = 0;
|
|
|
|
for (unsigned i = 0; i < analysis->nr_blocks; ++i) {
|
|
BITSET_WORD *uses = analysis->blocks[i].uses;
|
|
|
|
for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w)
|
|
count += util_bitcount(uses[w]);
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
static unsigned
|
|
mir_count_live(uint16_t *live, unsigned temp_count)
|
|
{
|
|
unsigned count = 0;
|
|
|
|
for (unsigned i = 0; i < temp_count; ++i)
|
|
count += util_bitcount(live[i]);
|
|
|
|
return count;
|
|
}
|
|
|
|
static unsigned
|
|
mir_estimate_pressure(compiler_context *ctx)
|
|
{
|
|
mir_invalidate_liveness(ctx);
|
|
mir_compute_liveness(ctx);
|
|
|
|
unsigned max_live = 0;
|
|
|
|
mir_foreach_block(ctx, _block) {
|
|
midgard_block *block = (midgard_block *) _block;
|
|
uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t));
|
|
|
|
mir_foreach_instr_in_block_rev(block, ins) {
|
|
unsigned count = mir_count_live(live, ctx->temp_count);
|
|
max_live = MAX2(max_live, count);
|
|
mir_liveness_ins_update(live, ins, ctx->temp_count);
|
|
}
|
|
|
|
free(live);
|
|
}
|
|
|
|
return DIV_ROUND_UP(max_live, 16);
|
|
}
|
|
|
|
static unsigned
|
|
mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis)
|
|
{
|
|
unsigned uniform_count = mir_promoteable_uniform_count(analysis);
|
|
|
|
/* If there are 8 or fewer uniforms, it doesn't matter what we do, so
|
|
* allow as many work registers as needed */
|
|
|
|
if (uniform_count <= 8)
|
|
return 16;
|
|
|
|
/* Otherwise, estimate the register pressure */
|
|
|
|
unsigned pressure = mir_estimate_pressure(ctx);
|
|
|
|
/* Prioritize not spilling above all else. The relation between the
|
|
* pressure estimate and the actual register pressure is a little
|
|
* murkier than we might like (due to scheduling, pipeline registers,
|
|
* failure to pack vector registers, load/store registers, texture
|
|
* registers...), hence why this is a heuristic parameter */
|
|
|
|
if (pressure > 6)
|
|
return 16;
|
|
|
|
/* If there's no chance of spilling, prioritize UBOs and thread count */
|
|
|
|
return 8;
|
|
}
|
|
|
|
/* Bitset of indices that will be used as a special register -- inputs to a
|
|
* non-ALU op. We precompute this set so that testing is efficient, otherwise
|
|
* we end up O(mn) behaviour for n instructions and m uniform reads */
|
|
|
|
static BITSET_WORD *
|
|
mir_special_indices(compiler_context *ctx)
|
|
{
|
|
mir_compute_temp_count(ctx);
|
|
BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD));
|
|
|
|
mir_foreach_instr_global(ctx, ins) {
|
|
/* Look for special instructions */
|
|
bool is_ldst = ins->type == TAG_LOAD_STORE_4;
|
|
bool is_tex = ins->type == TAG_TEXTURE_4;
|
|
bool is_writeout = ins->compact_branch && ins->writeout;
|
|
|
|
if (!(is_ldst || is_tex || is_writeout))
|
|
continue;
|
|
|
|
/* Anything read by a special instruction is itself special */
|
|
mir_foreach_src(ins, i) {
|
|
unsigned idx = ins->src[i];
|
|
|
|
if (idx < ctx->temp_count)
|
|
BITSET_SET(bset, idx);
|
|
}
|
|
}
|
|
|
|
return bset;
|
|
}
|
|
|
|
void
|
|
midgard_promote_uniforms(compiler_context *ctx)
|
|
{
|
|
if (ctx->inputs->no_ubo_to_push) {
|
|
/* If nothing is pushed, all UBOs need to be uploaded
|
|
* conventionally */
|
|
ctx->ubo_mask = ~0;
|
|
return;
|
|
}
|
|
|
|
struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx);
|
|
|
|
unsigned work_count = mir_work_heuristic(ctx, &analysis);
|
|
unsigned promoted_count = 24 - work_count;
|
|
|
|
/* Ensure we are 16 byte aligned to avoid underallocations */
|
|
mir_pick_ubo(&ctx->info->push, &analysis, promoted_count);
|
|
ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4);
|
|
|
|
/* First, figure out special indices a priori so we don't recompute a lot */
|
|
BITSET_WORD *special = mir_special_indices(ctx);
|
|
|
|
ctx->ubo_mask = 0;
|
|
|
|
mir_foreach_instr_global_safe(ctx, ins) {
|
|
if (!mir_is_ubo(ins)) continue;
|
|
|
|
unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store);
|
|
unsigned qword = ins->constants.u32[0] / 16;
|
|
|
|
if (!mir_is_direct_aligned_ubo(ins)) {
|
|
if (ins->src[1] == ~0)
|
|
ctx->ubo_mask |= BITSET_BIT(ubo);
|
|
else
|
|
ctx->ubo_mask = ~0;
|
|
|
|
continue;
|
|
}
|
|
|
|
/* Check if we decided to push this */
|
|
assert(ubo < analysis.nr_blocks);
|
|
if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) {
|
|
ctx->ubo_mask |= BITSET_BIT(ubo);
|
|
continue;
|
|
}
|
|
|
|
/* Find where we pushed to, TODO: unaligned pushes to pack */
|
|
unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16);
|
|
assert((base & 0x3) == 0);
|
|
|
|
unsigned address = base / 4;
|
|
unsigned uniform_reg = 23 - address;
|
|
|
|
/* Should've taken into account when pushing */
|
|
assert(address < promoted_count);
|
|
unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);
|
|
|
|
/* We do need the move for safety for a non-SSA dest, or if
|
|
* we're being fed into a special class */
|
|
|
|
bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1;
|
|
|
|
if (ins->dest < ctx->temp_count)
|
|
needs_move |= BITSET_TEST(special, ins->dest);
|
|
|
|
if (needs_move) {
|
|
unsigned type_size = nir_alu_type_get_type_size(ins->dest_type);
|
|
midgard_instruction mov = v_mov(promoted, ins->dest);
|
|
mov.dest_type = nir_type_uint | type_size;
|
|
mov.src_types[1] = mov.dest_type;
|
|
|
|
uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size);
|
|
mir_set_bytemask(&mov, rounded);
|
|
mir_insert_instruction_before(ctx, ins, mov);
|
|
} else {
|
|
mir_rewrite_index_src(ctx, ins->dest, promoted);
|
|
}
|
|
|
|
mir_remove_instruction(ins);
|
|
}
|
|
|
|
free(special);
|
|
free(analysis.blocks);
|
|
}
|