pan/bi: Lift split/collect cache from AGX
Design based on ACO (and fruitful discussions with Daniel). Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16585>
This commit is contained in:
parent
8fdb01b96f
commit
7bfaa119f4
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2020 Collabora Ltd.
|
||||
* Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -116,6 +117,144 @@ bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
|
|||
b->shader->current_block->unconditional_jumps = true;
|
||||
}
|
||||
|
||||
/* Builds a 64-bit hash table key for an index */
|
||||
static uint64_t
|
||||
bi_index_to_key(bi_index idx)
|
||||
{
|
||||
static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
|
||||
|
||||
uint64_t key = 0;
|
||||
memcpy(&key, &idx, sizeof(idx));
|
||||
return key;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract a single channel out of a vector source. We split vectors with SPLIT
|
||||
* so we can use the split components directly, without emitting an extract.
|
||||
* This has advantages of RA, as the split can usually be optimized away.
|
||||
*/
|
||||
static bi_index
|
||||
bi_extract(bi_builder *b, bi_index vec, unsigned channel)
|
||||
{
|
||||
/* Extract caching relies on SSA form. It is incorrect for nir_register.
|
||||
* Bypass the cache and emit an explicit split for registers.
|
||||
*/
|
||||
if (vec.reg) {
|
||||
bi_instr *I = bi_split_i32_to(b, bi_null(), vec);
|
||||
I->nr_dests = channel + 1;
|
||||
I->dest[channel] = bi_temp(b->shader);
|
||||
return I->dest[channel];
|
||||
}
|
||||
|
||||
bi_index *components =
|
||||
_mesa_hash_table_u64_search(b->shader->allocated_vec,
|
||||
bi_index_to_key(vec));
|
||||
|
||||
/* No extract needed for scalars.
|
||||
*
|
||||
* This is a bit imprecise, but actual bugs (missing splits for vectors)
|
||||
* should be caught by the following assertion. It is too difficult to
|
||||
* ensure bi_extract is only called for real vectors.
|
||||
*/
|
||||
if (components == NULL && channel == 0)
|
||||
return vec;
|
||||
|
||||
assert(components != NULL && "missing bi_cache_collect()");
|
||||
return components[channel];
|
||||
}
|
||||
|
||||
static void
|
||||
bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
|
||||
{
|
||||
/* Lifetime of a hash table entry has to be at least as long as the table */
|
||||
bi_index *channels = ralloc_array(b->shader, bi_index, n);
|
||||
memcpy(channels, s, sizeof(bi_index) * n);
|
||||
|
||||
_mesa_hash_table_u64_insert(b->shader->allocated_vec,
|
||||
bi_index_to_key(dst), channels);
|
||||
}
|
||||
|
||||
/*
|
||||
* Splits an n-component vector (vec) into n scalar destinations (dests) using a
|
||||
* split pseudo-instruction.
|
||||
*
|
||||
* Pre-condition: dests is filled with bi_null().
|
||||
*/
|
||||
static void
|
||||
bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
|
||||
{
|
||||
/* Setup the destinations */
|
||||
for (unsigned i = 0; i < n; ++i) {
|
||||
dests[i] = bi_temp(b->shader);
|
||||
}
|
||||
|
||||
/* Emit the split */
|
||||
if (n == 1) {
|
||||
bi_mov_i32_to(b, dests[0], vec);
|
||||
} else {
|
||||
bi_instr *I = bi_split_i32_to(b, dests[0], vec);
|
||||
I->nr_dests = n;
|
||||
|
||||
for (unsigned j = 1; j < n; ++j)
|
||||
I->dest[j] = dests[j];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
|
||||
{
|
||||
bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
|
||||
bi_emit_split_i32(b, dests, vec, n);
|
||||
bi_cache_collect(b, vec, dests, n);
|
||||
}
|
||||
|
||||
/*
|
||||
* Emit and cache a split for a vector of a given bitsize. The vector may not be
|
||||
* composed of 32-bit words, but it will be split at 32-bit word boundaries.
|
||||
*/
|
||||
static void
|
||||
bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
|
||||
{
|
||||
bi_emit_cached_split(b, vec, DIV_ROUND_UP(bits, 32));
|
||||
}
|
||||
|
||||
static bi_instr *
|
||||
bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
|
||||
{
|
||||
/* Special case: COLLECT of a single value is a scalar move */
|
||||
if (n == 1)
|
||||
return bi_mov_i32_to(b, dst, chan[0]);
|
||||
|
||||
bi_instr *I = bi_collect_i32_to(b, dst);
|
||||
I->nr_srcs = n;
|
||||
|
||||
for (unsigned i = 0; i < n; ++i)
|
||||
I->src[i] = chan[i];
|
||||
|
||||
bi_cache_collect(b, dst, chan, n);
|
||||
return I;
|
||||
}
|
||||
|
||||
static bi_instr *
|
||||
bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
|
||||
{
|
||||
return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2);
|
||||
}
|
||||
|
||||
static bi_instr *
|
||||
bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
|
||||
{
|
||||
return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3);
|
||||
}
|
||||
|
||||
static bi_index
|
||||
bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
|
||||
{
|
||||
bi_index dst = bi_temp(b->shader);
|
||||
bi_collect_v2i32_to(b, dst, s0, s1);
|
||||
return dst;
|
||||
}
|
||||
|
||||
static bi_index
|
||||
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
|
||||
{
|
||||
|
@ -154,7 +293,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
|
|||
bi_index f[2];
|
||||
for (unsigned i = 0; i < 2; ++i) {
|
||||
f[i] = bi_fadd_rscale_f32(b,
|
||||
bi_word(offset, i),
|
||||
bi_extract(b, offset, i),
|
||||
bi_imm_f32(0.5), bi_imm_u32(8),
|
||||
BI_SPECIAL_NONE);
|
||||
}
|
||||
|
@ -225,39 +364,6 @@ bi_make_vec_to(bi_builder *b, bi_index final_dst,
|
|||
unsigned count,
|
||||
unsigned bitsize);
|
||||
|
||||
static bi_instr *
|
||||
bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
|
||||
{
|
||||
bi_instr *I = bi_collect_i32_to(b, dst);
|
||||
|
||||
I->nr_srcs = 2;
|
||||
I->src[0] = s0;
|
||||
I->src[1] = s1;
|
||||
|
||||
return I;
|
||||
}
|
||||
|
||||
static bi_instr *
|
||||
bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
|
||||
{
|
||||
bi_instr *I = bi_collect_i32_to(b, dst);
|
||||
|
||||
I->nr_srcs = 3;
|
||||
I->src[0] = s0;
|
||||
I->src[1] = s1;
|
||||
I->src[2] = s2;
|
||||
|
||||
return I;
|
||||
}
|
||||
|
||||
static bi_index
|
||||
bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
|
||||
{
|
||||
bi_index dst = bi_temp(b->shader);
|
||||
bi_collect_v2i32_to(b, dst, s0, s1);
|
||||
return dst;
|
||||
}
|
||||
|
||||
/* Bifrost's load instructions lack a component offset despite operating in
|
||||
* terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
|
||||
* but they may be unavoidable with separate shaders in use. To solve this, we
|
||||
|
@ -4690,6 +4796,8 @@ bi_compile_variant_nir(nir_shader *nir,
|
|||
nir_print_shader(nir, stdout);
|
||||
}
|
||||
|
||||
ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
|
||||
|
||||
nir_foreach_function(func, nir) {
|
||||
if (!func->impl)
|
||||
continue;
|
||||
|
|
|
@ -766,6 +766,11 @@ typedef struct {
|
|||
/* Mask of UBOs that need to be uploaded */
|
||||
uint32_t ubo_mask;
|
||||
|
||||
/* During instruction selection, map from vector bi_index to its scalar
|
||||
* components, populated by a split.
|
||||
*/
|
||||
struct hash_table_u64 *allocated_vec;
|
||||
|
||||
/* Stats for shader-db */
|
||||
unsigned instruction_count;
|
||||
unsigned loop_count;
|
||||
|
|
Loading…
Reference in New Issue