pan/bi: Rework varying linking on Valhall

Valhall introduces hardware-allocated varyings. Instead of allocating varying
descriptors on the CPU with a slot based interface, the driver just tells the
hardware how many bytes to allocate per vertex and loads/stores with byte
offsets. This is much nicer!
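
A sketch of the model (illustrative pseudocode, not the actual descriptor
packing; load_varying is a made-up helper):

   /* Driver side: report bytes per vertex, one highp vec4 per slot */
   cfg.vertex_attribute_stride = num_slots * 16;

   /* Shader side: load/store varyings at plain byte offsets */
   value = load_varying(varying_buffer, slot_index * 16);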

However, this requires us to rework our linking code to account for separable
shaders. With separable shaders, we can't rely on driver_location matching
between stages, and unlike on Midgard, we can't resolve the differences with
curated command stream descriptors. However, we *can* rely on slots matching. So
we should "just" determine the byte offsets based on the slot, and then
separable shaders work.
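
A minimal sketch of that idea, assuming the worst-case 16 bytes (one highp
vec4) per slot used throughout this patch; the helper name is illustrative:

   /* Both stages derive the same byte offset from the gl_varying_slot,
    * with no cross-stage descriptor fixup required. */
   static unsigned
   varying_offset_bytes(gl_varying_slot loc)
   {
           return 16 * (loc - VARYING_SLOT_VAR0);
   }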

For GLES, it really is that easy.

For desktop GL, it's not -- desktop GL brings unpredictable extra varyings like
COL1 and TEX2. Allocating space for all of these unconditionally would hamper
performance. To cope, we key fragment shaders to the set of non-GLES varyings
written by the linked vertex shader. Then we may define an efficient ABI, where
apps pay only for what they use.
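
As a worked example with an illustrative mask: if the linked vertex shader
additionally writes COL0 and TEX0, the key has two bits set and the layout
derived by bi_varying_base_bytes below becomes:

   byte  0: COL0   /* special slots first, packed in bit order */
   byte 16: TEX0
   byte 32: VAR0   /* general: 16 * (2 special + 0) */
   byte 48: VAR1   /* general: 16 * (2 special + 1) */

A pure GLES workload has an empty mask and pays nothing extra.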

Fixes various tests in dEQP-GLES31.functional.separate_shader.random.* on
Valhall.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16310>
Alyssa Rosenzweig, 2022-05-03 12:31:01 -04:00, committed by Marge Bot
commit 0fcddd4d2c (parent 635d8d6bd7)
7 changed files with 123 additions and 49 deletions


@@ -69,6 +69,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
         struct panfrost_compile_inputs inputs = {
                 .gpu_id = dev->gpu_id,
                 .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
+                .fixed_varying_mask = state->key.fixed_varying_mask
         };
 
         memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));


@@ -3360,9 +3360,15 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
         pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
                 if (secondary_shader) {
+                        unsigned v = vs->info.varyings.output_count;
+                        unsigned f = fs->info.varyings.input_count;
+                        unsigned slots = MAX2(v, f);
+                        slots += util_bitcount(fs->key.fixed_varying_mask);
+                        unsigned size = slots * 16;
+
                         /* Assumes 16 byte slots. We could do better. */
-                        cfg.vertex_packet_stride = vs->info.varyings.output_count * 16;
-                        cfg.vertex_attribute_stride = fs->info.varyings.input_count * 16;
+                        cfg.vertex_packet_stride = size + 16;
+                        cfg.vertex_attribute_stride = size;
                 } else {
                         /* Hardware requirement for "no varyings" */
                         cfg.vertex_packet_stride = 16;

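To make the stride arithmetic above concrete, with made-up counts: a vertex
shader writing 2 generic varyings, a fragment shader reading 3, and one
special varying in the key give

   unsigned slots = MAX2(2, 3) + 1;   /* = 4 slots */
   unsigned size  = slots * 16;       /* = 64 bytes per vertex */
   /* vertex_packet_stride = 80, vertex_attribute_stride = 64 */

where the packet stride carries the extra 16 bytes required above.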

@@ -307,6 +307,13 @@ panfrost_create_shader_state(
         else
                 so->nir = cso->ir.nir;
 
+        /* Fix linkage early */
+        if (so->nir->info.stage == MESA_SHADER_VERTEX) {
+                so->fixed_varying_mask =
+                        (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
+                        ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
+        }
+
         /* Precompile for shader-db if we need to */
         if (unlikely(dev->debug & PAN_DBG_PRECOMPILE)) {
                 struct panfrost_context *ctx = pan_context(pctx);
@@ -372,6 +379,7 @@ panfrost_build_key(struct panfrost_context *ctx,
         struct panfrost_device *dev = pan_device(ctx->base.screen);
         struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
         struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer;
+        struct panfrost_shader_variants *vs = ctx->shader[MESA_SHADER_VERTEX];
 
         key->fs.nr_cbufs = fb->nr_cbufs;
@@ -398,6 +406,12 @@
                         key->fs.rt_formats[i] = fmt;
                 }
         }
+
+        /* Funny desktop GL varying lowering on Valhall */
+        if (dev->arch >= 9) {
+                assert(vs != NULL && "too early");
+                key->fixed_varying_mask = vs->fixed_varying_mask;
+        }
 }
 
 /**
@@ -508,13 +522,20 @@ panfrost_update_shader_variant(struct panfrost_context *ctx,
         if (type == PIPE_SHADER_COMPUTE)
                 return;
 
+        /* We need linking information, defer this */
+        if (type == PIPE_SHADER_FRAGMENT && !ctx->shader[PIPE_SHADER_VERTEX])
+                return;
+
         /* Match the appropriate variant */
         signed variant = -1;
         struct panfrost_shader_variants *variants = ctx->shader[type];
 
         simple_mtx_lock(&variants->lock);
 
-        struct panfrost_shader_key key = { 0 };
+        struct panfrost_shader_key key = {
+                .fixed_varying_mask = variants->fixed_varying_mask
+        };
+
         panfrost_build_key(ctx, &key, variants->nir);
 
         for (unsigned i = 0; i < variants->variant_count; ++i) {
@@ -539,6 +560,10 @@ static void
 panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
 {
         panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);
+
+        /* Fragment shaders are linked with vertex shaders */
+        struct panfrost_context *ctx = pan_context(pctx);
+        panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 static void

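As an illustrative example of the mask computed in
panfrost_create_shader_state: a vertex shader writing POS, PSIZ, COL0, TEX1
and VAR0 yields

   (outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0))
           & ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ
   == VARYING_BIT_COL0 | VARYING_BIT_TEX1

so only the two legacy varyings enter the fragment shader key.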

@@ -272,6 +272,9 @@ struct panfrost_fs_key {
 };
 
 struct panfrost_shader_key {
+        /* Valhall needs special handling for desktop GL varyings */
+        uint32_t fixed_varying_mask;
+
         /* If we need vertex shader keys, union it in */
         struct panfrost_fs_key fs;
 };
@@ -315,6 +318,12 @@ struct panfrost_shader_variants {
         unsigned variant_count;
 
+        /* On vertex shaders, bit mask of special desktop-only varyings to link
+         * with the fragment shader. Used on Valhall to implement separable
+         * shaders for desktop GL.
+         */
+        uint32_t fixed_varying_mask;
+
         /* The current active variant */
         unsigned active_variant;
 };


@@ -284,6 +284,41 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
         bi_copy_component(b, instr, dest);
 }
 
+/*
+ * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
+ * come later, sparsely packed. This handles both linked and separable shaders
+ * with a common code path, with minimal keying only for desktop GL. Each slot
+ * consumes 16 bytes (TODO: fp16, partial vectors).
+ */
+static unsigned
+bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+        uint32_t mask = ctx->inputs->fixed_varying_mask;
+
+        if (sem.location >= VARYING_SLOT_VAR0) {
+                unsigned nr_special = util_bitcount(mask);
+                unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
+
+                return 16 * (nr_special + general_index);
+        } else {
+                return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
+        }
+}
+
+/*
+ * Compute the offset in bytes of a varying with an immediate offset, adding the
+ * offset to the base computed above. Convenience method.
+ */
+static unsigned
+bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_src *src = nir_get_io_offset_src(intr);
+        assert(nir_src_is_const(*src) && "assumes immediate offset");
+
+        return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
+}
+
 static void
 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -328,7 +363,8 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
         if (b->shader->malloc_idvs && immediate) {
                 /* Immediate index given in bytes. */
                 bi_ld_var_buf_imm_f32_to(b, dest, src0, regfmt, sample, update,
-                                         vecsize, imm_index * 16);
+                                         vecsize,
+                                         bi_varying_offset(b->shader, instr));
         } else if (immediate && smooth) {
                 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                      vecsize, imm_index);
@@ -339,24 +375,31 @@
                 bi_index idx = bi_src_index(offset);
                 unsigned base = nir_intrinsic_base(instr);
 
-                if (base != 0)
-                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
-
                 if (b->shader->malloc_idvs) {
                         /* Index needs to be in bytes, but NIR gives the index
-                         * in slots. For now assume 16 bytes per slots.
-                         *
-                         * TODO: more complex linking?
+                         * in slots. For now assume 16 bytes per element.
                          */
-                        idx = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
-                        bi_ld_var_buf_f32_to(b, dest, src0, idx, regfmt, sample,
-                                             update, vecsize);
+                        bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
+                        unsigned vbase = bi_varying_base_bytes(b->shader, instr);
+
+                        if (vbase != 0)
+                                idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
+
+                        bi_ld_var_buf_f32_to(b, dest, src0, idx_bytes, regfmt,
+                                             sample, update, vecsize);
                 } else if (smooth) {
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
                         I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                          update, vecsize);
                 } else {
-                        I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
-                                              regfmt, vecsize);
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
+                        I = bi_ld_var_flat_to(b, dest, idx,
+                                              BI_FUNCTION_NONE, regfmt,
+                                              vecsize);
                 }
         }
@@ -794,39 +837,6 @@ bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
         return false;
 }
 
-/**
- * Computes the offset in bytes of a varying. This assumes VARYING_SLOT_POS is
- * mapped to location=0 and always present. This also assumes each slot
- * consumes 16 bytes, which is a worst-case (highp vec4). In the future, this
- * should be optimized to support fp16 and partial vectors. There are
- * nontrivial interactions with separable shaders, however.
- */
-static unsigned
-bi_varying_offset(nir_shader *nir, nir_intrinsic_instr *intr)
-{
-        nir_src *offset = nir_get_io_offset_src(intr);
-        assert(nir_src_is_const(*offset) && "no indirect varyings on Valhall");
-
-        unsigned loc = 0;
-        unsigned slot = nir_intrinsic_base(intr) + nir_src_as_uint(*offset);
-
-        nir_foreach_shader_out_variable(var, nir) {
-                if ((var->data.location == VARYING_SLOT_POS) ||
-                    (var->data.location == VARYING_SLOT_PSIZ))
-                        continue;
-
-                if (var->data.driver_location > slot)
-                        continue;
-
-                if (var->data.driver_location == slot)
-                        return loc;
-
-                loc += 16; // todo size
-        }
-
-        unreachable("Unlinked variable");
-}
-
 static void
 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -880,7 +890,7 @@
                                     bi_src_index(&instr->src[0]),
                                     address, bi_word(address, 1),
                                     varying ? BI_SEG_VARY : BI_SEG_POS,
-                                    varying ? bi_varying_offset(b->shader->nir, instr) : 0);
+                                    varying ? bi_varying_offset(b->shader, instr) : 0);
         } else if (immediate) {
                 bi_index address = bi_lea_attr_imm(b,
                                                    bi_vertex_id(b), bi_instance_id(b),

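To illustrate the reworked indirect path with made-up values: a dynamic index
of 3 slots, loading general varyings with two special slots keyed in, gives

   /* slots -> bytes, then skip the packed special slots */
   idx_bytes = 3 << 4;            /* 48 */
   vbase     = 16 * 2;            /* 32, from bi_varying_base_bytes */
   address   = idx_bytes + vbase; /* 80 bytes into the varying buffer */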

@@ -42,6 +42,7 @@ GENX(pan_shader_get_compiler_options)(void)
 #endif
 }
 
+#if PAN_ARCH <= 7
 static enum pipe_format
 varying_format(nir_alu_type t, unsigned ncomps)
 {
@@ -157,6 +158,7 @@ collect_varyings(nir_shader *s, nir_variable_mode varying_mode,
                 *varying_count = MAX2(*varying_count, loc + sz);
         }
 }
+#endif
 
 #if PAN_ARCH >= 6
 static enum mali_register_file_format
@@ -230,8 +232,14 @@ GENX(pan_shader_compile)(nir_shader *s,
                 info->vs.writes_point_size =
                         s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
 
+#if PAN_ARCH >= 9
+                info->varyings.output_count =
+                        util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_out, info->varyings.output,
                                  &info->varyings.output_count);
+#endif
                 break;
         case MESA_SHADER_FRAGMENT:
                 if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
@@ -286,8 +294,13 @@
                 info->fs.reads_face =
                         (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) ||
                         BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
+
+#if PAN_ARCH >= 9
+                info->varyings.input_count =
+                        util_last_bit(s->info.inputs_read >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_in, info->varyings.input,
                                  &info->varyings.input_count);
+#endif
                 break;
         case MESA_SHADER_COMPUTE:
                 info->wls_size = s->info.shared_size;

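The util_last_bit counting above is sparse by design. With an illustrative
mask, a fragment shader reading only VAR0 and VAR2 gives

   util_last_bit(inputs_read >> VARYING_SLOT_VAR0)   /* 0b101 -> 3 */

so three 16-byte slots are reserved and the unread VAR1 slot is simply left
as a hole, matching the ABI comment in the compiler.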

@@ -190,6 +190,16 @@ struct panfrost_compile_inputs {
         uint8_t raw_fmt_mask;
         unsigned nr_cbufs;
 
+        /* Used on Valhall.
+         *
+         * Bit mask of special desktop-only varyings (e.g. VARYING_SLOT_TEX0)
+         * written by the previous stage (for a fragment shader) or by this
+         * stage (for a vertex shader). Bits are slots from gl_varying_slot.
+         *
+         * For modern APIs (GLES or VK), this should be 0.
+         */
+        uint32_t fixed_varying_mask;
+
         union {
                 struct {
                         bool static_rt_conv;