pan/bi: Rework varying linking on Valhall
Valhall introduces hardware-allocated varyings. Instead of allocating varying descriptors on the CPU with a slot-based interface, the driver just tells the hardware how many bytes to allocate per vertex and loads/stores with byte offsets. This is much nicer! However, this requires us to rework our linking code to account for separable shaders. With separable shaders, we can't rely on driver_location matching between stages, and unlike on Midgard, we can't resolve the differences with curated command stream descriptors. However, we *can* rely on slots matching. So we should "just" determine the byte offsets based on the slot, and then separable shaders work. For GLES, it really is that easy. For desktop GL, it's not -- desktop GL brings unpredictable extra varyings like COL1 and TEX2. Allocating space for all of these unconditionally would hamper performance. To cope, we key fragment shaders to the set of non-GLES varyings written by the linked vertex shader. Then we may define an efficient ABI, where apps only pay for what they use. Fixes various tests in dEQP-GLES31.functional.separate_shader.random.* on Valhall. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16310>
This commit is contained in:
parent
635d8d6bd7
commit
0fcddd4d2c
|
@ -69,6 +69,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
|
||||||
struct panfrost_compile_inputs inputs = {
|
struct panfrost_compile_inputs inputs = {
|
||||||
.gpu_id = dev->gpu_id,
|
.gpu_id = dev->gpu_id,
|
||||||
.shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
|
.shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
|
||||||
|
.fixed_varying_mask = state->key.fixed_varying_mask
|
||||||
};
|
};
|
||||||
|
|
||||||
memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));
|
memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));
|
||||||
|
|
|
@ -3360,9 +3360,15 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
|
||||||
|
|
||||||
pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
|
pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
|
||||||
if (secondary_shader) {
|
if (secondary_shader) {
|
||||||
|
unsigned v = vs->info.varyings.output_count;
|
||||||
|
unsigned f = fs->info.varyings.input_count;
|
||||||
|
unsigned slots = MAX2(v, f);
|
||||||
|
slots += util_bitcount(fs->key.fixed_varying_mask);
|
||||||
|
unsigned size = slots * 16;
|
||||||
|
|
||||||
/* Assumes 16 byte slots. We could do better. */
|
/* Assumes 16 byte slots. We could do better. */
|
||||||
cfg.vertex_packet_stride = vs->info.varyings.output_count * 16;
|
cfg.vertex_packet_stride = size + 16;
|
||||||
cfg.vertex_attribute_stride = fs->info.varyings.input_count * 16;
|
cfg.vertex_attribute_stride = size;
|
||||||
} else {
|
} else {
|
||||||
/* Hardware requirement for "no varyings" */
|
/* Hardware requirement for "no varyings" */
|
||||||
cfg.vertex_packet_stride = 16;
|
cfg.vertex_packet_stride = 16;
|
||||||
|
|
|
@ -307,6 +307,13 @@ panfrost_create_shader_state(
|
||||||
else
|
else
|
||||||
so->nir = cso->ir.nir;
|
so->nir = cso->ir.nir;
|
||||||
|
|
||||||
|
/* Fix linkage early */
|
||||||
|
if (so->nir->info.stage == MESA_SHADER_VERTEX) {
|
||||||
|
so->fixed_varying_mask =
|
||||||
|
(so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
|
||||||
|
~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
|
||||||
|
}
|
||||||
|
|
||||||
/* Precompile for shader-db if we need to */
|
/* Precompile for shader-db if we need to */
|
||||||
if (unlikely(dev->debug & PAN_DBG_PRECOMPILE)) {
|
if (unlikely(dev->debug & PAN_DBG_PRECOMPILE)) {
|
||||||
struct panfrost_context *ctx = pan_context(pctx);
|
struct panfrost_context *ctx = pan_context(pctx);
|
||||||
|
@ -372,6 +379,7 @@ panfrost_build_key(struct panfrost_context *ctx,
|
||||||
struct panfrost_device *dev = pan_device(ctx->base.screen);
|
struct panfrost_device *dev = pan_device(ctx->base.screen);
|
||||||
struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
|
struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
|
||||||
struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer;
|
struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer;
|
||||||
|
struct panfrost_shader_variants *vs = ctx->shader[MESA_SHADER_VERTEX];
|
||||||
|
|
||||||
key->fs.nr_cbufs = fb->nr_cbufs;
|
key->fs.nr_cbufs = fb->nr_cbufs;
|
||||||
|
|
||||||
|
@ -398,6 +406,12 @@ panfrost_build_key(struct panfrost_context *ctx,
|
||||||
key->fs.rt_formats[i] = fmt;
|
key->fs.rt_formats[i] = fmt;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Funny desktop GL varying lowering on Valhall */
|
||||||
|
if (dev->arch >= 9) {
|
||||||
|
assert(vs != NULL && "too early");
|
||||||
|
key->fixed_varying_mask = vs->fixed_varying_mask;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -508,13 +522,20 @@ panfrost_update_shader_variant(struct panfrost_context *ctx,
|
||||||
if (type == PIPE_SHADER_COMPUTE)
|
if (type == PIPE_SHADER_COMPUTE)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
/* We need linking information, defer this */
|
||||||
|
if (type == PIPE_SHADER_FRAGMENT && !ctx->shader[PIPE_SHADER_VERTEX])
|
||||||
|
return;
|
||||||
|
|
||||||
/* Match the appropriate variant */
|
/* Match the appropriate variant */
|
||||||
signed variant = -1;
|
signed variant = -1;
|
||||||
struct panfrost_shader_variants *variants = ctx->shader[type];
|
struct panfrost_shader_variants *variants = ctx->shader[type];
|
||||||
|
|
||||||
simple_mtx_lock(&variants->lock);
|
simple_mtx_lock(&variants->lock);
|
||||||
|
|
||||||
struct panfrost_shader_key key = { 0 };
|
struct panfrost_shader_key key = {
|
||||||
|
.fixed_varying_mask = variants->fixed_varying_mask
|
||||||
|
};
|
||||||
|
|
||||||
panfrost_build_key(ctx, &key, variants->nir);
|
panfrost_build_key(ctx, &key, variants->nir);
|
||||||
|
|
||||||
for (unsigned i = 0; i < variants->variant_count; ++i) {
|
for (unsigned i = 0; i < variants->variant_count; ++i) {
|
||||||
|
@ -539,6 +560,10 @@ static void
|
||||||
panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
|
panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
|
||||||
{
|
{
|
||||||
panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);
|
panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);
|
||||||
|
|
||||||
|
/* Fragment shaders are linked with vertex shaders */
|
||||||
|
struct panfrost_context *ctx = pan_context(pctx);
|
||||||
|
panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
|
@ -272,6 +272,9 @@ struct panfrost_fs_key {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct panfrost_shader_key {
|
struct panfrost_shader_key {
|
||||||
|
/* Valhall needs special handling for desktop GL varyings */
|
||||||
|
uint32_t fixed_varying_mask;
|
||||||
|
|
||||||
/* If we need vertex shader keys, union it in */
|
/* If we need vertex shader keys, union it in */
|
||||||
struct panfrost_fs_key fs;
|
struct panfrost_fs_key fs;
|
||||||
};
|
};
|
||||||
|
@ -315,6 +318,12 @@ struct panfrost_shader_variants {
|
||||||
|
|
||||||
unsigned variant_count;
|
unsigned variant_count;
|
||||||
|
|
||||||
|
/* On vertex shaders, bit mask of special desktop-only varyings to link
|
||||||
|
* with the fragment shader. Used on Valhall to implement separable
|
||||||
|
* shaders for desktop GL.
|
||||||
|
*/
|
||||||
|
uint32_t fixed_varying_mask;
|
||||||
|
|
||||||
/* The current active variant */
|
/* The current active variant */
|
||||||
unsigned active_variant;
|
unsigned active_variant;
|
||||||
};
|
};
|
||||||
|
|
|
@ -284,6 +284,41 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
|
||||||
bi_copy_component(b, instr, dest);
|
bi_copy_component(b, instr, dest);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ABI: Special (desktop GL) slots come first, tightly packed. General varyings
|
||||||
|
* come later, sparsely packed. This handles both linked and separable shaders
|
||||||
|
* with a common code path, with minimal keying only for desktop GL. Each slot
|
||||||
|
* consumes 16 bytes (TODO: fp16, partial vectors).
|
||||||
|
*/
|
||||||
|
static unsigned
|
||||||
|
bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
|
||||||
|
{
|
||||||
|
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
|
||||||
|
uint32_t mask = ctx->inputs->fixed_varying_mask;
|
||||||
|
|
||||||
|
if (sem.location >= VARYING_SLOT_VAR0) {
|
||||||
|
unsigned nr_special = util_bitcount(mask);
|
||||||
|
unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
|
||||||
|
|
||||||
|
return 16 * (nr_special + general_index);
|
||||||
|
} else {
|
||||||
|
return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Compute the offset in bytes of a varying with an immediate offset, adding the
|
||||||
|
* offset to the base computed above. Convenience method.
|
||||||
|
*/
|
||||||
|
static unsigned
|
||||||
|
bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
|
||||||
|
{
|
||||||
|
nir_src *src = nir_get_io_offset_src(intr);
|
||||||
|
assert(nir_src_is_const(*src) && "assumes immediate offset");
|
||||||
|
|
||||||
|
return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
||||||
{
|
{
|
||||||
|
@ -328,7 +363,8 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
||||||
if (b->shader->malloc_idvs && immediate) {
|
if (b->shader->malloc_idvs && immediate) {
|
||||||
/* Immediate index given in bytes. */
|
/* Immediate index given in bytes. */
|
||||||
bi_ld_var_buf_imm_f32_to(b, dest, src0, regfmt, sample, update,
|
bi_ld_var_buf_imm_f32_to(b, dest, src0, regfmt, sample, update,
|
||||||
vecsize, imm_index * 16);
|
vecsize,
|
||||||
|
bi_varying_offset(b->shader, instr));
|
||||||
} else if (immediate && smooth) {
|
} else if (immediate && smooth) {
|
||||||
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
|
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
|
||||||
vecsize, imm_index);
|
vecsize, imm_index);
|
||||||
|
@ -339,24 +375,31 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
||||||
bi_index idx = bi_src_index(offset);
|
bi_index idx = bi_src_index(offset);
|
||||||
unsigned base = nir_intrinsic_base(instr);
|
unsigned base = nir_intrinsic_base(instr);
|
||||||
|
|
||||||
if (base != 0)
|
|
||||||
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
|
|
||||||
|
|
||||||
if (b->shader->malloc_idvs) {
|
if (b->shader->malloc_idvs) {
|
||||||
/* Index needs to be in bytes, but NIR gives the index
|
/* Index needs to be in bytes, but NIR gives the index
|
||||||
* in slots. For now assume 16 bytes per slots.
|
* in slots. For now assume 16 bytes per element.
|
||||||
*
|
|
||||||
* TODO: more complex linking?
|
|
||||||
*/
|
*/
|
||||||
idx = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
|
bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
|
||||||
bi_ld_var_buf_f32_to(b, dest, src0, idx, regfmt, sample,
|
unsigned vbase = bi_varying_base_bytes(b->shader, instr);
|
||||||
update, vecsize);
|
|
||||||
|
if (vbase != 0)
|
||||||
|
idx_bytes = bi_iadd_u32(b, idx, bi_imm_u32(vbase), false);
|
||||||
|
|
||||||
|
bi_ld_var_buf_f32_to(b, dest, src0, idx_bytes, regfmt,
|
||||||
|
sample, update, vecsize);
|
||||||
} else if (smooth) {
|
} else if (smooth) {
|
||||||
|
if (base != 0)
|
||||||
|
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
|
||||||
|
|
||||||
I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
|
I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
|
||||||
update, vecsize);
|
update, vecsize);
|
||||||
} else {
|
} else {
|
||||||
I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
|
if (base != 0)
|
||||||
regfmt, vecsize);
|
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
|
||||||
|
|
||||||
|
I = bi_ld_var_flat_to(b, dest, idx,
|
||||||
|
BI_FUNCTION_NONE, regfmt,
|
||||||
|
vecsize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -794,39 +837,6 @@ bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Computes the offset in bytes of a varying. This assumes VARYING_SLOT_POS is
|
|
||||||
* mapped to location=0 and always present. This also assumes each slot
|
|
||||||
* consumes 16 bytes, which is a worst-case (highp vec4). In the future, this
|
|
||||||
* should be optimized to support fp16 and partial vectors. There are
|
|
||||||
* nontrivial interactions with separable shaders, however.
|
|
||||||
*/
|
|
||||||
static unsigned
|
|
||||||
bi_varying_offset(nir_shader *nir, nir_intrinsic_instr *intr)
|
|
||||||
{
|
|
||||||
nir_src *offset = nir_get_io_offset_src(intr);
|
|
||||||
assert(nir_src_is_const(*offset) && "no indirect varyings on Valhall");
|
|
||||||
|
|
||||||
unsigned loc = 0;
|
|
||||||
unsigned slot = nir_intrinsic_base(intr) + nir_src_as_uint(*offset);
|
|
||||||
|
|
||||||
nir_foreach_shader_out_variable(var, nir) {
|
|
||||||
if ((var->data.location == VARYING_SLOT_POS) ||
|
|
||||||
(var->data.location == VARYING_SLOT_PSIZ))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (var->data.driver_location > slot)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (var->data.driver_location == slot)
|
|
||||||
return loc;
|
|
||||||
|
|
||||||
loc += 16; // todo size
|
|
||||||
}
|
|
||||||
|
|
||||||
unreachable("Unlinked variable");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
||||||
{
|
{
|
||||||
|
@ -880,7 +890,7 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
||||||
bi_src_index(&instr->src[0]),
|
bi_src_index(&instr->src[0]),
|
||||||
address, bi_word(address, 1),
|
address, bi_word(address, 1),
|
||||||
varying ? BI_SEG_VARY : BI_SEG_POS,
|
varying ? BI_SEG_VARY : BI_SEG_POS,
|
||||||
varying ? bi_varying_offset(b->shader->nir, instr) : 0);
|
varying ? bi_varying_offset(b->shader, instr) : 0);
|
||||||
} else if (immediate) {
|
} else if (immediate) {
|
||||||
bi_index address = bi_lea_attr_imm(b,
|
bi_index address = bi_lea_attr_imm(b,
|
||||||
bi_vertex_id(b), bi_instance_id(b),
|
bi_vertex_id(b), bi_instance_id(b),
|
||||||
|
|
|
@ -42,6 +42,7 @@ GENX(pan_shader_get_compiler_options)(void)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if PAN_ARCH <= 7
|
||||||
static enum pipe_format
|
static enum pipe_format
|
||||||
varying_format(nir_alu_type t, unsigned ncomps)
|
varying_format(nir_alu_type t, unsigned ncomps)
|
||||||
{
|
{
|
||||||
|
@ -157,6 +158,7 @@ collect_varyings(nir_shader *s, nir_variable_mode varying_mode,
|
||||||
*varying_count = MAX2(*varying_count, loc + sz);
|
*varying_count = MAX2(*varying_count, loc + sz);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if PAN_ARCH >= 6
|
#if PAN_ARCH >= 6
|
||||||
static enum mali_register_file_format
|
static enum mali_register_file_format
|
||||||
|
@ -230,8 +232,14 @@ GENX(pan_shader_compile)(nir_shader *s,
|
||||||
|
|
||||||
info->vs.writes_point_size =
|
info->vs.writes_point_size =
|
||||||
s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
|
s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
|
||||||
|
|
||||||
|
#if PAN_ARCH >= 9
|
||||||
|
info->varyings.output_count =
|
||||||
|
util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0);
|
||||||
|
#else
|
||||||
collect_varyings(s, nir_var_shader_out, info->varyings.output,
|
collect_varyings(s, nir_var_shader_out, info->varyings.output,
|
||||||
&info->varyings.output_count);
|
&info->varyings.output_count);
|
||||||
|
#endif
|
||||||
break;
|
break;
|
||||||
case MESA_SHADER_FRAGMENT:
|
case MESA_SHADER_FRAGMENT:
|
||||||
if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
|
if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
|
||||||
|
@ -286,8 +294,13 @@ GENX(pan_shader_compile)(nir_shader *s,
|
||||||
info->fs.reads_face =
|
info->fs.reads_face =
|
||||||
(s->info.inputs_read & (1 << VARYING_SLOT_FACE)) ||
|
(s->info.inputs_read & (1 << VARYING_SLOT_FACE)) ||
|
||||||
BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
|
BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
|
||||||
|
#if PAN_ARCH >= 9
|
||||||
|
info->varyings.output_count =
|
||||||
|
util_last_bit(s->info.outputs_read >> VARYING_SLOT_VAR0);
|
||||||
|
#else
|
||||||
collect_varyings(s, nir_var_shader_in, info->varyings.input,
|
collect_varyings(s, nir_var_shader_in, info->varyings.input,
|
||||||
&info->varyings.input_count);
|
&info->varyings.input_count);
|
||||||
|
#endif
|
||||||
break;
|
break;
|
||||||
case MESA_SHADER_COMPUTE:
|
case MESA_SHADER_COMPUTE:
|
||||||
info->wls_size = s->info.shared_size;
|
info->wls_size = s->info.shared_size;
|
||||||
|
|
|
@ -190,6 +190,16 @@ struct panfrost_compile_inputs {
|
||||||
uint8_t raw_fmt_mask;
|
uint8_t raw_fmt_mask;
|
||||||
unsigned nr_cbufs;
|
unsigned nr_cbufs;
|
||||||
|
|
||||||
|
/* Used on Valhall.
|
||||||
|
*
|
||||||
|
* Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0)
|
||||||
|
* written by the previous stage (fragment shader) or written by this
|
||||||
|
* stage (vertex shader). Bits are slots from gl_varying_slot.
|
||||||
|
*
|
||||||
|
* For modern APIs (GLES or VK), this should be 0.
|
||||||
|
*/
|
||||||
|
uint32_t fixed_varying_mask;
|
||||||
|
|
||||||
union {
|
union {
|
||||||
struct {
|
struct {
|
||||||
bool static_rt_conv;
|
bool static_rt_conv;
|
||||||
|
|
Loading…
Reference in New Issue