panfrost: Launch transform feedback shaders
We now have infrastructure in place to generate variants of vertex shaders
specialized for transform feedback. All that's left is launching these
compute-like kernels before the IDVS job, implementing both the transform
feedback path and the regular rasterization pipeline.

This implements transform feedback on Valhall, passing the relevant GLES3.1
tests.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15720>
parent a510a94b02
commit 3b3cd59fb8
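For orientation, the draw-time flow this commit sets up looks roughly like the
sketch below. None of this is Mesa code; every type and helper is a
hypothetical stand-in invented for illustration.

/* Editorial sketch of the flow this commit implements; all names invented. */
#include <stdbool.h>
#include <stdio.h>

struct fake_draw {
   bool has_xfb_variant;    /* vertex shader has an XFB-specialized variant */
   bool rasterizer_discard; /* rasterization will be skipped */
};

static void launch_xfb_compute_job(void)
{
   /* Stand-in for panfrost_launch_xfb(): a compute-style job running the
    * XFB variant of the vertex shader to write captured varyings. */
   puts("XFB compute job");
}

static void launch_rasterization_jobs(void)
{
   /* Stand-in for the IDVS job (Valhall) or vertex+tiler jobs (older). */
   puts("IDVS / vertex+tiler jobs");
}

static void draw(const struct fake_draw *d)
{
   if (d->has_xfb_variant)
      launch_xfb_compute_job(); /* always before the rasterization jobs */

   if (d->rasterizer_discard)
      return; /* side effects were already handled by the XFB job */

   launch_rasterization_jobs();
}

int main(void)
{
   draw(&(struct fake_draw) { .has_xfb_variant = true });
   return 0;
}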
@@ -86,6 +86,10 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
       .fixed_varying_mask = state->key.fixed_varying_mask
    };

+   /* No IDVS for internal XFB shaders */
+   if (s->info.stage == MESA_SHADER_VERTEX && s->info.has_transform_feedback_varyings)
+      inputs.no_idvs = true;
+
    memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));

    struct util_dynarray binary;
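The flag added above keys the compiler off IDVS for the internal XFB variant:
the variant runs as a compute-like job, so splitting it into IDVS position and
varying halves would be useless work. A minimal sketch of the idea, with
invented types (only the no_idvs field mirrors the diff; this is not the real
panfrost_compile_inputs):

/* Sketch only: hypothetical compile-input plumbing, not Mesa structures. */
#include <stdbool.h>

struct fake_compile_inputs {
   bool no_idvs; /* force the monolithic, non-IDVS vertex shader path */
};

struct fake_shader_info {
   bool is_vertex;
   bool has_transform_feedback_varyings;
};

static void fill_inputs(struct fake_compile_inputs *inputs,
                        const struct fake_shader_info *info)
{
   /* Internal XFB variants never rasterize, so never use IDVS for them. */
   if (info->is_vertex && info->has_transform_feedback_varyings)
      inputs->no_idvs = true;
}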
@@ -2215,6 +2215,7 @@ panfrost_emit_varyings(struct panfrost_batch *batch,
    return ptr;
 }

+#if PAN_ARCH <= 5
 static void
 panfrost_emit_streamout(struct panfrost_batch *batch,
                         struct mali_attribute_buffer_packed *slot,
@@ -2256,6 +2257,7 @@ pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)

    unreachable("Varying not captured");
 }
+#endif

 /* Given a varying, figure out which index it corresponds to */
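pan_get_so() resolves a varying slot to its stream-output record, and hits
unreachable() if a caller asks for a slot that was never captured; the diff
guards calls with a captured-locations bitmask for exactly that reason. A
self-contained sketch of that lookup shape, with invented types (the real
gallium pipe_stream_output_info differs):

/* Sketch of a varying-slot -> stream-output lookup; types are invented. */
#include <assert.h>
#include <stddef.h>

struct fake_so_output { unsigned slot, buffer, offset; };

struct fake_so_info {
   unsigned num_outputs;
   struct fake_so_output output[16];
};

static const struct fake_so_output *
get_so(const struct fake_so_info *info, unsigned slot)
{
   for (unsigned i = 0; i < info->num_outputs; ++i) {
      if (info->output[i].slot == slot)
         return &info->output[i];
   }

   /* Mirrors unreachable("Varying not captured"): callers must check the
    * captured-location mask first, as the diff does with xfb_loc_mask. */
   assert(!"Varying not captured");
   return NULL;
}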
@@ -2445,8 +2447,12 @@ panfrost_emit_varying(const struct panfrost_device *dev,
    gl_varying_slot loc = varying.location;
    mali_pixel_format format = dev->formats[pipe_format].hw;

+#if PAN_ARCH <= 5
    struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ?
                                   pan_get_so(xfb, loc) : NULL;
+#else
+   struct pipe_stream_output *o = NULL;
+#endif

    if (util_varying_is_point_coord(loc, point_sprite_mask)) {
       pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
@@ -2604,7 +2610,7 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
    /* In good conditions, we only need to link varyings once */
    bool prelink =
       (point_coord_mask == 0) &&
-      (ctx->streamout.num_targets == 0) &&
+      (PAN_ARCH >= 6 || ctx->streamout.num_targets == 0) &&
       !vs->info.separable &&
       !fs->info.separable;
@@ -2620,7 +2626,6 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
       panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage);
    }

-   struct pipe_stream_output_info *so = &vs->stream_output;
    unsigned present = linkage->present, stride = linkage->stride;
    unsigned xfb_base = pan_xfb_base(present);
    struct panfrost_ptr T =
@@ -2637,11 +2642,12 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
#if PAN_ARCH >= 6
    /* Suppress prefetch on Bifrost */
    memset(varyings + (xfb_base * ctx->streamout.num_targets), 0, sizeof(*varyings));
#endif

#else
    /* Emit the stream out buffers. We need enough room for all the
     * vertices we emit across all instances */

+   struct pipe_stream_output_info *so = &vs->stream_output;

    unsigned out_count = ctx->instance_count *
       u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
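The out_count computation sizes the capture buffers for every vertex the draw
emits across all instances; u_stream_outputs_for_vertices() trims incomplete
primitives and expands strips/fans into independent primitives. A hedged
worked example for one case, re-deriving the helper's result by hand (check
u_prim.h for the authoritative definition):

/* Worked example, assuming u_stream_outputs_for_vertices() behaves as
 * trim + decompose; verify against u_prim.h. */
#include <stdio.h>

int main(void)
{
   /* Hypothetical draw: triangle strip, 5 vertices, 2 instances. */
   unsigned vertex_count = 5, instance_count = 2;

   /* A 5-vertex strip decomposes into 3 independent triangles, and each
    * captured triangle contributes 3 vertices to the stream. */
   unsigned prims = vertex_count - 2; /* 3 */
   unsigned so_vertices = prims * 3;  /* 9 */

   /* Matches: out_count = ctx->instance_count *
    *    u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count); */
   unsigned out_count = instance_count * so_vertices;

   printf("out_count = %u\n", out_count); /* 18 */
   return 0;
}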
@@ -2651,6 +2657,7 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
                           out_count,
                           ctx->streamout.targets[i]);
    }
+#endif

    if (stride) {
       panfrost_emit_varyings(batch,
@@ -2683,6 +2690,11 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
    *fs_attribs = linkage->consumer;
 }

+/*
+ * Emit jobs required for the rasterization pipeline. If there are side effects
+ * from the vertex shader, these are handled ahead-of-time with a compute
+ * shader. This function should not be called if rasterization is skipped.
+ */
 static void
 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
                                 const struct panfrost_ptr *vertex_job,
@@ -2690,20 +2702,16 @@ panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
 {
    struct panfrost_context *ctx = batch->ctx;

-   /* If rasterizer discard is enable, only submit the vertex. XXX - set
-    * job_barrier in case buffers get ping-ponged and we need to enforce
-    * ordering, this has a perf hit! See
-    * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */
-
+   /* XXX - set job_barrier in case buffers get ping-ponged and we need to
+    * enforce ordering, this has a perf hit! See
+    * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations
+    */
    unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard,
                                       MALI_JOB_TYPE_VERTEX, true, false,
                                       ctx->indirect_draw ?
                                       batch->indirect_draw_job_id : 0,
                                       0, vertex_job, false);

    if (panfrost_batch_skip_rasterization(batch))
       return;

    panfrost_add_job(&batch->pool.base, &batch->scoreboard,
                     MALI_JOB_TYPE_TILER, false, false,
                     vertex, 0, tiler_job, false);
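In this non-IDVS path, the tiler job names the vertex job's returned index as
its dependency, so the scoreboard serializes them. A generic sketch of that
chaining, with invented scoreboard types (the real panfrost_add_job takes many
more parameters):

/* Sketch of scoreboarded job chaining; all types here are invented. */
#include <stdio.h>

struct fake_scoreboard { unsigned job_count; };

static unsigned add_job(struct fake_scoreboard *sb, const char *type,
                        unsigned dependency)
{
   unsigned index = ++sb->job_count; /* 1-based; 0 means "no dependency" */
   printf("job %u (%s) depends on job %u\n", index, type, dependency);
   return index;
}

int main(void)
{
   struct fake_scoreboard sb = { 0 };

   unsigned vertex = add_job(&sb, "VERTEX", 0);
   /* Under rasterizer discard the driver returns before this point,
    * submitting only the vertex job; otherwise chain the tiler after it. */
   add_job(&sb, "TILER", vertex);
   return 0;
}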
@@ -3533,6 +3541,89 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch,
 }
 #endif

+static void
+panfrost_launch_xfb(struct panfrost_batch *batch,
+                    const struct pipe_draw_info *info,
+                    mali_ptr attribs, mali_ptr attrib_bufs,
+                    unsigned count)
+{
+   struct panfrost_context *ctx = batch->ctx;
+
+   struct panfrost_ptr t =
+      pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
+
+   /* Nothing to do */
+   if (batch->ctx->streamout.num_targets == 0)
+      return;
+
+   /* TODO: XFB with index buffers */
+   //assert(info->index_size == 0);
+   u_trim_pipe_prim(info->mode, &count);
+
+   if (count == 0)
+      return;
+
+   struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
+   struct panfrost_shader_variants v = { .variants = vs->xfb };
+
+   vs->xfb->stream_output = vs->stream_output;
+
+   struct panfrost_shader_variants *saved_vs = ctx->shader[PIPE_SHADER_VERTEX];
+   mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX];
+   mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX];
+   mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX];
+
+   ctx->shader[PIPE_SHADER_VERTEX] = &v;
+   batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX);
+
+#if PAN_ARCH >= 9
+   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
+      cfg.workgroup_size_x = 1;
+      cfg.workgroup_size_y = 1;
+      cfg.workgroup_size_z = 1;
+
+      cfg.workgroup_count_x = count;
+      cfg.workgroup_count_y = info->instance_count;
+      cfg.workgroup_count_z = 1;
+
+      panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_VERTEX,
+                           batch->rsd[PIPE_SHADER_VERTEX],
+                           batch->tls.gpu);
+
+      /* TODO: Indexing. Also, this is a legacy feature... */
+      cfg.compute.attribute_offset = batch->ctx->offset_start;
+
+      /* Transform feedback shaders do not use barriers or shared
+       * memory, so we may merge workgroups.
+       */
+      cfg.allow_merging_workgroups = true;
+      cfg.task_increment = 1;
+      cfg.task_axis = MALI_TASK_AXIS_Z;
+   }
+#else
+   struct mali_invocation_packed invocation;
+
+   panfrost_pack_work_groups_compute(&invocation,
+                                     1, count, info->instance_count,
+                                     1, 1, 1, false, false);
+
+   batch->uniform_buffers[PIPE_SHADER_VERTEX] =
+      panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
+                              &batch->push_uniforms[PIPE_SHADER_VERTEX], NULL);
+
+   panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0,
+                             attribs, attrib_bufs, t.cpu);
+#endif
+   panfrost_add_job(&batch->pool.base, &batch->scoreboard,
+                    MALI_JOB_TYPE_COMPUTE, true, false,
+                    0, 0, &t, false);
+
+   ctx->shader[PIPE_SHADER_VERTEX] = saved_vs;
+   batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
+   batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo;
+   batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push;
+}
+
 static void
 panfrost_direct_draw(struct panfrost_batch *batch,
                      const struct pipe_draw_info *info,
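panfrost_launch_xfb() temporarily swaps the XFB variant in as the bound vertex
shader, emits the compute job while that state is current, then restores the
saved pointers so the subsequent rasterization path sees the original shader.
The shape of that pattern, sketched generically with invented types (the real
code also saves and restores the RSD, UBO, and push-uniform pointers):

/* Sketch of the save/swap/emit/restore pattern; types are invented. */
struct fake_ctx { const void *vertex_shader; };

static void emit_compute_job(struct fake_ctx *ctx) { (void) ctx; }

static void launch_with_variant(struct fake_ctx *ctx, const void *xfb_variant)
{
   const void *saved = ctx->vertex_shader; /* save bound state */

   ctx->vertex_shader = xfb_variant;       /* swap in the XFB variant */
   emit_compute_job(ctx);                  /* descriptors capture it here */

   ctx->vertex_shader = saved;             /* restore for the real draw */
}

The restore is safe because the job descriptors are fully emitted by the time
the swap is undone; nothing later re-reads the temporarily bound variant.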
@@ -3657,6 +3748,24 @@ panfrost_direct_draw(struct panfrost_batch *batch,
    panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
    panfrost_clean_state_3d(ctx);

+#if PAN_ARCH >= 6
+   if (vs->xfb) {
+#if PAN_ARCH >= 9
+      mali_ptr attribs = 0, attrib_bufs = 0;
+#endif
+      panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count);
+   }
+#endif
+
+   /* Increment transform feedback offsets */
+   panfrost_update_streamout_offsets(ctx);
+
+   /* Any side effects must be handled by the XFB shader, so we only need
+    * to run vertex shaders if we need rasterization.
+    */
+   if (panfrost_batch_skip_rasterization(batch))
+      return;
+
#if PAN_ARCH >= 9
    assert(idvs && "Memory allocated IDVS required on Valhall");
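Note the ordering in the added block: the XFB job is launched, the streamout
offsets are advanced so the next draw appends after this one, and only then
may the function bail out when rasterization is skipped. A sketch of why the
offsets must advance on every draw, with invented types (the real bookkeeping
in panfrost_update_streamout_offsets differs in detail):

/* Sketch: streamout offsets accumulate across draws so successive draws
 * append into the capture buffer. Types and fields are invented. */
#include <stdio.h>

struct fake_streamout { unsigned offset_vertices; };

static void draw_with_xfb(struct fake_streamout *so, unsigned emitted)
{
   printf("capture %u vertices at vertex offset %u\n",
          emitted, so->offset_vertices);
   so->offset_vertices += emitted; /* next draw appends after this one */
}

int main(void)
{
   struct fake_streamout so = { 0 };
   draw_with_xfb(&so, 9); /* first draw writes vertices [0, 9)  */
   draw_with_xfb(&so, 6); /* second draw writes vertices [9, 15) */
   return 0;
}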
@@ -3667,10 +3776,10 @@ panfrost_direct_draw(struct panfrost_batch *batch,
                     MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0,
                     0, &tiler, false);
#else
    /* Fire off the draw itself */
    panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,
                             fs_vary, varyings, pos, psiz, secondary_shader,
                             tiler.cpu);

    if (idvs) {
#if PAN_ARCH >= 6
       panfrost_draw_emit_vertex_section(batch,
@@ -3688,9 +3797,6 @@ panfrost_direct_draw(struct panfrost_batch *batch,
       panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
    }
 #endif
-
-   /* Increment transform feedback offsets */
-   panfrost_update_streamout_offsets(ctx);
 }

 #if PAN_GPU_INDIRECTS
@@ -3912,22 +4018,6 @@ panfrost_draw_vbo(struct pipe_context *pipe,
    if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
       batch->viewport = panfrost_emit_viewport(batch);

-   /* If rasterization discard is enabled but the vertex shader does not
-    * have side effects (including transform feedback), skip the draw
-    * altogether. This is always an optimization. Additionally, this is
-    * required for Index-Driven Vertex Shading, since IDVS always
-    * rasterizes. The compiler will not use IDVS if the vertex shader has
-    * side effects. So the only problem case is rasterizer discard with a
-    * shader without side effects -- but these draws are useless.
-    */
-   if (panfrost_batch_skip_rasterization(batch)) {
-      struct panfrost_shader_state *vs =
-         panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
-
-      if (!vs->info.writes_global)
-         return;
-   }
-
    /* Mark everything dirty when debugging */
    if (unlikely(dev->debug & PAN_DBG_DIRTY))
       panfrost_dirty_state_all(ctx);
@@ -901,6 +901,9 @@ panfrost_batch_union_scissor(struct panfrost_batch *batch,
 /**
  * Checks if rasterization should be skipped. If not, a TILER job must be
  * created for each draw, or the IDVS flow must be used.
+ *
+ * As a special case, if there is no vertex shader, no primitives are generated,
+ * meaning the whole pipeline (including rasterization) should be skipped.
  */
 bool
 panfrost_batch_skip_rasterization(struct panfrost_batch *batch)
@@ -909,5 +912,6 @@ panfrost_batch_skip_rasterization(struct panfrost_batch *batch)
    struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer;

    return (rast->rasterizer_discard ||
-           batch->scissor_culls_everything);
+           batch->scissor_culls_everything ||
+           !batch->rsd[PIPE_SHADER_VERTEX]);
 }
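With the new clause, the predicate reads: skip rasterization when discard is
on, when the scissor culls everything, or when no vertex shader RSD was
emitted (no vertex shader, hence no primitives, per the new docstring). A
minimal restatement with invented fields, just to make the three conditions
explicit:

/* Restatement of the updated predicate; fields are invented stand-ins. */
#include <stdbool.h>

struct fake_batch {
   bool rasterizer_discard;
   bool scissor_culls_everything;
   unsigned long long vertex_rsd; /* 0 = no vertex shader bound */
};

static bool skip_rasterization(const struct fake_batch *b)
{
   return b->rasterizer_discard ||
          b->scissor_culls_everything ||
          !b->vertex_rsd;
}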
@@ -5133,6 +5133,14 @@ bi_compile_variant(nir_shader *nir,

    unsigned offset = binary->size;

+   /* If there is no position shader (gl_Position is not written), then
+    * there is no need to build a varying shader either. This case is hit
+    * for transform feedback only vertex shaders which only make sense with
+    * rasterizer discard.
+    */
+   if ((offset == 0) && (idvs == BI_IDVS_VARYING))
+      return;
+
    /* Software invariant: Only a secondary shader can appear at a nonzero
     * offset, to keep the ABI simple. */
    assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
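The assert encodes the ABI: exactly one of the two conditions may hold, so a
shader at offset 0 is the primary (position) shader and the varying shader
always sits at a nonzero offset. The new early return keeps XFB-only shaders,
which never emit a position shader, from tripping it. The same invariant
expressed as a standalone check (hypothetical signature, for illustration):

/* The ABI invariant from the diff as a standalone check: a binary blob
 * holds the position shader at offset 0 and the varying shader after it. */
#include <assert.h>
#include <stdbool.h>

static void check_variant(unsigned offset, bool is_varying_shader)
{
   /* XFB-only shaders: no position shader was emitted, so skip building
    * the varying shader too instead of violating the invariant. */
   if (offset == 0 && is_varying_shader)
      return;

   /* Exactly one holds: offset 0 (primary) XOR varying (secondary). */
   assert((offset == 0) ^ is_varying_shader);
}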
@@ -5213,22 +5221,6 @@ bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
    if (nir->info.stage != MESA_SHADER_VERTEX)
       return false;

-   /* Transform feedback requires running all varying shaders regardless
-    * of clipping, but IDVS does clipping before running varying shaders.
-    * So shaders destined for transform feedback must not use IDVS.
-    *
-    * The issue with general memory stores is more subtle: these shaders
-    * have side effects and only make sense if vertex shaders run exactly
-    * once per vertex. IDVS requires the hardware to rerun position or
-    * varying shaders in certain circumstances. So if there is any memory
-    * write, disable IDVS.
-    *
-    * NIR considers transform feedback to be a memory write, so we only
-    * need to check writes_memory to handle both cases.
-    */
-   if (nir->info.writes_memory)
-      return false;
-
    /* Bifrost cannot write gl_PointSize during IDVS */
    if ((inputs->gpu_id < 0x9000) &&
        nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
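The deleted block disabled IDVS whenever the shader wrote memory, which
conservatively covered XFB as well since NIR models transform feedback as a
memory write. With XFB now handled by a dedicated non-IDVS variant (the
no_idvs input set earlier in this commit), that blanket check apparently
becomes unnecessary. The reasoning the removed comment carried, condensed
into a predicate sketch (invented signature, pre-commit behavior):

/* Sketch of the pre-commit eligibility logic described by the removed
 * comment; the signature and flags are invented for illustration. */
#include <stdbool.h>

static bool should_use_idvs_before(bool is_vertex, bool writes_memory,
                                   bool writes_psiz, bool is_bifrost)
{
   if (!is_vertex)
      return false;

   /* IDVS clips before running varying shaders and may rerun them, so
    * shaders with memory writes (including XFB, which NIR counts as a
    * memory write) could not use it. */
   if (writes_memory)
      return false;

   /* Bifrost cannot write gl_PointSize during IDVS. */
   if (is_bifrost && writes_psiz)
      return false;

   return true;
}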