From 75b0c4b5e1ee426aead296f2af854b2b64109972 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Sat, 30 Jan 2021 12:36:55 -0800
Subject: [PATCH] freedreno/ir3: Async shader compile

Compile the initial shader variants asynchronously on a per-screen
util_queue, instead of blocking in shader state creation.

Draw-time variants are still compiled synchronously, and I'm not sure
there is much (easy) benefit from generating them asynchronously:
without patching the cmdstream later, before batch submit, we'd end up
waiting for them immediately.  But we should mostly only hit draw-time
variants for desktop GL (and mostly legacy features).

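Note: this adds one new transform feedback (xfb) expected failure on
a5xx (see the a530 CI expectations).  Most of the xfb tests there are
already expected failures, so I think we just managed to change the
timing a bit, rather than the new failure being caused by async
compile itself.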

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3857
Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8795>
---
 .../freedreno/deqp-freedreno-a530-fails.txt   |   1 +
 .../drivers/freedreno/freedreno_screen.h      |   2 +
 .../drivers/freedreno/ir3/ir3_gallium.c       | 120 ++++++++++++++++--
 3 files changed, 115 insertions(+), 8 deletions(-)

diff --git a/ci-expects/freedreno/deqp-freedreno-a530-fails.txt b/ci-expects/freedreno/deqp-freedreno-a530-fails.txt
index 8671f627187..d399b24ccf8 100644
--- a/ci-expects/freedreno/deqp-freedreno-a530-fails.txt
+++ b/ci-expects/freedreno/deqp-freedreno-a530-fails.txt
@@ -14,6 +14,7 @@ dEQP-GLES3.functional.transform_feedback.array.interleaved.points.highp_mat3x2,F
 dEQP-GLES3.functional.transform_feedback.array.separate.lines.highp_mat3x4,Fail
 dEQP-GLES3.functional.transform_feedback.array.separate.points.lowp_mat2,Fail
 dEQP-GLES3.functional.transform_feedback.array.separate.points.mediump_uint,Fail
+dEQP-GLES3.functional.transform_feedback.array.separate.triangles.lowp_vec3,Fail
 dEQP-GLES3.functional.transform_feedback.array_element.interleaved.lines.highp_uvec4,Fail
 dEQP-GLES3.functional.transform_feedback.array_element.interleaved.points.highp_vec2,Fail
 dEQP-GLES3.functional.transform_feedback.array_element.interleaved.points.lowp_ivec3,Fail
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index 684c42c5dec..e49336f8d10 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -35,6 +35,7 @@
 #include "pipe/p_screen.h"
 #include "util/debug.h"
 #include "util/u_memory.h"
+#include "util/u_queue.h"
 #include "util/slab.h"
 #include "util/simple_mtx.h"
 #include "renderonly/renderonly.h"
@@ -87,6 +88,7 @@ struct fd_screen {
 	struct pipe_driver_query_info *perfcntr_queries;
 
 	void *compiler;          /* currently unused for a2xx */
+	struct util_queue compile_queue; /* currently unused for a2xx */
 
 	struct fd_device *dev;
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 130bfb4295f..65d5846eb84 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -51,8 +51,25 @@
  */
 struct ir3_shader_state {
 	struct ir3_shader *shader;
+
+	/* Fence signalled when async compile is completed: */
+	struct util_queue_fence ready;
 };
 
+/**
+ * Should initial variants be compiled synchronously?
+ *
+ * The only case where pipe_debug_message() is used in the initial-variants
+ * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
+ * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
+ * compile the initial shader variant asynchronously.
+ */
+static bool
+initial_variants_synchronous(struct fd_context *ctx)
+{
+	return unlikely(ctx->debug.debug_message && (fd_mesa_debug & FD_DBG_SHADERDB));
+}
+
 static void
 dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug)
 {
@@ -236,6 +253,27 @@ create_initial_variants(struct ir3_shader_state *hwcso,
 	shader->initial_variants_done = true;
 }
 
+static void
+create_initial_variants_async(void *job, int thread_index)
+{
+	struct ir3_shader_state *hwcso = job;
+	struct pipe_debug_callback debug = {};
+
+	create_initial_variants(hwcso, &debug);
+}
+
+static void
+create_initial_compute_variants_async(void *job, int thread_index)
+{
+	struct ir3_shader_state *hwcso = job;
+	struct ir3_shader *shader = hwcso->shader;
+	struct pipe_debug_callback debug = {};
+	static struct ir3_shader_key key; /* static is implicitly zeroed */
+
+	ir3_shader_variant(shader, key, false, &debug);
+	shader->initial_variants_done = true;
+}
+
 /* a bit annoying that compute-shader and normal shader state objects
  * aren't a bit more aligned.
  */
@@ -271,18 +309,26 @@ ir3_shader_compute_state_create(struct pipe_context *pctx,
 	}
 
 	struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);
+	struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
+
+	util_queue_fence_init(&hwcso->ready);
+	hwcso->shader = shader;
 
 	/* Immediately compile a standard variant.  We have so few variants in our
 	 * shaders, that doing so almost eliminates draw-time recompiles.  (This
 	 * is also how we get data from shader-db's ./run)
 	 */
-	static struct ir3_shader_key key; /* static is implicitly zeroed */
-	ir3_shader_variant(shader, key, false, &ctx->debug);
 
-	shader->initial_variants_done = true;
-
-	struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
-	hwcso->shader = shader;
+	if (initial_variants_synchronous(ctx)) {
+		static struct ir3_shader_key key; /* static is implicitly zeroed */
+		ir3_shader_variant(shader, key, false, &ctx->debug);
+		shader->initial_variants_done = true;
+	} else {
+		struct fd_screen *screen = ctx->screen;
+		util_queue_add_job(&screen->compile_queue, hwcso,
+				&hwcso->ready, create_initial_compute_variants_async,
+				NULL, 0);
+	}
 
 	return hwcso;
 }
@@ -322,10 +368,20 @@ ir3_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_stat
 	hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output);
 
 	/*
-	 * Create initial variants to avoid draw-time stalls:
+	 * Create initial variants to avoid draw-time stalls.  This is
+	 * normally done asynchronously, unless debug is enabled (which
+	 * will be the case for shader-db)
 	 */
 
-	create_initial_variants(hwcso, &ctx->debug);
+	util_queue_fence_init(&hwcso->ready);
+
+	if (initial_variants_synchronous(ctx)) {
+		create_initial_variants(hwcso, &ctx->debug);
+	} else {
+		util_queue_add_job(&ctx->screen->compile_queue, hwcso,
+				&hwcso->ready, create_initial_variants_async,
+				NULL, 0);
+	}
 
 	return hwcso;
 }
@@ -333,9 +389,18 @@ ir3_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_stat
 void
 ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
 {
+	struct fd_screen *screen = fd_context(pctx)->screen;
 	struct ir3_shader_state *hwcso = _hwcso;
 	struct ir3_shader *so = hwcso->shader;
 
+	/* util_queue_drop_job() guarantees that either:
+	 *  1) job did not execute
+	 *  2) job completed
+	 *
+	 * In either case the fence is signaled
+	 */
+	util_queue_drop_job(&screen->compile_queue, &hwcso->ready);
+
 	/* free the uploaded shaders, since this is handled outside of the
 	 * shared ir3 code (ie. not used by turnip):
 	 */
@@ -350,6 +415,7 @@ ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
 	}
 
 	ir3_shader_destroy(so);
+	util_queue_fence_destroy(&hwcso->ready);
 	free(hwcso);
 }
 
@@ -358,6 +424,7 @@ ir3_get_shader(struct ir3_shader_state *hwcso)
 {
 	if (!hwcso)
 		return NULL;
+	util_queue_fence_wait(&hwcso->ready);
 	return hwcso->shader;
 }
 
@@ -377,6 +444,26 @@ ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir, bool optimize)
 	ir3_finalize_nir(screen->compiler, nir);
 }
 
+static void
+ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen, unsigned max_threads)
+{
+	struct fd_screen *screen = fd_screen(pscreen);
+
+	/* This function doesn't allow a greater number of threads than
+	 * the queue had at its creation.
+	 */
+	util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
+}
+
+static bool
+ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
+		void *shader, enum pipe_shader_type shader_type)
+{
+	struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;
+
+	return util_queue_fence_is_signalled(&hwcso->ready);
+}
+
 void
 ir3_prog_init(struct pipe_context *pctx)
 {
@@ -403,7 +490,23 @@ ir3_screen_init(struct pipe_screen *pscreen)
 
 	screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id);
 
+	/* TODO do we want to limit things to # of fast cores, or just limit
+	 * based on total # of both big and little cores.  The little cores
+	 * tend to be in-order and probably much slower for compiling than
+	 * big cores.  OTOH if they are sitting idle, maybe it is useful to
+	 * use them?
+	 */
+	unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
+
+	util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
+			UTIL_QUEUE_INIT_RESIZE_IF_FULL |
+			UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY);
+
 	pscreen->finalize_nir = ir3_screen_finalize_nir;
+	pscreen->set_max_shader_compiler_threads =
+			ir3_set_max_shader_compiler_threads;
+	pscreen->is_parallel_shader_compilation_finished =
+			ir3_is_parallel_shader_compilation_finished;
 }
 
 void
@@ -411,6 +514,7 @@ ir3_screen_fini(struct pipe_screen *pscreen)
 {
 	struct fd_screen *screen = fd_screen(pscreen);
 
+	util_queue_destroy(&screen->compile_queue);
 	ir3_compiler_destroy(screen->compiler);
 	screen->compiler = NULL;
 }