radeonsi/gfx9: always compile monolithic ES-GS (asynchronously)

The monolithic ES-GS variant is compiled in addition to the non-monolithic variant. Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
2017-04-19 01:53:35 +02:00 · 2017-04-19 01:53:35 +02:00 · 2857b14bba
parent a82398a8f5
commit 2857b14bba
2 changed files with 28 additions and 1 deletions
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -445,12 +445,20 @@ struct si_shader_key {
 	} mono;

 	/* Optimization flags for asynchronous compilation only. */
-	union {
+	struct {
 		struct {
 			uint64_t	kill_outputs; /* "get_unique_index" bits */
 			uint32_t	kill_outputs2; /* "get_unique_index2" bits */
 			unsigned	clip_disable:1;
 		} hw_vs; /* HW VS (it can be VS, TES, GS) */
+
+		/* For shaders where monolithic variants have better code.
+		 *
+		 * This is a flag that has no effect on code generation,
+		 * but forces monolithic shaders to be used as soon as
+		 * possible, because it's in the "opt" group.
+		 */
+		unsigned	prefer_mono:1;
 	} opt;
 };

--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1279,6 +1279,25 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 							  key, &key->part.gs.vs_prolog);
 				key->part.gs.es = sctx->vs_shader.cso;
 			}
+
+			/* Merged ES-GS can have unbalanced wave usage.
+			 *
+			 * ES threads are per-vertex, while GS threads are
+			 * per-primitive. So without any amplification, there
+			 * are fewer GS threads than ES threads, which can result
+			 * in empty (no-op) GS waves. With too much amplification,
+			 * there are more GS threads than ES threads, which
+			 * can result in empty (no-op) ES waves.
+			 *
+			 * Non-monolithic shaders are implemented by setting EXEC
+			 * at the beginning of shader parts, and don't jump to
+			 * the end if EXEC is 0.
+			 *
+			 * Monolithic shaders use conditional blocks, so they can
+			 * jump and skip empty waves of ES or GS. So set this to
+			 * always use optimized variants, which are monolithic.
+			 */
+			key->opt.prefer_mono = 1;
 		}
 		key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
 		break;