freedreno/a6xx: move const emit to state group

Eventually we want to move nearly everything, but no other state depends
on const state, so this is the easiest one to move first.

For webgl aquarium, this reduces GPU load by about 10%, since for each
fish it does a uniform upload plus draw.. fish frequently are visible in
only a single tile, so this skips the uniform uploads for other tiles.

The additional step of avoiding WFI's when using CP_SET_DRAW_STATE seems
to be worth an additional 10% gain for aquarium.

Signed-off-by: Rob Clark <robdclark@gmail.com>
This commit is contained in:
Rob Clark 2018-10-07 13:59:27 -04:00
parent a398d26fd2
commit abcdf5627a
4 changed files with 70 additions and 15 deletions

View File

@ -359,7 +359,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
if (tex->num_samplers > 0) {
struct fd_ringbuffer *state =
fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4);
fd_ringbuffer_new_flags(ctx->pipe, tex->num_samplers * 4 * 4,
FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
for (unsigned i = 0; i < tex->num_samplers; i++) {
static const struct fd6_sampler_stateobj dummy_sampler = {};
const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ?
@ -389,7 +390,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
if (tex->num_textures > 0) {
struct fd_ringbuffer *state =
fd_ringbuffer_new_object(ctx->pipe, tex->num_textures * 16);
fd_ringbuffer_new_flags(ctx->pipe, tex->num_textures * 16 * 4,
FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
for (unsigned i = 0; i < tex->num_textures; i++) {
static const struct fd6_pipe_sampler_view dummy_view = {};
const struct fd6_pipe_sampler_view *view = tex->textures[i] ?
@ -791,9 +793,29 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
}
ir3_emit_vs_consts(vp, ring, ctx, emit->info);
if (!emit->key.binning_pass)
ir3_emit_fs_consts(fp, ring, ctx);
#define DIRTY_CONST (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST | \
FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)
if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) {
struct fd_ringbuffer *vsconstobj =
fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
ir3_emit_vs_consts(vp, vsconstobj, ctx, emit->info);
fd6_emit_add_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7);
fd_ringbuffer_del(vsconstobj);
}
if ((ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) &&
!emit->key.binning_pass) {
struct fd_ringbuffer *fsconstobj =
fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
ir3_emit_fs_consts(fp, fsconstobj, ctx);
fd6_emit_add_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x7);
fd_ringbuffer_del(fsconstobj);
}
struct pipe_stream_output_info *info = &vp->shader->stream_output;
if (info->num_outputs) {

View File

@ -43,7 +43,8 @@ struct fd_ringbuffer;
* need to be emit'd.
*/
enum fd6_state_id {
FD6_GROUP_CONST,
FD6_GROUP_VS_CONST,
FD6_GROUP_FS_CONST,
};
struct fd6_state_group {
@ -116,7 +117,7 @@ fd6_emit_add_group(struct fd6_emit *emit, struct fd_ringbuffer *stateobj,
if (fd_ringbuffer_size(stateobj) == 0)
return;
struct fd6_state_group *g = &emit->groups[emit->num_groups++];
g->stateobj = stateobj;
g->stateobj = fd_ringbuffer_ref(stateobj);
g->group_id = group_id;
g->enable_mask = enable_mask;
}

View File

@ -751,6 +751,13 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0x5) | 0x10);
}
OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
CP_SET_DRAW_STATE__0_GROUP_ID(0));
OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
OUT_RING(ring, 0x0);

View File

@ -552,6 +552,18 @@ ir3_shader_outputs(const struct ir3_shader *so)
#include "freedreno_resource.h"
/* Emit a wait-for-idle for @ring, but only when const state is being
 * written directly into the command stream.  State objects are evaluated
 * by CP_SET_DRAW_STATE at draw time, so no WFI is needed for them.
 */
static inline void
ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
/* when we emit const state via ring (IB2) we need a WFI, but when
 * it is emit'd via stateobj, we don't
 */
if (ring->flags & FD_RINGBUFFER_OBJECT)
return;
fd_wfi(batch, ring);
}
static void
emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
@ -579,7 +591,7 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
size = MIN2(size, 4 * max_const);
if (size > 0) {
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
ctx->emit_const(ring, v->type, 0,
cb->buffer_offset, size,
cb->user_buffer, cb->buffer);
@ -611,7 +623,7 @@ emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
}
}
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
}
}
@ -631,7 +643,7 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
sizes[off] = sb->sb[index].buffer_size;
}
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
ctx->emit_const(ring, v->type, offset * 4,
0, ARRAY_SIZE(sizes), sizes, NULL);
}
@ -673,7 +685,7 @@ emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
}
}
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
ctx->emit_const(ring, v->type, offset * 4,
0, ARRAY_SIZE(dims), dims, NULL);
}
@ -696,7 +708,7 @@ emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
size *= 4;
if (size > 0) {
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
ctx->emit_const(ring, v->type, base,
0, size, v->immediates[0].val, NULL);
}
@ -729,7 +741,7 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
}
}
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
}
}
@ -787,6 +799,19 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
{
enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];
/* When we use CP_SET_DRAW_STATE objects to emit constant state,
* if we emit any of it we need to emit all. This is because
* we are using the same state-group-id each time for uniform
* state, and if previous update is never evaluated (due to no
* visible primitives in the current tile) then the new stateobj
* completely replaces the old one.
*
* Possibly if we split up different parts of the const state to
* different state-objects we could avoid this.
*/
if (dirty && (ring->flags & FD_RINGBUFFER_OBJECT))
dirty = ~0;
if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
struct fd_constbuf_stateobj *constbuf;
bool shader_dirty;
@ -846,7 +871,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
vertex_params_size = ARRAY_SIZE(vertex_params);
}
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
bool needs_vtxid_base =
ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);
@ -918,7 +943,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
/* emit compute-shader driver-params: */
uint32_t offset = v->constbase.driver_param;
if (v->constlen > offset) {
fd_wfi(ctx->batch, ring);
ring_wfi(ctx->batch, ring);
if (info->indirect) {
struct pipe_resource *indirect = NULL;