freedreno/a3xx/compiler: half-precision output

Using generic shaders caused a measurable fps drop, which was isolated to the use of full precision (vs half precision) output. This is an attempt to regain that lost performance by using half precision solid/blit shaders (when the output format is not float32). Note: for the built-in shaders, I would not expect them to be register starved. And in fact it is the solid frag shader that seems to have the biggest impact. So I suspect you get double the pixel pipe units (or half the cycles) when the output is half precision. So there may be some gain to using half precision output for application shaders as well, even though the rest of register usage is still full precision. But for half precision to work for more complex shaders, we need to deal with some constraints, like cat2 needing the same precision for its two src registers. So for now it is not enabled by default except for the built-in shaders. Signed-off-by: Rob Clark <robclark@freedesktop.org>
2014-02-22 09:46:39 -05:00 · 2014-02-22 09:46:39 -05:00 · 3f7239ca0e
parent 141ae71671
commit 3f7239ca0e
6 changed files with 130 additions and 10 deletions
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@ -48,6 +48,25 @@
 #include "instr-a3xx.h"
 #include "ir3.h"
 /* NOTE on half/full precision:
 * Currently, the front end (ie. basically this file) does everything in
 * full precision (with the exception of trans_arl() which doesn't work
 * currently.. we reject anything with relative addressing and fall back
 * to the old compiler).
 *
 * In the RA step, if half_precision, it will assign the output to hr0.x
 * but use full precision everywhere else.
 *
 * Eventually we'll need a better way to communicate type information
 * to RA so that it can more properly assign both half and full precision
 * registers.  (And presumably double precision pairs for a4xx?)  This
 * would let us make more use of half precision registers, while still
 * keeping things like tex coords in full precision registers.
 *
 * Since the RA is dealing with patching instruction types for half
 * precision output, we can ignore that in the front end and just always
 * create full precision instructions.
 */
 struct fd3_compile_context {
 	const struct tgsi_token *tokens;
@ -2030,7 +2049,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
 		ir3_dump_instr_list(ctx.block->head);
 	}
-	ret = ir3_block_ra(ctx.block, so->type);
+	ret = ir3_block_ra(ctx.block, so->type, key.half_precision);
 	if (ret)
 		goto out;
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@ -103,6 +103,9 @@ fd3_draw(struct fd_context *ctx, const struct pipe_draw_info *info)
 			/* do binning pass first: */
 			.binning_pass = true,
 			.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
 			// TODO set .half_precision based on render target format,
 			// ie. float16 and smaller use half, float32 use full..
 			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
 	};
 	draw_impl(ctx, info, ctx->binning_ring,
 			dirty & ~(FD_DIRTY_BLEND), key);
@ -126,6 +129,7 @@ fd3_clear_binning(struct fd_context *ctx, unsigned dirty)
 	struct fd_ringbuffer *ring = ctx->binning_ring;
 	struct fd3_shader_key key = {
 			.binning_pass = true,
 			.half_precision = true,
 	};
 	fd3_emit_state(ctx, ring, &ctx->solid_prog, dirty, key);
@ -166,6 +170,7 @@ fd3_clear(struct fd_context *ctx, unsigned buffers,
 	unsigned dirty = ctx->dirty;
 	unsigned ce, i;
 	struct fd3_shader_key key = {
 			.half_precision = true,
 	};
 	dirty &= FD_DIRTY_VIEWPORT | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR;
--- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c
@ -44,6 +44,9 @@
 #include "fd3_zsa.h"
 /* File-scope shader key: unconditionally requests half precision
 * output.
 */
 static const struct fd3_shader_key key = {
 		// XXX should set this based on render target format!  We don't
 		// want half_precision if float32 render target!!!
 		.half_precision = true,
 };
 static void
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@ -101,7 +101,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
 	v->type = so->type;
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump tgsi: type=%d", so->type);
+		DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", so->type,
 			key.binning_pass, key.color_two_side, key.half_precision);
 		tgsi_dump(tokens, 0);
 	}
@ -138,7 +139,8 @@ create_variant(struct fd3_shader_stateobj *so, struct fd3_shader_key key)
 		fixup_vp_regfootprint(v);
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("disassemble: type=%d", v->type);
+		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
 			key.binning_pass, key.color_two_side, key.half_precision);
 		disasm_a3xx(fd_bo_map(v->bo), v->info.sizedwords, 0, v->type);
 	}
--- a/src/gallium/drivers/freedreno/a3xx/ir3.h
+++ b/src/gallium/drivers/freedreno/a3xx/ir3.h
@ -379,7 +379,8 @@ void ir3_block_cp(struct ir3_block *block);
 void ir3_block_sched(struct ir3_block *block);
 /* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type);
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 		bool half_precision);
 #ifndef ARRAY_SIZE
--- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
@ -53,10 +53,19 @@
 /* Context for one register-assignment run; set up by ir3_block_ra()
 * and handed to block_ra().
 */
 struct ir3_ra_ctx {
 	struct ir3_block *block;
 	enum shader_t type;
 	/* when set, output_base() returns 0 even for fragment shaders,
 	 * and block_ra() ORs REG_HALF into the base register used for
 	 * the block outputs:
 	 */
 	bool half_precision;
 	int cnt;
 	bool error;
 };
 /* sorta ugly way to retrofit half-precision support.. rather than
 * passing extra param around, just OR in a high bit.  All the low
 * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
 * will continue to work as long as you don't underflow (and that
 * would go badly anyways).
 */
 #define REG_HALF  0x8000
 struct ir3_ra_assignment {
 	int8_t  off;        /* offset of instruction dst within range */
 	uint8_t num;        /* number of components for the range */
@ -91,7 +100,7 @@ static int output_base(struct ir3_ra_ctx *ctx)
 	 * see how because the blob driver always uses r0.x (ie.
 	 * all zeros)
 	 */
-	if (ctx->type == SHADER_FRAGMENT)
+	if ((ctx->type == SHADER_FRAGMENT) && !ctx->half_precision)
 		return 2;
 	return 0;
 }
@ -348,12 +357,88 @@ static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v)
 	return (struct ra_assign_visitor *)v;
 }
 /* Map a 32-bit type to the 16-bit type of the same kind
 * (F32 -> F16, U32 -> U16, S32 -> S16).  A type that is already
 * 16-bit is returned unchanged, so calling this twice on the same
 * instruction is harmless.  Any other type trips assert(0); if
 * asserts are compiled out, ~0 is returned instead.
 */
 static type_t half_type(type_t type)
 {
 	switch (type) {
 	case TYPE_F32: return TYPE_F16;
 	case TYPE_U32: return TYPE_U16;
 	case TYPE_S32: return TYPE_S16;
 	/* instructions may already be fixed up: */
 	case TYPE_F16:
 	case TYPE_U16:
 	case TYPE_S16:
 		return type;
 	default:
 		/* no 16-bit counterpart known for this type: */
 		assert(0);
 		return ~0;
 	}
 }
 /* some instructions need fix-up if dst register is half precision:
 *
 *  - category 1 (mov): cat1.dst_type is switched to its 16-bit type
 *  - category 3: the opcode is swapped for its 16-bit variant
 *  - category 5: cat5.type is switched to its 16-bit type
 *
 * Instructions of any other category are left untouched.  The
 * fix-up is idempotent: an instruction that already carries the
 * 16-bit type/opcode is not changed again.
 */
 static void fixup_half_instr_dst(struct ir3_instruction *instr)
 {
 	switch (instr->category) {
 	case 1: /* move instructions */
 		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
 		break;
 	case 3:
 		/* replace each 32-bit opcode with its 16-bit variant: */
 		switch (instr->opc) {
 		case OPC_MAD_F32:
 			instr->opc = OPC_MAD_F16;
 			break;
 		case OPC_SEL_B32:
 			instr->opc = OPC_SEL_B16;
 			break;
 		case OPC_SEL_S32:
 			instr->opc = OPC_SEL_S16;
 			break;
 		case OPC_SEL_F32:
 			instr->opc = OPC_SEL_F16;
 			break;
 		case OPC_SAD_S32:
 			instr->opc = OPC_SAD_S16;
 			break;
 		/* instructions may already be fixed up: */
 		case OPC_MAD_F16:
 		case OPC_SEL_B16:
 		case OPC_SEL_S16:
 		case OPC_SEL_F16:
 		case OPC_SAD_S16:
 			break;
 		default:
 			/* any other category 3 opcode is not handled here: */
 			assert(0);
 			break;
 		}
 		break;
 	case 5:
 		instr->cat5.type = half_type(instr->cat5.type);
 		break;
 	}
 }
 /* some instructions need fix-up if src register is half precision:
 * only category 1 (mov) is patched, by switching cat1.src_type to
 * its 16-bit type.  Instructions of any other category are left
 * untouched.
 */
 static void fixup_half_instr_src(struct ir3_instruction *instr)
 {
 	switch (instr->category) {
 	case 1: /* move instructions */
 		instr->cat1.src_type = half_type(instr->cat1.src_type);
 		break;
 	}
 }
 static void ra_assign_reg(struct ir3_visitor *v,
 		struct ir3_instruction *instr, struct ir3_register *reg)
 {
 	struct ra_assign_visitor *a = ra_assign_visitor(v);
 	reg->flags &= ~IR3_REG_SSA;
-	reg->num = a->num;
+	reg->num = a->num & ~REG_HALF;
 	if (a->num & REG_HALF) {
 		reg->flags |= IR3_REG_HALF;
 		/* if dst reg being assigned, patch up the instr: */
 		if (reg == instr->regs[0])
 			fixup_half_instr_dst(instr);
 		else
 			fixup_half_instr_src(instr);
 	}
 }
 static void ra_assign_dst_shader_input(struct ir3_visitor *v,
@ -429,8 +514,8 @@ static void ra_assign(struct ir3_ra_ctx *ctx,
 	/* if we've already visited this instruction, bail now: */
 	if (ir3_instr_check_mark(assigner)) {
-		debug_assert(assigner->regs[0]->num == num);
+		debug_assert(assigner->regs[0]->num == (num & ~REG_HALF));
-		if (assigner->regs[0]->num != num) {
+		if (assigner->regs[0]->num != (num & ~REG_HALF)) {
 			/* impossible situation, should have been resolved
 			 * at an earlier stage by inserting extra mov's:
 			 */
@ -593,6 +678,9 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		base = alloc_block(ctx, NULL, block->noutputs + off);
 		if (ctx->half_precision)
 			base |= REG_HALF;
 		for (i = 0; i < block->noutputs; i++)
 			if (block->outputs[i])
 				ra_assign(ctx, block->outputs[i], base + i + off);
@ -600,7 +688,7 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		if (ctx->type == SHADER_FRAGMENT) {
 			for (i = 0; i < block->ninputs; i++)
 				if (block->inputs[i])
-					ra_assign(ctx, block->inputs[i], base + i);
+					ra_assign(ctx, block->inputs[i], (base & ~REG_HALF) + i);
 		} else {
 			for (i = 0; i < block->ninputs; i++)
 				if (block->inputs[i])
@ -623,11 +711,13 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 	return 0;
 }
-int ir3_block_ra(struct ir3_block *block, enum shader_t type)
+int ir3_block_ra(struct ir3_block *block, enum shader_t type,
 		bool half_precision)
 {
 	struct ir3_ra_ctx ctx = {
 			.block = block,
 			.type = type,
 			.half_precision = half_precision,
 	};
 	ir3_shader_clear_mark(block->shader);
 	return block_ra(&ctx, block);