From 579473f8f838aade82ad58949902910fa5fe15e3 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sun, 16 Feb 2014 07:35:20 -0500
Subject: [PATCH] freedreno/a3xx/compiler: handle kill properly (new compiler)

Since 'kill' does not produce a result, the new compiler was happily
optimizing it out.  We need to instead track kills similarly to
outputs.  But since there is no non-predicated kill instruction
(and for flattened if/else we do want them to be predicated), we need
to track the topmost branch condition on the stack and use that as src
arg to the kill.  For a kill at the topmost level, we have to generate
an immediate 1.0 to feed into the cmps.f for setting the predicate
register.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 .../drivers/freedreno/a3xx/fd3_compiler.c     | 113 +++++++++++++++---
 src/gallium/drivers/freedreno/a3xx/ir3.h      |  10 +-
 .../drivers/freedreno/a3xx/ir3_depth.c        |   2 +-
 src/gallium/drivers/freedreno/a3xx/ir3_ra.c   |   6 +-
 4 files changed, 105 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
index b1ed2e09457..30c2b51743d 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@@ -103,9 +103,16 @@ struct fd3_compile_context {
 	/* stack of branch instructions that mark (potentially nested)
 	 * branch if/else/loop/etc
 	 */
-	struct ir3_instruction *branch[16];
+	struct {
+		struct ir3_instruction *instr, *cond;
+		bool inv;   /* true iff in else leg of branch */
+	} branch[16];
 	unsigned int branch_count;
 
+	/* list of kill instructions: */
+	struct ir3_instruction *kill[16];
+	unsigned int kill_count;
+
 	/* used when dst is same as one of the src, to avoid overwriting a
 	 * src element before the remaining scalar instructions that make
 	 * up the vector operation
@@ -135,6 +142,7 @@ compile_init(struct fd3_compile_context *ctx, struct fd3_shader_stateobj *so,
 	ctx->next_inloc = 8;
 	ctx->num_internal_temps = 0;
 	ctx->branch_count = 0;
+	ctx->kill_count = 0;
 	ctx->block = NULL;
 	ctx->current_instr = NULL;
 	ctx->num_output_updates = 0;
@@ -274,6 +282,9 @@ push_block(struct fd3_compile_context *ctx)
 	ntmp = SCALAR_REGS(TEMPORARY);
 	ntmp += 4 * 4;
 
+	nout = SCALAR_REGS(OUTPUT);
+	nin  = SCALAR_REGS(INPUT);
+
 	/* for outermost block, 'inputs' are the actual shader INPUT
 	 * register file.  Reads from INPUT registers always go back to
 	 * top block.  For nested blocks, 'inputs' is used to track any
@@ -284,17 +295,19 @@ push_block(struct fd3_compile_context *ctx)
 		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
 		 * position)
 		 */
-		nin = SCALAR_REGS(INPUT);
-		if (ctx->type == TGSI_PROCESSOR_FRAGMENT)
+		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 			nin = MAX2(2, nin);
+			nout += ARRAY_SIZE(ctx->kill);
+		}
 	} else {
 		nin = ntmp;
 	}
 
-	nout = SCALAR_REGS(OUTPUT);
-
 	block = ir3_block_create(ctx->ir, ntmp, nin, nout);
 
+	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
+		block->noutputs -= ARRAY_SIZE(ctx->kill);
+
 	block->parent = ctx->block;
 	ctx->block = block;
 
@@ -1246,15 +1259,23 @@ trans_cmp(const struct instr_translater *t,
  */
 
 static void
-push_branch(struct fd3_compile_context *ctx, struct ir3_instruction *instr)
+push_branch(struct fd3_compile_context *ctx, bool inv,
+		struct ir3_instruction *instr, struct ir3_instruction *cond)
 {
-	ctx->branch[ctx->branch_count++] = instr;
+	unsigned int idx = ctx->branch_count++;
+	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
+	ctx->branch[idx].instr = instr;
+	ctx->branch[idx].inv = inv;
+	/* else side of branch has same condition: */
+	if (!inv)
+		ctx->branch[idx].cond = cond;
 }
 
 static struct ir3_instruction *
 pop_branch(struct fd3_compile_context *ctx)
 {
-	return ctx->branch[--ctx->branch_count];
+	unsigned int idx = --ctx->branch_count;
+	return ctx->branch[idx].instr;
 }
 
 static void
@@ -1262,7 +1283,7 @@ trans_if(const struct instr_translater *t,
 		struct fd3_compile_context *ctx,
 		struct tgsi_full_instruction *inst)
 {
-	struct ir3_instruction *instr;
+	struct ir3_instruction *instr, *cond;
 	struct tgsi_src_register *src = &inst->Src[0].Register;
 	struct tgsi_dst_register tmp_dst;
 	struct tgsi_src_register *tmp_src;
@@ -1274,25 +1295,22 @@ trans_if(const struct instr_translater *t,
 	if (is_const(src))
 		src = get_unconst(ctx, src);
 
-	/* cmps.f.eq tmp0, b, {0.0} */
+	/* cmps.f.ne tmp0, b, {0.0} */
 	instr = instr_create(ctx, 2, OPC_CMPS_F);
 	add_dst_reg(ctx, instr, &tmp_dst, 0);
 	add_src_reg(ctx, instr, src, src->SwizzleX);
 	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
-	instr->cat2.condition = IR3_COND_EQ;
+	instr->cat2.condition = IR3_COND_NE;
 
-	/* add.s tmp0, tmp0, -1 */
-	instr = instr_create(ctx, 2, OPC_ADD_S);
-	add_dst_reg(ctx, instr, &tmp_dst, TGSI_SWIZZLE_X);
-	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
-	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -1;
+	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
+	cond = instr->regs[1]->instr;
 
 	/* meta:flow tmp0 */
 	instr = instr_create(ctx, -1, OPC_META_FLOW);
 	ir3_reg_create(instr, 0, 0);  /* dummy dst */
 	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
 
-	push_branch(ctx, instr);
+	push_branch(ctx, false, instr, cond);
 	instr->flow.if_block = push_block(ctx);
 }
 
@@ -1310,7 +1328,7 @@ trans_else(const struct instr_translater *t,
 	compile_assert(ctx, (instr->category == -1) &&
 			(instr->opc == OPC_META_FLOW));
 
-	push_branch(ctx, instr);
+	push_branch(ctx, true, instr, NULL);
 	instr->flow.else_block = push_block(ctx);
 }
 
@@ -1483,6 +1501,53 @@ trans_endif(const struct instr_translater *t,
 	// TODO maybe we want to compact block->inputs?
 }
 
+/*
+ * Kill / Kill-if
+ */
+
+static void
+trans_kill(const struct instr_translater *t,
+		struct fd3_compile_context *ctx,
+		struct tgsi_full_instruction *inst)
+{
+	struct ir3_instruction *instr, *immed, *cond = NULL;
+	bool inv = false;
+
+	switch (t->tgsi_opc) {
+	case TGSI_OPCODE_KILL:
+		/* unconditional kill, use enclosing if condition: */
+		if (ctx->branch_count > 0) {
+			unsigned int idx = ctx->branch_count - 1;
+			cond = ctx->branch[idx].cond;
+			inv = ctx->branch[idx].inv;
+		} else {
+			cond = create_immed(ctx, 1.0);
+		}
+
+		break;
+	}
+
+	compile_assert(ctx, cond);
+
+	immed = create_immed(ctx, 0.0);
+
+	/* cmps.f.ne p0.x, cond, {0.0} */
+	instr = instr_create(ctx, 2, OPC_CMPS_F);
+	instr->cat2.condition = IR3_COND_NE;
+	ir3_reg_create(instr, regid(REG_P0, 0), 0);
+	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
+	cond = instr;
+
+	/* kill p0.x */
+	instr = instr_create(ctx, 0, OPC_KILL);
+	instr->cat0.inv = inv;
+	ir3_reg_create(instr, 0, 0);  /* dummy dst */
+	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
+
+	ctx->kill[ctx->kill_count++] = instr;
+}
+
 /*
  * Handlers for TGSI instructions which do have 1:1 mapping to native
  * instructions:
@@ -1672,7 +1737,7 @@ static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
 	INSTR(ELSE,         trans_else),
 	INSTR(ENDIF,        trans_endif),
 	INSTR(END,          instr_cat0, .opc = OPC_END),
-	INSTR(KILL,         instr_cat0, .opc = OPC_KILL),
+	INSTR(KILL,         trans_kill, .opc = OPC_KILL),
 };
 
 static fd3_semantic
@@ -1944,6 +2009,16 @@ fd3_compile_shader(struct fd3_shader_stateobj *so,
 
 	compile_instructions(&ctx);
 
+	/* at this point, we want the kill's in the outputs array too,
+	 * so that they get scheduled (since they have no dst).. we've
+	 * already ensured that the array is big enough in push_block():
+	 */
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
+		struct ir3_block *block = ctx.block;
+		for (i = 0; i < ctx.kill_count; i++)
+			block->outputs[block->noutputs++] = ctx.kill[i];
+	}
+
 	if (fd_mesa_debug & FD_DBG_OPTDUMP)
 		compile_dump(&ctx);
 
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h
index ccd3b0b54b4..9c57a653553 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3.h
+++ b/src/gallium/drivers/freedreno/a3xx/ir3.h
@@ -307,6 +307,11 @@ static inline uint32_t reg_comp(struct ir3_register *reg)
 	return reg->num & 0x3;
 }
 
+static inline bool is_flow(struct ir3_instruction *instr)
+{
+	return (instr->category == 0);
+}
+
 static inline bool is_alu(struct ir3_instruction *instr)
 {
 	return (1 <= instr->category) && (instr->category <= 3);
@@ -336,11 +341,6 @@ static inline bool is_meta(struct ir3_instruction *instr)
 	return (instr->category == -1);
 }
 
-static inline bool is_gpr(struct ir3_register *reg)
-{
-	return !(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED));
-}
-
 /* TODO combine is_gpr()/reg_gpr().. */
 static inline bool reg_gpr(struct ir3_register *r)
 {
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c
index 580ae08da2c..452257884df 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c
@@ -68,7 +68,7 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 		return 0;
 
 	/* assigner must be alu: */
-	if (is_sfu(consumer) || is_tex(consumer)) {
+	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) {
 		return 8;
 	} else if ((consumer->category == 3) &&
 			is_mad(consumer->opc) && (n == 2)) {
diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
index 6c868e21791..22c58e6c9ad 100644
--- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c
@@ -456,6 +456,10 @@ static void ir3_instr_ra(struct ir3_ra_ctx *ctx,
 	if (instr->regs_count == 0)
 		return;
 
+	/* skip writes to a0, p0, etc */
+	if (!reg_gpr(instr->regs[0]))
+		return;
+
 	/* if we've already visited this instruction, bail now: */
 	if (instr->flags & IR3_INSTR_MARK)
 		return;
@@ -493,7 +497,7 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 		for (i = 1; i < n->regs_count; i++) {
 			struct ir3_register *reg = n->regs[i];
 
-			if (is_gpr(reg)) {
+			if (reg_gpr(reg)) {
 
 				/* TODO: we probably only need (ss) for alu
 				 * instr consuming sfu result.. need to make