From 579473f8f838aade82ad58949902910fa5fe15e3 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sun, 16 Feb 2014 07:35:20 -0500 Subject: [PATCH] freedreno/a3xx/compiler: handle kill properly (new compiler) Since 'kill' does not produce a result, the new compiler was happily optimizing them out. We need to instead track 'kill's similar to outputs. But since there is no non-predicated kill instruction (and for flattened if/else we do want them to be predicated), we need to track the topmost branch condition on the stack and use that as src arg to the kill. For a kill at the topmost level, we have to generate an immediate 1.0 to feed into the cmps.f for setting the predicate register. Signed-off-by: Rob Clark --- .../drivers/freedreno/a3xx/fd3_compiler.c | 113 +++++++++++++++--- src/gallium/drivers/freedreno/a3xx/ir3.h | 10 +- .../drivers/freedreno/a3xx/ir3_depth.c | 2 +- src/gallium/drivers/freedreno/a3xx/ir3_ra.c | 6 +- 4 files changed, 105 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index b1ed2e09457..30c2b51743d 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -103,9 +103,16 @@ struct fd3_compile_context { /* stack of branch instructions that mark (potentially nested) * branch if/else/loop/etc */ - struct ir3_instruction *branch[16]; + struct { + struct ir3_instruction *instr, *cond; + bool inv; /* true iff in else leg of branch */ + } branch[16]; unsigned int branch_count; + /* list of kill instructions: */ + struct ir3_instruction *kill[16]; + unsigned int kill_count; + /* used when dst is same as one of the src, to avoid overwriting a * src element before the remaining scalar instructions that make * up the vector operation @@ -135,6 +142,7 @@ compile_init(struct fd3_compile_context *ctx, struct fd3_shader_stateobj *so, ctx->next_inloc = 8; ctx->num_internal_temps = 0; ctx->branch_count = 0; 
+ ctx->kill_count = 0; ctx->block = NULL; ctx->current_instr = NULL; ctx->num_output_updates = 0; @@ -274,6 +282,9 @@ push_block(struct fd3_compile_context *ctx) ntmp = SCALAR_REGS(TEMPORARY); ntmp += 4 * 4; + nout = SCALAR_REGS(OUTPUT); + nin = SCALAR_REGS(INPUT); + /* for outermost block, 'inputs' are the actual shader INPUT * register file. Reads from INPUT registers always go back to * top block. For nested blocks, 'inputs' is used to track any @@ -284,17 +295,19 @@ push_block(struct fd3_compile_context *ctx) /* NOTE: fragment shaders actually have two inputs (r0.xy, the * position) */ - nin = SCALAR_REGS(INPUT); - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { nin = MAX2(2, nin); + nout += ARRAY_SIZE(ctx->kill); + } } else { nin = ntmp; } - nout = SCALAR_REGS(OUTPUT); - block = ir3_block_create(ctx->ir, ntmp, nin, nout); + if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block) + block->noutputs -= ARRAY_SIZE(ctx->kill); + block->parent = ctx->block; ctx->block = block; @@ -1246,15 +1259,23 @@ trans_cmp(const struct instr_translater *t, */ static void -push_branch(struct fd3_compile_context *ctx, struct ir3_instruction *instr) +push_branch(struct fd3_compile_context *ctx, bool inv, + struct ir3_instruction *instr, struct ir3_instruction *cond) { - ctx->branch[ctx->branch_count++] = instr; + unsigned int idx = ctx->branch_count++; + compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch)); + ctx->branch[idx].instr = instr; + ctx->branch[idx].inv = inv; + /* else side of branch has same condition: */ + if (!inv) + ctx->branch[idx].cond = cond; } static struct ir3_instruction * pop_branch(struct fd3_compile_context *ctx) { - return ctx->branch[--ctx->branch_count]; + unsigned int idx = --ctx->branch_count; + return ctx->branch[idx].instr; } static void @@ -1262,7 +1283,7 @@ trans_if(const struct instr_translater *t, struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst) { - struct ir3_instruction *instr; + 
struct ir3_instruction *instr, *cond; struct tgsi_src_register *src = &inst->Src[0].Register; struct tgsi_dst_register tmp_dst; struct tgsi_src_register *tmp_src; @@ -1274,25 +1295,22 @@ trans_if(const struct instr_translater *t, if (is_const(src)) src = get_unconst(ctx, src); - /* cmps.f.eq tmp0, b, {0.0} */ + /* cmps.f.ne tmp0, b, {0.0} */ instr = instr_create(ctx, 2, OPC_CMPS_F); add_dst_reg(ctx, instr, &tmp_dst, 0); add_src_reg(ctx, instr, src, src->SwizzleX); add_src_reg(ctx, instr, &constval, constval.SwizzleX); - instr->cat2.condition = IR3_COND_EQ; + instr->cat2.condition = IR3_COND_NE; - /* add.s tmp0, tmp0, -1 */ - instr = instr_create(ctx, 2, OPC_ADD_S); - add_dst_reg(ctx, instr, &tmp_dst, TGSI_SWIZZLE_X); - add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); - ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -1; + compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */ + cond = instr->regs[1]->instr; /* meta:flow tmp0 */ instr = instr_create(ctx, -1, OPC_META_FLOW); ir3_reg_create(instr, 0, 0); /* dummy dst */ add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); - push_branch(ctx, instr); + push_branch(ctx, false, instr, cond); instr->flow.if_block = push_block(ctx); } @@ -1310,7 +1328,7 @@ trans_else(const struct instr_translater *t, compile_assert(ctx, (instr->category == -1) && (instr->opc == OPC_META_FLOW)); - push_branch(ctx, instr); + push_branch(ctx, true, instr, NULL); instr->flow.else_block = push_block(ctx); } @@ -1483,6 +1501,53 @@ trans_endif(const struct instr_translater *t, // TODO maybe we want to compact block->inputs? 
} +/* + * Kill / Kill-if + */ + +static void +trans_kill(const struct instr_translater *t, + struct fd3_compile_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct ir3_instruction *instr, *immed, *cond = NULL; + bool inv = false; + + switch (t->tgsi_opc) { + case TGSI_OPCODE_KILL: + /* unconditional kill, use enclosing if condition: */ + if (ctx->branch_count > 0) { + unsigned int idx = ctx->branch_count - 1; + cond = ctx->branch[idx].cond; + inv = ctx->branch[idx].inv; + } else { + cond = create_immed(ctx, 1.0); + } + + break; + } + + compile_assert(ctx, cond); + + immed = create_immed(ctx, 0.0); + + /* cmps.f.ne p0.x, cond, {0.0} */ + instr = instr_create(ctx, 2, OPC_CMPS_F); + instr->cat2.condition = IR3_COND_NE; + ir3_reg_create(instr, regid(REG_P0, 0), 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed; + cond = instr; + + /* kill p0.x */ + instr = instr_create(ctx, 0, OPC_KILL); + instr->cat0.inv = inv; + ir3_reg_create(instr, 0, 0); /* dummy dst */ + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond; + + ctx->kill[ctx->kill_count++] = instr; +} + /* * Handlers for TGSI instructions which do have 1:1 mapping to native * instructions: @@ -1672,7 +1737,7 @@ static const struct instr_translater translaters[TGSI_OPCODE_LAST] = { INSTR(ELSE, trans_else), INSTR(ENDIF, trans_endif), INSTR(END, instr_cat0, .opc = OPC_END), - INSTR(KILL, instr_cat0, .opc = OPC_KILL), + INSTR(KILL, trans_kill, .opc = OPC_KILL), }; static fd3_semantic @@ -1944,6 +2009,16 @@ fd3_compile_shader(struct fd3_shader_stateobj *so, compile_instructions(&ctx); + /* at this point, we want the kill's in the outputs array too, + * so that they get scheduled (since they have no dst).. 
we've + * already ensured that the array is big enough in push_block(): + */ + if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { + struct ir3_block *block = ctx.block; + for (i = 0; i < ctx.kill_count; i++) + block->outputs[block->noutputs++] = ctx.kill[i]; + } + if (fd_mesa_debug & FD_DBG_OPTDUMP) compile_dump(&ctx); diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h index ccd3b0b54b4..9c57a653553 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ b/src/gallium/drivers/freedreno/a3xx/ir3.h @@ -307,6 +307,11 @@ static inline uint32_t reg_comp(struct ir3_register *reg) return reg->num & 0x3; } +static inline bool is_flow(struct ir3_instruction *instr) +{ + return (instr->category == 0); +} + static inline bool is_alu(struct ir3_instruction *instr) { return (1 <= instr->category) && (instr->category <= 3); @@ -336,11 +341,6 @@ static inline bool is_meta(struct ir3_instruction *instr) return (instr->category == -1); } -static inline bool is_gpr(struct ir3_register *reg) -{ - return !(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)); -} - /* TODO combine is_gpr()/reg_gpr().. 
*/ static inline bool reg_gpr(struct ir3_register *r) { diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c index 580ae08da2c..452257884df 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c @@ -68,7 +68,7 @@ int ir3_delayslots(struct ir3_instruction *assigner, return 0; /* assigner must be alu: */ - if (is_sfu(consumer) || is_tex(consumer)) { + if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer)) { return 8; } else if ((consumer->category == 3) && is_mad(consumer->opc) && (n == 2)) { diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c index 6c868e21791..22c58e6c9ad 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c @@ -456,6 +456,10 @@ static void ir3_instr_ra(struct ir3_ra_ctx *ctx, if (instr->regs_count == 0) return; + /* skip writes to a0, p0, etc */ + if (!reg_gpr(instr->regs[0])) + return; + /* if we've already visited this instruction, bail now: */ if (instr->flags & IR3_INSTR_MARK) return; @@ -493,7 +497,7 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) for (i = 1; i < n->regs_count; i++) { struct ir3_register *reg = n->regs[i]; - if (is_gpr(reg)) { + if (reg_gpr(reg)) { /* TODO: we probably only need (ss) for alu * instr consuming sfu result.. need to make