/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark
 */

#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"

#include "ir3.h"

/*
 * Register Assignment:
 *
 * NOTE: currently only works on a single basic block.. need to think
 * about how multiple basic blocks are going to get scheduled.  But I
 * think I want to re-arrange how blocks work, ie. get rid of the block
 * nesting thing..
 *
 * NOTE: we could do register coalescing (eliminating moves) as part of
 * the RA step.. OTOH I think we need to do scheduling before register
 * assignment, and removing a mov affects scheduling (unless we leave a
 * placeholder nop, which seems lame), so I'm not really sure how
 * practical it is to do both in a single stage.  But OTOH I'm not
 * really sure of a sane way for the CP stage to realize when it cannot
 * remove a mov due to multi-register constraints..
 *
 * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has
 * some ideas to handle array allocation with a more conventional
 * graph coloring algorithm for register assignment, which might be
 * a good alternative to the current algo.  However afaict it cannot
 * handle overlapping arrays, which is a scenario that we have to
 * deal with.
 */

struct ir3_ra_ctx {
	struct ir3_block *block;
	enum shader_t type;
	bool frag_coord;
	bool frag_face;
	int cnt;
	bool error;
	struct {
		unsigned base;
		unsigned size;
	} arrays[MAX_ARRAYS];
};

#ifdef DEBUG
#  include "freedreno_util.h"
#  define ra_debug (fd_mesa_debug & FD_DBG_OPTMSGS)
#else
#  define ra_debug 0
#endif

#define ra_dump_list(msg, ir) do { \
		if (ra_debug) { \
			debug_printf("-- " msg); \
			ir3_print(ir); \
		} \
	} while (0)

#define ra_dump_instr(msg, n) do { \
		if (ra_debug) { \
			debug_printf(">> " msg); \
			ir3_print_instr(n); \
		} \
	} while (0)

#define ra_assert(ctx, x) do { \
		debug_assert(x); \
		if (!(x)) { \
			debug_printf("RA: failed assert: %s\n", #x); \
			(ctx)->error = true; \
		}; \
	} while (0)

/* Sorta ugly way to retrofit half-precision support: rather than passing
 * an extra param around, just OR in a high bit.  All the low-value
 * arithmetic (ie. +/- offset within a contiguous vec4, etc) will continue
 * to work as long as you don't underflow (and that would go badly anyways).
 */
#define REG_HALF  0x8000
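/* For illustration, assuming the usual regid() layout of ((reg << 2) | comp),
 * a register "name" below is just a scalar slot index, with REG_HALF
 * selecting the half-precision file:
 *
 *    name 0x0006              ->  r1.z   (full precision)
 *    name (REG_HALF | 0x0006) ->  hr1.z  (half precision)
 *
 * reg_assign() further down strips the bit with (name & ~REG_HALF) and sets
 * IR3_REG_HALF on the register when the bit was present.
 */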
#define REG(n, wm, f) (struct ir3_register){ \
		.flags  = (f), \
		.num    = (n), \
		.wrmask = TGSI_WRITEMASK_ ## wm, \
	}

/* check that the register exists, is a GPR and is not special (a0/p0) */
static struct ir3_register * reg_check(struct ir3_instruction *instr,
		unsigned n)
{
	if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) &&
			!(instr->regs[n]->flags & IR3_REG_SSA))
		return instr->regs[n];
	return NULL;
}

/* figure out if an unassigned src register points back to the instr we
 * are assigning:
 */
static bool instr_used_by(struct ir3_instruction *instr,
		struct ir3_register *src)
{
	struct ir3_instruction *src_instr = ssa(src);
	unsigned i;

	if (instr == src_instr)
		return true;

	if (src_instr && is_meta(src_instr))
		for (i = 1; i < src_instr->regs_count; i++)
			if (instr_used_by(instr, src_instr->regs[i]))
				return true;

	return false;
}

static bool instr_is_output(struct ir3_instruction *instr)
{
	struct ir3_block *block = instr->block;
	unsigned i;

	for (i = 0; i < block->noutputs; i++)
		if (instr == block->outputs[i])
			return true;

	return false;
}

static void mark_sources(struct ir3_instruction *instr,
		struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
{
	unsigned i;

	for (i = 1; i < n->regs_count; i++) {
		struct ir3_register *r = reg_check(n, i);
		if (r)
			regmask_set_if_not(liveregs, r, written);

		/* if any src points back to the instruction(s) in the
		 * block of neighbors that we are assigning, then mark
		 * any written (clobbered) registers as live:
		 */
		if (instr_used_by(instr, n->regs[i]))
			regmask_or(liveregs, liveregs, written);
	}
}

/* live means read before written */
static void compute_liveregs(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr, regmask_t *liveregs)
{
	struct ir3_block *block = ctx->block;
	regmask_t written;
	unsigned i;

	regmask_init(&written);

	list_for_each_entry (struct ir3_instruction, n, &instr->node, node) {
		struct ir3_register *r;

		if (is_meta(n))
			continue;

		/* check first src's read: */
		mark_sources(instr, n, liveregs, &written);

		/* for instructions that write to an array, we need to
		 * capture the dependency on the array elements:
		 */
		if (n->fanin)
			mark_sources(instr, n->fanin, liveregs, &written);

		/* meta-instructions don't actually get scheduled, so
		 * don't let their writes confuse us.. what we really
		 * care about is when the src to the meta instr was
		 * written:
		 */
		if (is_meta(n))
			continue;

		/* then dst written (if assigned already): */
		r = reg_check(n, 0);
		if (r) {
			/* if an instruction *is* an output, then it is live */
			if (!instr_is_output(n))
				regmask_set(&written, r);
		}
	}

	/* be sure to account for output registers too: */
	for (i = 0; i < block->noutputs; i++) {
		struct ir3_register *r;
		if (!block->outputs[i])
			continue;
		r = reg_check(block->outputs[i], 0);
		if (r)
			regmask_set_if_not(liveregs, r, &written);
	}

	/* if instruction is an output, we need a reg that isn't written
	 * before the end.. equiv to the instr_used_by() check above in
	 * the loop body
	 * TODO maybe should follow fanin/fanout?
	 */
	if (instr_is_output(instr))
		regmask_or(liveregs, liveregs, &written);
}
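/* A small worked example of the "read before written" rule above (made-up
 * instructions and register numbers), walking forward from the instruction
 * being assigned:
 *
 *    add.f r0.x, r2.x, c0.x   ; r2.x read before any later write -> live
 *    mov.f32f32 r1.x, r0.x    ; r0.x was written just above      -> not live
 *
 * Everything recorded in liveregs is then off-limits when find_available()
 * searches for a free contiguous block of names.
 */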
static int find_available(regmask_t *liveregs, int size, bool half)
{
	unsigned i;
	unsigned f = half ? IR3_REG_HALF : 0;

	for (i = 0; i < MAX_REG - size; i++) {
		if (!regmask_get(liveregs, &REG(i, X, f))) {
			unsigned start = i++;
			for (; (i < MAX_REG) && ((i - start) < size); i++)
				if (regmask_get(liveregs, &REG(i, X, f)))
					break;
			if ((i - start) >= size)
				return start;
		}
	}
	assert(0);
	return -1;
}

static int alloc_block(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr, int size)
{
	struct ir3_register *dst = instr->regs[0];
	struct ir3_instruction *n;
	regmask_t liveregs;
	unsigned name;

	/* should only ever be called w/ head of neighbor list: */
	debug_assert(!instr->cp.left);

	regmask_init(&liveregs);

	for (n = instr; n; n = n->cp.right)
		compute_liveregs(ctx, n, &liveregs);

	/* because we do assignment on fanout nodes for wrmask!=0x1, we
	 * need to handle this special case, where the fanout nodes all
	 * appear after one or more of the consumers of the src node:
	 *
	 *   0098:009: sam _, r2.x
	 *   0028:010: mul.f r3.z, r4.x, c13.x
	 *       ; we start assigning here for '0098:009: sam'.. but
	 *       ; would miss the usage at '0028:010: mul.f'
	 *   0101:009: _meta:fo _, _[0098:009: sam], off=2
	 */
	if (is_meta(instr) && (instr->opc == OPC_META_FO))
		compute_liveregs(ctx, instr->regs[1]->instr, &liveregs);

	name = find_available(&liveregs, size,
			!!(dst->flags & IR3_REG_HALF));

	if (dst->flags & IR3_REG_HALF)
		name |= REG_HALF;

	return name;
}

static type_t half_type(type_t type)
{
	switch (type) {
	case TYPE_F32: return TYPE_F16;
	case TYPE_U32: return TYPE_U16;
	case TYPE_S32: return TYPE_S16;
	/* instructions may already be fixed up: */
	case TYPE_F16:
	case TYPE_U16:
	case TYPE_S16:
		return type;
	default:
		assert(0);
		return ~0;
	}
}

/* some instructions need fix-up if dst register is half precision: */
static void fixup_half_instr_dst(struct ir3_instruction *instr)
{
	switch (instr->category) {
	case 1: /* move instructions */
		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
		break;
	case 3:
		switch (instr->opc) {
		case OPC_MAD_F32: instr->opc = OPC_MAD_F16; break;
		case OPC_SEL_B32: instr->opc = OPC_SEL_B16; break;
		case OPC_SEL_S32: instr->opc = OPC_SEL_S16; break;
		case OPC_SEL_F32: instr->opc = OPC_SEL_F16; break;
		case OPC_SAD_S32: instr->opc = OPC_SAD_S16; break;
		/* instructions may already be fixed up: */
		case OPC_MAD_F16:
		case OPC_SEL_B16:
		case OPC_SEL_S16:
		case OPC_SEL_F16:
		case OPC_SAD_S16:
			break;
		default:
			assert(0);
			break;
		}
		break;
	case 5:
		instr->cat5.type = half_type(instr->cat5.type);
		break;
	}
}

/* some instructions need fix-up if src register is half precision: */
static void fixup_half_instr_src(struct ir3_instruction *instr)
{
	switch (instr->category) {
	case 1: /* move instructions */
		instr->cat1.src_type = half_type(instr->cat1.src_type);
		break;
	}
}

static void reg_assign(struct ir3_instruction *instr,
		unsigned r, unsigned name)
{
	struct ir3_register *reg = instr->regs[r];

	reg->flags &= ~IR3_REG_SSA;
	reg->num = name & ~REG_HALF;

	if (name & REG_HALF) {
		reg->flags |= IR3_REG_HALF;
		/* if dst reg being assigned, patch up the instr: */
		if (reg == instr->regs[0])
			fixup_half_instr_dst(instr);
		else
			fixup_half_instr_src(instr);
	}
}

static void instr_assign(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr, unsigned name);

static void instr_assign_src(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr, unsigned r, unsigned name)
{
	struct ir3_register *reg = instr->regs[r];

	if (reg->flags & IR3_REG_RELATIV)
		name += reg->offset;

	reg_assign(instr, r, name);

	if (is_meta(instr)) {
		switch (instr->opc) {
		case OPC_META_INPUT:
			/* shader-input does not have a src, only block input: */
			debug_assert(instr->regs_count == 2);
			instr_assign(ctx, instr, name);
			return;
		case OPC_META_FO:
			instr_assign(ctx, instr, name + instr->fo.off);
			return;
		case OPC_META_FI:
			instr_assign(ctx, instr, name - (r - 1));
			return;
		default:
			break;
		}
	}
}
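/* Worked example of the meta-instruction cases above (illustrative numbers,
 * same name encoding as described near REG_HALF):
 *
 *  - fanin (collect): sources are handed consecutive names via the neighbor
 *    list, so if src r=3 ends up with name 6, mapping it back as
 *    6 - (3 - 1) = 4 recovers the same base name for the fanin itself no
 *    matter which source triggered the assignment.
 *
 *  - fanout (split): if the producer feeding a fanout with fo.off == 2 is
 *    assigned base name 4, the fanout's dst becomes 4 + 2 = 6, i.e. the
 *    third scalar of the producer's result.
 */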
static void instr_assign_srcs(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr, unsigned name)
{
	list_for_each_entry (struct ir3_instruction, n, &instr->node, node) {
		struct ir3_instruction *src;

		foreach_ssa_src_n(src, i, n) {
			unsigned r = i + 1;

			/* skip address / etc (non real sources): */
			if (r >= n->regs_count)
				continue;

			if (src == instr)
				instr_assign_src(ctx, n, r, name);
		}

		if (ctx->error)
			break;
	}
}

static void instr_assign(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr, unsigned name)
{
	struct ir3_register *reg = instr->regs[0];

	if (reg->flags & IR3_REG_RELATIV)
		return;

	/* check if already assigned: */
	if (!(reg->flags & IR3_REG_SSA)) {
		/* ... and if so, sanity check: */
		ra_assert(ctx, reg->num == (name & ~REG_HALF));
		return;
	}

	/* rename this instruction's dst register: */
	reg_assign(instr, 0, name);

	/* and rename any subsequent use of the result of this instr: */
	instr_assign_srcs(ctx, instr, name);

	/* To simplify the neighbor logic, and to "avoid" dealing with
	 * instructions which write more than one output, we actually
	 * do register assignment for instructions that produce multiple
	 * outputs on the fanout nodes and propagate up the assignment
	 * to the actual instruction:
	 */
	if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
		struct ir3_instruction *src;

		debug_assert(name >= instr->fo.off);

		foreach_ssa_src(src, instr)
			instr_assign(ctx, src, name - instr->fo.off);
	}
}

/* check neighbor list to see if it is already partially (or completely)
 * assigned, in which case the register block is already allocated and
 * we just need to complete the assignment:
 */
static int check_partial_assignment(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *n;
	int off = 0;

	debug_assert(!instr->cp.left);

	for (n = instr; n; n = n->cp.right) {
		struct ir3_register *dst = n->regs[0];
		if ((n->depth != DEPTH_UNUSED) &&
				!(dst->flags & IR3_REG_SSA)) {
			int name = dst->num - off;
			debug_assert(name >= 0);
			return name;
		}
		off++;
	}

	return -1;
}
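/* Illustration of the partial-assignment case (made-up numbers): for a
 * neighbor list of four fanouts where only the third one already has
 * dst->num == 6, off is 2 when we reach it, so the group's base name is
 * 6 - 2 = 4 and instr_alloc_and_assign() below hands out 4, 5, 6, 7.
 */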
/* allocate register name(s) for a list of neighboring instructions;
 * instr should point to leftmost neighbor (head of list)
 */
static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *n;
	struct ir3_register *dst;
	int name;

	debug_assert(!instr->cp.left);

	if (instr->regs_count == 0)
		return;

	dst = instr->regs[0];

	/* For indirect dst, take the register assignment from the
	 * fanin and propagate it forward.
	 */
	if (dst->flags & IR3_REG_RELATIV) {
		/* NOTE can be grouped, if for example outputs:
		 * for now disable cp if indirect writes
		 */
		instr_alloc_and_assign(ctx, instr->fanin);
		dst->num += instr->fanin->regs[0]->num;
		dst->flags &= ~IR3_REG_SSA;
		instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num);
		return;
	}

	/* for instructions w/ fanouts, do the actual register assignment
	 * on the group of fanout neighbor nodes and propagate the reg
	 * name back up to the texture instruction.
	 */
	if (dst->wrmask != 0x1)
		return;

	name = check_partial_assignment(ctx, instr);

	/* allocate register(s): */
	if (name >= 0) {
		/* already partially assigned, just finish the job */
	} else if (reg_gpr(dst)) {
		int size;
		/* number of consecutive registers to assign: */
		size = ir3_neighbor_count(instr);
		if (dst->wrmask != 0x1)
			size = MAX2(size, ffs(~dst->wrmask) - 1);
		name = alloc_block(ctx, instr, size);
	} else if (dst->flags & IR3_REG_ADDR) {
		debug_assert(!instr->cp.right);
		dst->flags &= ~IR3_REG_ADDR;
		name = regid(REG_A0, 0) | REG_HALF;
	} else {
		debug_assert(!instr->cp.right);
		/* predicate register (p0).. etc */
		name = regid(REG_P0, 0);
		debug_assert(dst->num == name);
	}

	ra_assert(ctx, name >= 0);

	for (n = instr; n && !ctx->error; n = n->cp.right) {
		instr_assign(ctx, n, name);
		name++;
	}
}

static void instr_assign_array(struct ir3_ra_ctx *ctx,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int name, aid = instr->fi.aid;

	if (ctx->arrays[aid].base == ~0) {
		int size = instr->regs_count - 1;
		ctx->arrays[aid].base = alloc_block(ctx, instr, size);
		ctx->arrays[aid].size = size;
	}

	name = ctx->arrays[aid].base;

	foreach_ssa_src_n(src, i, instr) {
		unsigned r = i + 1;

		/* skip address / etc (non real sources): */
		if (r >= instr->regs_count)
			break;

		instr_assign(ctx, src, name);
		name++;
	}
}

static bool block_ra(struct ir3_block *block, void *state)
{
	struct ir3_ra_ctx *ctx = state;

	ra_dump_list("-------\n", block->shader);

	/* first pass, assign arrays: */
	list_for_each_entry (struct ir3_instruction, n, &block->instr_list, node) {
		if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) {
			debug_assert(!n->cp.left);  /* don't think this should happen */

			ra_dump_instr("ASSIGN ARRAY: ", n);
			instr_assign_array(ctx, n);
			ra_dump_list("-------\n", block->shader);
		}

		if (ctx->error)
			return false;
	}

	list_for_each_entry (struct ir3_instruction, n, &block->instr_list, node) {
		ra_dump_instr("ASSIGN: ", n);
		instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
		ra_dump_list("-------\n", block->shader);
		if (ctx->error)
			return false;
	}

	return true;
}

static int shader_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	/* frag shader inputs get pre-assigned, since we have some
	 * constraints/unknowns about setup for some of these regs:
	 */
	if (ctx->type == SHADER_FRAGMENT) {
		unsigned i = 0, j;
		if (ctx->frag_face && (i < block->ninputs) && block->inputs[i]) {
			/* if we have frag_face, it gets hr0.x */
			instr_assign(ctx, block->inputs[i], REG_HALF | 0);
			i += 4;
		}
		for (j = 0; i < block->ninputs; i++, j++)
			if (block->inputs[i])
				instr_assign(ctx, block->inputs[i], j);
	}

	block_ra(block, ctx);

	return ctx->error ? -1 : 0;
}

static bool block_mark_dst(struct ir3_block *block, void *state)
{
	list_for_each_entry (struct ir3_instruction, n, &block->instr_list, node)
		if (n->regs_count > 0)
			n->regs[0]->flags |= IR3_REG_SSA;
	return true;
}

int ir3_block_ra(struct ir3_block *block, enum shader_t type,
		bool frag_coord, bool frag_face)
{
	struct ir3_ra_ctx ctx = {
			.block = block,
			.type = type,
			.frag_coord = frag_coord,
			.frag_face = frag_face,
	};
	int ret;

	memset(&ctx.arrays, ~0, sizeof(ctx.arrays));

	/* mark dst registers w/ SSA flag so we can see which
	 * have been assigned so far:
	 * NOTE: we really should set SSA flag consistently on
	 * every dst register in the frontend.
	 */
	block_mark_dst(block, &ctx);

	ir3_clear_mark(block->shader);
	ret = shader_ra(&ctx, block);

	return ret;
}
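/* Sketch of a call site (hypothetical variable names; the real callers live
 * elsewhere in the ir3 compiler):
 *
 *    if (ir3_block_ra(block, SHADER_FRAGMENT, has_frag_coord, has_frag_face))
 *        return -1;   // RA hit a failed ra_assert() during assignment
 *
 * The pass returns 0 on success and -1 when ctx->error got set.
 */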