freedreno/ir3: split out delay helpers

We're going to want these also for a post-RA sched pass. And also to split nop stuffing out into its own pass. Signed-off-by: Rob Clark <robdclark@chromium.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3569>
2019-12-18 11:10:12 -08:00 · 2019-12-18 11:10:12 -08:00 · c803c662f9
parent 54c795f829
commit c803c662f9
5 changed files with 350 additions and 183 deletions
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@ -1113,10 +1113,16 @@ static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
 void ir3_print(struct ir3 *ir);
 void ir3_print_instr(struct ir3_instruction *instr);
-/* depth calculation: */
+/* delay calculation: */
 struct ir3_shader_variant;
 int ir3_delayslots(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n);
 unsigned ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
 		unsigned maxd, bool pred);
 unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 		bool soft, bool pred);
 /* depth calculation: */
 struct ir3_shader_variant;
 void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
 void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so);
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@ -0,0 +1,337 @@
 /*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
 #include "ir3.h"
 /*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */
 /* Decide whether dependency #n of the consumer can be skipped when
 * computing delay slots.  False dependencies (eg. barriers or SSBO
 * stores) normally do not count.  The one exception is an array write
 * feeding a consumer that reads the same array: that one must be
 * honored.
 */
 static bool
 ignore_dep(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n)
 {
 	struct ir3_register *written, *read;
 	/* anything that is not a false dependency always counts: */
 	if (!__is_false_dep(consumer, n))
 		return false;
 	/* a false dep on something other than an array write is ignorable: */
 	if (!(assigner->barrier_class & IR3_BARRIER_ARRAY_W))
 		return true;
 	written = assigner->regs[0];
 	debug_assert(written->flags & IR3_REG_ARRAY);
 	/* only keep the dependency if the consumer reads the written array: */
 	foreach_src (read, consumer) {
 		if (!(read->flags & IR3_REG_ARRAY))
 			continue;
 		if (read->array.id == written->array.id)
 			return false;
 	}
 	return true;
 }
 /* Number of delay slots required between the instruction producing a
 * value (assigner) and the instruction using it as src #n (consumer).
 *
 * Worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch are
 * handled with sync bits (so need no delay slots here).
 */
 int
 ir3_delayslots(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n)
 {
 	if (ignore_dep(assigner, consumer, n))
 		return 0;
 	/* meta instructions on either side need no delay: */
 	if (is_meta(assigner) || is_meta(consumer))
 		return 0;
 	if (writes_addr(assigner))
 		return 6;
 	/* these producers are handled via sync flags: */
 	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
 		return 0;
 	/* from here on the assigner must be alu: */
 	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
 			is_mem(consumer))
 		return 6;
 	/* special case, 3rd src to cat3 not required on first cycle: */
 	if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 3))
 		return 1;
 	return 3;
 }
 /* Does this instruction count as a slot when measuring distances/delays?
 * ALU instructions do, and so do flow instructions other than jump/br.
 */
 static bool
 count_instruction(struct ir3_instruction *n)
 {
 	if (is_alu(n))
 		return true;
 	if (!is_flow(n))
 		return false;
 	/* NOTE: don't count branch/jump since we don't know yet if they will
 	 * be eliminated later in resolve_jumps().. really should do that
 	 * earlier so we don't have this constraint.
 	 */
 	return (n->opc != OPC_JUMP) && (n->opc != OPC_BR);
 }
 /**
 * Count the instruction slots between the end of @block and @instr,
 * walking the block's instruction list backwards.  The result is clamped
 * to @maxd.
 *
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 *
 * Returns the distance in counted slots, never more than @maxd.
 */
 unsigned
 ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
 		unsigned maxd, bool pred)
 {
 	unsigned d = 0;
 	/* Note that this relies on incrementally building up the block's
 	 * instruction list.. but this is how scheduling and nopsched
 	 * work.
 	 */
 	foreach_instr_rev (n, &block->instr_list) {
 		if ((n == instr) || (d >= maxd))
 			return MIN2(maxd, d + n->nop);
 		/* each counted instruction occupies one slot plus its repeat
 		 * and nop counts:
 		 */
 		if (count_instruction(n))
 			d = MIN2(maxd, d + 1 + n->repeat + n->nop);
 	}
 	/* if coming from a predecessor block, assume it is assigned far
 	 * enough away.. we'll fix up later.
 	 */
 	if (!pred)
 		return maxd;
 	/* block->data == block means we are already visiting this block
 	 * further up the call chain (ie. a loop), so don't recurse again:
 	 */
 	if (block->data != block) {
 		/* Search into predecessor blocks, finding the one with the
 		 * shortest distance, since that will be the worst case
 		 */
 		unsigned min = maxd - d;
 		/* (ab)use block->data to prevent recursion: */
 		block->data = block;
 		set_foreach (block->predecessors, entry) {
 			/* named pred_block so it does not shadow the bool 'pred'
 			 * parameter; we only get here when 'pred' is true, so
 			 * pass that on explicitly:
 			 */
 			struct ir3_block *pred_block = (struct ir3_block *)entry->key;
 			unsigned n = ir3_distance(pred_block, instr, min, true);
 			min = MIN2(min, n);
 		}
 		block->data = NULL;
 		d += min;
 	}
 	return d;
 }
 /* Remaining delay needed before the consumer can use src #srcn produced
 * by the assigner, given what is already in the block.  Meta assigners
 * are looked through: the result is the max over their own sources.
 */
 static unsigned
 delay_calc_srcn(struct ir3_block *block,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer,
 		unsigned srcn, bool soft, bool pred)
 {
 	struct ir3_register *src;
 	unsigned worst = 0;
 	if (!is_meta(assigner)) {
 		unsigned needed;
 		/* in "soft" mode an sfu producer is given a fixed delay of 4,
 		 * everything else uses the required delay slots:
 		 */
 		if (soft && is_sfu(assigner))
 			needed = 4;
 		else
 			needed = ir3_delayslots(assigner, consumer, srcn);
 		/* subtract the slots already between assigner and block end: */
 		return needed - ir3_distance(block, assigner, needed, pred);
 	}
 	foreach_src (src, assigner) {
 		unsigned d;
 		/* sources without a producing instruction add no delay: */
 		if (!src->instr)
 			continue;
 		d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
 		worst = MAX2(worst, d);
 	}
 	return worst;
 }
 /* Walk the block backwards looking for the most recent instruction whose
 * dst register has the given array id.  Gives up (returns NULL) once maxd
 * counted instructions have been passed, or if the block has no match.
 */
 static struct ir3_instruction *
 find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
 {
 	unsigned seen = 0;
 	/* Note that this relies on incrementally building up the block's
 	 * instruction list.. but this is how scheduling and nopsched
 	 * work.
 	 */
 	foreach_instr_rev (n, &block->instr_list) {
 		if (seen >= maxd)
 			return NULL;
 		if (count_instruction(n))
 			seen++;
 		/* only instructions with a dst can be a write; note that a
 		 * dest reg will never be an immediate
 		 */
 		if ((dest_regs(n) != 0) && (n->regs[0]->array.id == array_id))
 			return n;
 	}
 	return NULL;
 }
 /* Number of instructions in the block that count towards delay
 * determination (like list_length(), but filtered by
 * count_instruction()):
 */
 static unsigned
 count_block_delay(struct ir3_block *block)
 {
 	unsigned total = 0;
 	foreach_instr (n, &block->instr_list) {
 		if (count_instruction(n))
 			total++;
 	}
 	return total;
 }
 /* Delay needed before the consumer can read (as src #srcn) from the array
 * identified by array_id, based on the most recent write to that array.
 *
 * The write is searched for within the last maxd counted instructions of
 * the block.  If it is not found and pred is true, the search continues
 * into predecessor blocks with the window shrunk by this block's counted
 * length.
 */
 static unsigned
 delay_calc_array(struct ir3_block *block, unsigned array_id,
 		struct ir3_instruction *consumer, unsigned srcn,
 		bool soft, bool pred, unsigned maxd)
 {
 	struct ir3_instruction *assigner;
 	assigner = find_array_write(block, array_id, maxd);
 	if (assigner)
 		return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
 	if (!pred)
 		return 0;
 	/* this block alone already covers the whole search window: */
 	unsigned len = count_block_delay(block);
 	if (maxd <= len)
 		return 0;
 	maxd -= len;
 	if (block->data == block) {
 		/* we have a loop, return worst case: */
 		return maxd;
 	}
 	/* If we need to search into predecessors, find the one with the
 	 * max delay.. the resulting delay is that minus the number of
 	 * counted instructions in this block:
 	 */
 	unsigned max = 0;
 	/* (ab)use block->data to prevent recursion: */
 	block->data = block;
 	set_foreach (block->predecessors, entry) {
 		/* named pred_block so it does not shadow the bool 'pred'
 		 * parameter; 'pred' is known to be true at this point, so
 		 * pass that on explicitly:
 		 */
 		struct ir3_block *pred_block = (struct ir3_block *)entry->key;
 		unsigned delay =
 			delay_calc_array(pred_block, array_id, consumer, srcn, soft, true, maxd);
 		max = MAX2(max, delay);
 	}
 	block->data = NULL;
 	if (max < len)
 		return 0;
 	return max - len;
 }
 /**
 * Calculate delay for instruction: the maximum of the delays required
 * by each of its srcs and by its address instruction (if any).
 *
 * @soft:  If true, add additional delay for situations where they
 *    would not be strictly required because a sync flag would be
 *    used (but scheduler would prefer to schedule some other
 *    instructions first to avoid stalling on sync flag)
 * @pred:  If true, recurse into predecessor blocks
 */
 unsigned
 ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 		bool soft, bool pred)
 {
 	struct ir3_register *src;
 	unsigned worst = 0;
 	foreach_src_n (src, i, instr) {
 		unsigned d;
 		/* relative (non-const) srcs are tracked by array id rather
 		 * than by producing instruction; srcs are numbered from 1:
 		 */
 		if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST))
 			d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
 		else if (src->instr)
 			d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
 		else
 			d = 0;
 		worst = MAX2(worst, d);
 	}
 	/* the address instruction is treated as src #0: */
 	if (instr->address) {
 		unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
 		worst = MAX2(worst, d);
 	}
 	return worst;
 }
--- a/src/freedreno/ir3/ir3_depth.c
+++ b/src/freedreno/ir3/ir3_depth.c
@ -48,72 +48,6 @@
 * blocks depth sorted list, which is used by the scheduling pass.
 */
 /* generally don't count false dependencies, since this can just be
 * something like a barrier, or SSBO store.  The exception is array
 * dependencies if the assigner is an array write and the consumer
 * reads the same array.
 *
 * Returns true if dependency #n of the consumer on the assigner can be
 * ignored for delay slot purposes.
 */
 static bool
 ignore_dep(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n)
 {
 	/* real (non-false) dependencies are never ignored: */
 	if (!__is_false_dep(consumer, n))
 		return false;
 	if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
 		struct ir3_register *dst = assigner->regs[0];
 		struct ir3_register *src;
 		debug_assert(dst->flags & IR3_REG_ARRAY);
 		/* keep the dependency if any src reads the array being written: */
 		foreach_src(src, consumer) {
 			if ((src->flags & IR3_REG_ARRAY) &&
 					(dst->array.id == src->array.id)) {
 				return false;
 			}
 		}
 	}
 	return true;
 }
 /* calculate required # of delay slots between the instruction that
 * assigns a value and the one that consumes
 *
 * 'n' is the consumer's src index for this dependency; returns 0 when
 * no delay slots are needed.
 */
 int ir3_delayslots(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n)
 {
 	if (ignore_dep(assigner, consumer, n))
 		return 0;
 	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
 	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
 	 * handled with sync bits
 	 */
 	if (is_meta(assigner) || is_meta(consumer))
 		return 0;
 	if (writes_addr(assigner))
 		return 6;
 	/* handled via sync flags: */
 	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
 		return 0;
 	/* assigner must be alu: */
 	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
 			is_mem(consumer)) {
 		return 6;
 	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
 			(n == 3)) {
 		/* special case, 3rd src to cat3 not required on first cycle */
 		return 1;
 	} else {
 		return 3;
 	}
 }
 void
 ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
 {
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@ -265,117 +265,6 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 	return d;
 }
 /**
 * Count the instruction slots between the end of @block and @instr.
 *
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
 static unsigned
 distance(struct ir3_block *block, struct ir3_instruction *instr,
 		unsigned maxd, bool pred)
 {
 	unsigned d = 0;
 	foreach_instr_rev (n, &block->instr_list) {
 		if ((n == instr) || (d >= maxd))
 			return d;
 		/* NOTE: don't count branch/jump since we don't know yet if they will
 		 * be eliminated later in resolve_jumps().. really should do that
 		 * earlier so we don't have this constraint.
 		 */
 		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
 			d++;
 	}
 	/* if coming from a predecessor block, assume it is assigned far
 	 * enough away.. we'll fix up later.
 	 */
 	if (!pred)
 		return maxd;
 	if (pred && (block->data != block)) {
 		/* Search into predecessor blocks, finding the one with the
 		 * shortest distance, since that will be the worst case
 		 */
 		unsigned min = maxd - d;
 		/* (ab)use block->data to prevent recursion: */
 		block->data = block;
 		set_foreach(block->predecessors, entry) {
 			/* NOTE(review): this local shadows the bool 'pred' parameter,
 			 * so the recursive call below passes the block pointer as the
 			 * bool argument.
 			 */
 			struct ir3_block *pred = (struct ir3_block *)entry->key;
 			unsigned n;
 			n = distance(pred, instr, min, pred);
 			min = MIN2(min, n);
 		}
 		block->data = NULL;
 		d += min;
 	}
 	return d;
 }
 /* calculate delay for specified src: the delay required between assigner
 * and consumer, minus the distance already separating them in the block.
 */
 static unsigned
 delay_calc_srcn(struct ir3_block *block,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer,
 		unsigned srcn, bool soft, bool pred)
 {
 	unsigned delay = 0;
 	if (is_meta(assigner)) {
 		/* look through meta instructions: take the max over their srcs */
 		struct ir3_instruction *src;
 		foreach_ssa_src(src, assigner) {
 			unsigned d;
 			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
 			delay = MAX2(delay, d);
 		}
 	} else {
 		if (soft) {
 			/* in soft mode an sfu assigner gets a fixed delay of 4: */
 			if (is_sfu(assigner)) {
 				delay = 4;
 			} else {
 				delay = ir3_delayslots(assigner, consumer, srcn);
 			}
 		} else {
 			delay = ir3_delayslots(assigner, consumer, srcn);
 		}
 		delay -= distance(block, assigner, delay, pred);
 	}
 	return delay;
 }
 /* calculate delay for instruction (maximum of delay for all srcs):
 *
 * 'soft' and 'pred' are passed through unchanged to delay_calc_srcn().
 */
 static unsigned
 delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 		bool soft, bool pred)
 {
 	unsigned delay = 0;
 	struct ir3_instruction *src;
 	foreach_ssa_src_n(src, i, instr) {
 		unsigned d;
 		d = delay_calc_srcn(block, src, instr, i, soft, pred);
 		delay = MAX2(delay, d);
 	}
 	return delay;
 }
 struct ir3_sched_notes {
 	/* there is at least one kill which could be scheduled, except
 	 * for unscheduled bary.f's:
@ -658,7 +547,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 				continue;
 		}
-		int rank = delay_calc(ctx->block, candidate, soft, false);
+		int rank = ir3_delay_calc(ctx->block, candidate, soft, false);
 		/* if too many live values, prioritize instructions that reduce the
 		 * number of live values:
@ -827,7 +716,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 			instr = find_eligible_instr(ctx, &notes, false);
 		if (instr) {
-			unsigned delay = delay_calc(ctx->block, instr, false, false);
+			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
 			d("delay=%u", delay);
@ -886,7 +775,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		debug_assert(ctx->pred);
 		debug_assert(block->condition);
-		delay -= distance(ctx->block, ctx->pred, delay, false);
+		delay -= ir3_distance(ctx->block, ctx->pred, delay, false);
 		while (delay > 0) {
 			ir3_NOP(block);
@ -944,7 +833,7 @@ sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		set_foreach(block->predecessors, entry) {
 			struct ir3_block *pred = (struct ir3_block *)entry->key;
-			unsigned d = delay_calc(pred, instr, false, true);
+			unsigned d = ir3_delay_calc(pred, instr, false, true);
 			delay = MAX2(d, delay);
 		}
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@ -54,6 +54,7 @@ libfreedreno_ir3_files = files(
  'ir3_context.c',
  'ir3_context.h',
  'ir3_cp.c',
  'ir3_delay.c',
  'ir3_depth.c',
  'ir3_group.c',
  'ir3_image.c',