mesa/src/gallium/drivers/r600/sb/sb_pass.h

/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#ifndef SB_PASS_H_
#define SB_PASS_H_

#include <stack>

namespace r600_sb {

class pass {
protected:
	sb_context &ctx;
	shader &sh;

public:
	pass(shader &s);

	virtual int run();

	virtual ~pass() {}
};

class vpass : public pass {

public:

	vpass(shader &s) : pass(s) {}

	virtual int init();
	virtual int done();

	virtual int run();
	virtual void run_on(container_node &n);

	virtual bool visit(node &n, bool enter);
	virtual bool visit(container_node &n, bool enter);
	virtual bool visit(alu_group_node &n, bool enter);
	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(alu_packed_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);
	virtual bool visit(region_node &n, bool enter);
	virtual bool visit(repeat_node &n, bool enter);
	virtual bool visit(depart_node &n, bool enter);
	virtual bool visit(if_node &n, bool enter);
	virtual bool visit(bb_node &n, bool enter);

};

class rev_vpass : public vpass {

public:
	rev_vpass(shader &s) : vpass(s) {}

	virtual void run_on(container_node &n);
};


// =================== PASSES

class bytecode;

class bc_dump : public vpass {
	using vpass::visit;

	uint32_t *bc_data;
	unsigned ndw;

	unsigned id;

	unsigned new_group, group_index;

public:

	bc_dump(shader &s, bytecode *bc = NULL);

	bc_dump(shader &s, uint32_t *bc_ptr, unsigned ndw) :
		vpass(s), bc_data(bc_ptr), ndw(ndw), id(), new_group(), group_index() {}

	virtual int init();
	virtual int done();

	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);

	void dump_dw(unsigned dw_id, unsigned count = 2);

	void dump(cf_node& n);
	void dump(alu_node& n);
	void dump(fetch_node& n);
};


class dce_cleanup : public vpass {
	using vpass::visit;

	bool remove_unused;

public:

	dce_cleanup(shader &s) : vpass(s),
		remove_unused(s.dce_flags & DF_REMOVE_UNUSED), nodes_changed(false) {}

	virtual int run();

	virtual bool visit(node &n, bool enter);
	virtual bool visit(alu_group_node &n, bool enter);
	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(alu_packed_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);
	virtual bool visit(region_node &n, bool enter);
	virtual bool visit(container_node &n, bool enter);

private:

	void cleanup_dst(node &n);
	bool cleanup_dst_vec(vvec &vv);

	// Did we alter/remove nodes during a single pass?
	bool nodes_changed;
};


class def_use : public pass {

public:

	def_use(shader &sh) : pass(sh) {}

	virtual int run();
	void run_on(node *n, bool defs);

private:

	void process_uses(node *n);
	void process_defs(node *n, vvec &vv, bool arr_def);
	void process_phi(container_node *c, bool defs, bool uses);
};


class dump : public vpass {
	using vpass::visit;

	int level;

public:

	dump(shader &s) : vpass(s), level(0) {}

	virtual bool visit(node &n, bool enter);
	virtual bool visit(container_node &n, bool enter);
	virtual bool visit(alu_group_node &n, bool enter);
	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(alu_packed_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);
	virtual bool visit(region_node &n, bool enter);
	virtual bool visit(repeat_node &n, bool enter);
	virtual bool visit(depart_node &n, bool enter);
	virtual bool visit(if_node &n, bool enter);
	virtual bool visit(bb_node &n, bool enter);


	static void dump_op(node &n, const char *name);
	static void dump_vec(const vvec & vv);
	static void dump_set(shader &sh, val_set & v);

	static void dump_rels(vvec & vv);

	static void dump_val(value *v);
	static void dump_op(node *n);

	static void dump_op_list(container_node *c);
	static void dump_queue(sched_queue &q);

	static void dump_alu(alu_node *n);

private:

	void indent();

	void dump_common(node &n);
	void dump_flags(node &n);

	void dump_live_values(container_node &n, bool before);
};


// Global Code Motion

class gcm : public pass {

	sched_queue bu_ready[SQ_NUM];
	sched_queue bu_ready_next[SQ_NUM];
	sched_queue bu_ready_early[SQ_NUM];
	sched_queue ready;
	sched_queue ready_above;

	unsigned outstanding_lds_oq;
	container_node pending;

	struct op_info {
		bb_node* top_bb;
		bb_node* bottom_bb;
		op_info() : top_bb(), bottom_bb() {}
	};

	typedef std::map<node*, op_info> op_info_map;

	typedef std::map<node*, unsigned> nuc_map;

	op_info_map op_map;
	nuc_map uses;

	typedef std::vector<nuc_map> nuc_stack;

	nuc_stack nuc_stk;
	unsigned ucs_level;

	bb_node * bu_bb;

	vvec pending_defs;

	node_list pending_nodes;

	unsigned cur_sq;

	// for register pressure tracking in bottom-up pass
	val_set live;
	int live_count;

	static const int rp_threshold = 100;

	bool pending_exec_mask_update;

public:

	gcm(shader &sh) : pass(sh),
		bu_ready(), bu_ready_next(), bu_ready_early(),
		ready(), outstanding_lds_oq(),
		op_map(), uses(), nuc_stk(1), ucs_level(),
		bu_bb(), pending_defs(), pending_nodes(), cur_sq(),
		live(), live_count(), pending_exec_mask_update() {}

	virtual int run();

private:

	void collect_instructions(container_node *c, bool early_pass);

	void sched_early(container_node *n);
	void td_sched_bb(bb_node *bb);
	bool td_is_ready(node *n);
	void td_release_uses(vvec &v);
	void td_release_val(value *v);
	void td_schedule(bb_node *bb, node *n);

	void sched_late(container_node *n);
	void bu_sched_bb(bb_node *bb);
	void bu_release_defs(vvec &v, bool src);
	void bu_release_phi_defs(container_node *p, unsigned op);
	bool bu_is_ready(node *n);
	void bu_release_val(value *v);
	void bu_release_op(node * n);
	void bu_find_best_bb(node *n, op_info &oi);
	void bu_schedule(container_node *bb, node *n);

	void push_uc_stack();
	void pop_uc_stack();

	void init_def_count(nuc_map &m, container_node &s);
	void init_use_count(nuc_map &m, container_node &s);
	unsigned get_uc_vec(vvec &vv);
	unsigned get_dc_vec(vvec &vv, bool src);

	void add_ready(node *n);

	void dump_uc_stack();

	unsigned real_alu_count(sched_queue &q, unsigned max);

	// check if we have not less than threshold ready alu instructions
	bool check_alu_ready_count(unsigned threshold);
};


class gvn : public vpass {
	using vpass::visit;

public:

	gvn(shader &sh) : vpass(sh) {}

	virtual bool visit(node &n, bool enter);
	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(alu_packed_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);
	virtual bool visit(region_node &n, bool enter);

private:

	void process_op(node &n, bool rewrite = true);

	// returns true if the value was rewritten
	bool process_src(value* &v, bool rewrite);


	void process_alu_src_constants(node &n, value* &v);
};


class if_conversion : public pass {

public:

	if_conversion(shader &sh) : pass(sh) {}

	virtual int run();

	bool run_on(region_node *r);

	void convert_kill_instructions(region_node *r, value *em, bool branch,
	                               container_node *c);

	bool check_and_convert(region_node *r);

	alu_node* convert_phi(value *select, node *phi);

};


class liveness : public rev_vpass {
	using vpass::visit;

	val_set live;
	bool live_changed;

public:

	liveness(shader &s) : rev_vpass(s), live_changed(false) {}

	virtual int init();

	virtual bool visit(node &n, bool enter);
	virtual bool visit(bb_node &n, bool enter);
	virtual bool visit(container_node &n, bool enter);
	virtual bool visit(alu_group_node &n, bool enter);
	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(alu_packed_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);
	virtual bool visit(region_node &n, bool enter);
	virtual bool visit(repeat_node &n, bool enter);
	virtual bool visit(depart_node &n, bool enter);
	virtual bool visit(if_node &n, bool enter);

private:

	void update_interferences();
	void process_op(node &n);

	bool remove_val(value *v);
	bool remove_vec(vvec &v);
	bool process_outs(node& n);
	void process_ins(node& n);

	void process_phi_outs(container_node *phi);
	void process_phi_branch(container_node *phi, unsigned id);

	bool process_maydef(value *v);

	bool add_vec(vvec &vv, bool src);

	void update_src_vec(vvec &vv, bool src);
};


struct bool_op_info {
	bool invert;
	unsigned int_cvt;

	alu_node *n;
};

class peephole : public pass {

public:

	peephole(shader &sh) : pass(sh) {}

	virtual int run();

	void run_on(container_node *c);

	void optimize_cc_op(alu_node *a);

	void optimize_cc_op2(alu_node *a);
	void optimize_CNDcc_op(alu_node *a);

	bool get_bool_op_info(value *b, bool_op_info& bop);
	bool get_bool_flt_to_int_source(alu_node* &a);
	void convert_float_setcc(alu_node *f2i, alu_node *s);
};


class psi_ops : public rev_vpass {
	using rev_vpass::visit;

public:

	psi_ops(shader &s) : rev_vpass(s) {}

	virtual bool visit(node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);

	bool try_inline(node &n);
	bool try_reduce(node &n);
	bool eliminate(node &n);

	void unpredicate(node *n);
};


// check correctness of the generated code, e.g.:
// - expected source operand value is the last value written to its gpr,
// - all arguments of phi node should be allocated to the same gpr,
// TODO other tests
class ra_checker : public pass {

	typedef std::map<sel_chan, value *> reg_value_map;

	typedef std::vector<reg_value_map> regmap_stack;

	regmap_stack rm_stack;
	unsigned rm_stk_level;

	value* prev_dst[5];

public:

	ra_checker(shader &sh) : pass(sh), rm_stk_level(0), prev_dst() {}

	virtual int run();

	void run_on(container_node *c);

	void dump_error(const error_info &e);
	void dump_all_errors();

private:

	reg_value_map& rmap() { return rm_stack[rm_stk_level]; }

	void push_stack();
	void pop_stack();

	// when going out of the alu clause, values in the clause temporary gprs,
	// AR, predicate values, PS/PV are destroyed
	void kill_alu_only_regs();
	void error(node *n, unsigned id, std::string msg);

	void check_phi_src(container_node *p, unsigned id);
	void process_phi_dst(container_node *p);
	void check_alu_group(alu_group_node *g);
	void process_op_dst(node *n);
	void check_op_src(node *n);
	void check_src_vec(node *n, unsigned id, vvec &vv, bool src);
	void check_value_gpr(node *n, unsigned id, value *v);
};

// =======================================


class ra_coalesce : public pass {

public:

	ra_coalesce(shader &sh) : pass(sh) {}

	virtual int run();
};


// =======================================

class ra_init : public pass {

public:

	ra_init(shader &sh) : pass(sh), prev_chans() {

		// The parameter below affects register channels distribution.
		// For cayman (VLIW-4) we're trying to distribute the channels
		// uniformly, this means significantly better alu slots utilization
		// at the expense of higher gpr usage. Hopefully this will improve
		// performance, though it has to be proven with real benchmarks yet.
		// For VLIW-5 this method could also slightly improve slots
		// utilization, but increased register pressure seems more significant
		// and overall performance effect is negative according to some
		// benchmarks, so it's not used currently. Basically, VLIW-5 doesn't
		// really need it because trans slot (unrestricted by register write
		// channel) allows to consume most deviations from uniform channel
		// distribution.
		// Value 3 means that for new allocation we'll use channel that differs
		// from 3 last used channels. 0 for VLIW-5 effectively turns this off.

		ra_tune = sh.get_ctx().is_cayman() ? 3 : 0;
	}

	virtual int run();

private:

	unsigned prev_chans;
	unsigned ra_tune;

	void add_prev_chan(unsigned chan);
	unsigned get_preferable_chan_mask();

	bool ra_node(container_node *c);
	bool process_op(node *n);

	bool color(value *v);

	void color_bs_constraint(ra_constraint *c);

	void assign_color(value *v, sel_chan c);
	void alloc_arrays();
};

// =======================================

class ra_split : public pass {

public:

	ra_split(shader &sh) : pass(sh) {}

	virtual int run();

	void split(container_node *n);
	void split_op(node *n);
	void split_alu_packed(alu_packed_node *n);
	void split_vector_inst(node *n);

	void split_packed_ins(alu_packed_node *n);

#if 0
	void split_pinned_outs(node *n);
#endif

	void split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz);

	void split_phi_src(container_node *loc, container_node *c, unsigned id,
	                   bool loop);
	void split_phi_dst(node *loc, container_node *c, bool loop);
	void init_phi_constraints(container_node *c);
};


class ssa_prepare : public vpass {
	using vpass::visit;

	typedef std::vector<val_set> vd_stk;
	vd_stk stk;

	unsigned level;

public:
	ssa_prepare(shader &s) : vpass(s), level(0) {}

	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);
	virtual bool visit(region_node &n, bool enter);
	virtual bool visit(repeat_node &n, bool enter);
	virtual bool visit(depart_node &n, bool enter);

private:

	void push_stk() {
		++level;
		if (level + 1 > stk.size())
			stk.resize(level+1);
		else
			stk[level].clear();
	}
	void pop_stk() {
		assert(level);
		--level;
		stk[level].add_set(stk[level + 1]);
	}

	void add_defs(node &n);

	val_set & cur_set() { return stk[level]; }

	container_node* create_phi_nodes(int count);
};

class ssa_rename : public vpass {
	using vpass::visit;

	typedef sb_map<value*, unsigned> def_map;

	def_map def_count;
	def_map lds_oq_count;
	def_map lds_rw_count;
	std::stack<def_map> rename_stack;
	std::stack<def_map> rename_lds_oq_stack;
	std::stack<def_map> rename_lds_rw_stack;

	typedef std::map<uint32_t, value*> val_map;
	val_map values;

public:

	ssa_rename(shader &s) : vpass(s) {}

	virtual int init();

	virtual bool visit(container_node &n, bool enter);
	virtual bool visit(node &n, bool enter);
	virtual bool visit(alu_group_node &n, bool enter);
	virtual bool visit(cf_node &n, bool enter);
	virtual bool visit(alu_node &n, bool enter);
	virtual bool visit(alu_packed_node &n, bool enter);
	virtual bool visit(fetch_node &n, bool enter);
	virtual bool visit(region_node &n, bool enter);
	virtual bool visit(repeat_node &n, bool enter);
	virtual bool visit(depart_node &n, bool enter);
	virtual bool visit(if_node &n, bool enter);

private:

	void push(node *phi);
	void pop();

	unsigned get_index(def_map& m, value* v);
	void set_index(def_map& m, value* v, unsigned index);
	unsigned new_index(def_map& m, value* v);

	value* rename_use(node *n, value* v);
	value* rename_def(node *def, value* v);

	void rename_src_vec(node *n, vvec &vv, bool src);
	void rename_dst_vec(node *def, vvec &vv, bool set_def);

	void rename_src(node *n);
	void rename_dst(node *n);

	void rename_phi_args(container_node *phi, unsigned op, bool def);

	void rename_virt(node *n);
	void rename_virt_val(node *n, value *v);
};

class bc_finalizer : public pass {

	cf_node *last_export[EXP_TYPE_COUNT];
	cf_node *last_cf;

	unsigned ngpr;
	unsigned nstack;

public:

	bc_finalizer(shader &sh) : pass(sh), last_export(), last_cf(), ngpr(),
		nstack() {}

	virtual int run();

	void finalize_loop(region_node *r);
	void finalize_if(region_node *r);

	void run_on(container_node *c);

	void insert_rv6xx_load_ar_workaround(alu_group_node *b4);
	void finalize_alu_group(alu_group_node *g, node *prev_node);
	bool finalize_alu_src(alu_group_node *g, alu_node *a, alu_group_node *prev_node);

	void emit_set_grad(fetch_node* f);
	void finalize_fetch(fetch_node *f);

	void finalize_cf(cf_node *c);

	sel_chan translate_kcache(cf_node *alu, value *v);

	void update_ngpr(unsigned gpr);
	void update_nstack(region_node *r, unsigned add = 0);

	unsigned get_stack_depth(node *n, unsigned &loops, unsigned &ifs,
	                         unsigned add = 0);

	void cf_peephole();

private:
	void copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg_start);
	void emit_set_texture_offsets(fetch_node &f);
};


} // namespace r600_sb

#endif /* SB_PASS_H_ */