From a087e9f27f088c4bd00b2fddb65629c83d404860 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 2 Oct 2012 15:01:24 -0700 Subject: [PATCH] i965/fs: Reduce the interference between payload regs and virtual GRFs. Improves performance of the Lightsmark penumbra shadows scene by 15.7% +/- 1.0% (n=15), by eliminating register spilling. (tested by smashing the list of scenes to have all other scenes have 0 duration -- includes additional rendering of scene description text that normally doesn't appear in that scene) v2: Allow allocation of all but g0/g1 of the payload. v3: Pull count_to_loop_end() out to a helper function. Reviewed-by: Kenneth Graunke (v2, recommended v3) --- src/mesa/drivers/dri/i965/brw_fs.h | 2 + .../drivers/dri/i965/brw_fs_reg_allocate.cpp | 169 ++++++++++++++++-- 2 files changed, 152 insertions(+), 19 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index a14c2ee7326..581eb4e2472 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -233,6 +233,8 @@ public: void assign_urb_setup(); bool assign_regs(); void assign_regs_trivial(); + void setup_payload_interference(struct ra_graph *g, int payload_reg_count, + int first_payload_node); int choose_spill_reg(struct ra_graph *g); void spill_reg(int spill_reg); void split_virtual_grfs(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index b9431024559..1d7c7733e8e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -160,30 +160,162 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width, int base_reg_count) ra_set_finalize(brw->wm.regs, NULL); } +int +count_to_loop_end(fs_inst *do_inst) +{ + int depth = 1; + int ip = ip; + for (fs_inst *inst = (fs_inst *)do_inst->next; + depth > 0; + inst = (fs_inst *)inst->next) { + switch (inst->opcode) { + case BRW_OPCODE_DO: + depth++; + break; + case BRW_OPCODE_WHILE: + depth--; + break; + default: + break; + } + ip++; + } + return ip; +} + /** * Sets up interference between thread payload registers and the virtual GRFs * to be allocated for program temporaries. + * + * We want to be able to reallocate the payload for our virtual GRFs, notably + * because the setup coefficients for a full set of 16 FS inputs takes up 8 of + * our 128 registers. + * + * The layout of the payload registers is: + * + * 0..nr_payload_regs-1: fixed function setup (including bary coordinates). + * nr_payload_regs..nr_payload_regs+curb_read_lengh-1: uniform data + * nr_payload_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients. + * + * And we have payload_node_count nodes covering these registers in order + * (note that in 16-wide, a node is two registers). */ -static void -brw_setup_payload_interference(struct ra_graph *g, - int payload_reg_count, - int first_payload_node, - int reg_node_count) +void +fs_visitor::setup_payload_interference(struct ra_graph *g, + int payload_node_count, + int first_payload_node) { - for (int i = 0; i < payload_reg_count; i++) { - /* Mark each payload reg node as being allocated to its physical register. + int reg_width = c->dispatch_width / 8; + int loop_depth = 0; + int loop_end_ip = 0; + + int payload_last_use_ip[payload_node_count]; + memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip)); + int ip = 0; + foreach_list(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + switch (inst->opcode) { + case BRW_OPCODE_DO: + loop_depth++; + + /* Since payload regs are deffed only at the start of the shader + * execution, any uses of the payload within a loop mean the live + * interval extends to the end of the outermost loop. Find the ip of + * the end now. + */ + if (loop_depth == 1) + loop_end_ip = ip + count_to_loop_end(inst); + break; + case BRW_OPCODE_WHILE: + loop_depth--; + break; + default: + break; + } + + int use_ip; + if (loop_depth > 0) + use_ip = loop_end_ip; + else + use_ip = ip; + + /* Note that UNIFORM args have been turned into FIXED_HW_REG by + * assign_curbe_setup(), and interpolation uses fixed hardware regs from + * the start (see interp_reg()). + */ + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == FIXED_HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width; + if (node_nr >= payload_node_count) + continue; + + payload_last_use_ip[node_nr] = use_ip; + } + } + + /* Special case instructions which have extra implied registers used. */ + switch (inst->opcode) { + case FS_OPCODE_FB_WRITE: + /* We could omit this for the !inst->header_present case, except that + * the simulator apparently incorrectly reads from g0/g1 instead of + * sideband. It also really freaks out driver developers to see g0 + * used in unusual places, so just always reserve it. + */ + payload_last_use_ip[0 / reg_width] = use_ip; + payload_last_use_ip[1 / reg_width] = use_ip; + break; + case FS_OPCODE_DISCARD: + payload_last_use_ip[1 / reg_width] = use_ip; + break; + + case FS_OPCODE_LINTERP: + /* On gen6+ in 16-wide, there are 4 adjacent registers (so 2 nodes) + * used by PLN's sourcing of the deltas, while we list only the first + * two in the arguments (1 node). Pre-gen6, the deltas are computed + * in normal VGRFs. + */ + if (intel->gen >= 6) { + int delta_x_arg = 0; + if (inst->src[delta_x_arg].file == FIXED_HW_REG && + inst->src[delta_x_arg].fixed_hw_reg.file == + BRW_GENERAL_REGISTER_FILE) { + int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr / + reg_width) + 1; + assert(sechalf_node < payload_node_count); + payload_last_use_ip[sechalf_node] = use_ip; + } + } + break; + + default: + break; + } + + ip++; + } + + for (int i = 0; i < payload_node_count; i++) { + /* Mark the payload node as interfering with any virtual grf that is + * live between the start of the program and our last use of the payload + * node. + */ + for (int j = 0; j < this->virtual_grf_count; j++) { + if (this->virtual_grf_def[j] <= payload_last_use_ip[i] || + this->virtual_grf_use[j] <= payload_last_use_ip[i]) { + ra_add_node_interference(g, first_payload_node + i, j); + } + } + } + + for (int i = 0; i < payload_node_count; i++) { + /* Mark each payload node as being allocated to its physical register. * * The alternative would be to have per-physical-register classes, which * would just be silly. */ ra_set_node_reg(g, first_payload_node + i, i); - - /* For now, just mark each payload node as interfering with every other - * node to be allocated. - */ - for (int j = 0; j < reg_node_count; j++) { - ra_add_node_interference(g, first_payload_node + i, j); - } } } @@ -198,7 +330,7 @@ fs_visitor::assign_regs() */ int reg_width = c->dispatch_width / 8; int hw_reg_mapping[this->virtual_grf_count]; - int payload_reg_count = (ALIGN(this->first_non_payload_grf, reg_width) / + int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) / reg_width); int base_reg_count = max_grf / reg_width; @@ -208,7 +340,7 @@ fs_visitor::assign_regs() int node_count = this->virtual_grf_count; int first_payload_node = node_count; - node_count += payload_reg_count; + node_count += payload_node_count; struct ra_graph *g = ra_alloc_interference_graph(brw->wm.regs, node_count); for (int i = 0; i < this->virtual_grf_count; i++) { @@ -240,8 +372,7 @@ fs_visitor::assign_regs() } } - brw_setup_payload_interference(g, payload_reg_count, first_payload_node, - this->virtual_grf_count); + setup_payload_interference(g, payload_node_count, first_payload_node); if (!ra_allocate_no_spills(g)) { /* Failed to allocate registers. Spill a reg, and the caller will @@ -268,7 +399,7 @@ fs_visitor::assign_regs() * regs in the register classes back down to real hardware reg * numbers. */ - this->grf_used = payload_reg_count * reg_width; + this->grf_used = payload_node_count * reg_width; for (int i = 0; i < this->virtual_grf_count; i++) { int reg = ra_get_node_reg(g, i);