mesa/src/gallium/drivers/r600/sb/sb_ir.cpp

552 lines
12 KiB
C++
Raw Permalink Normal View History

/*
* Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Vadim Girlin
*/
#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"
namespace r600_sb {
// Visitor entry points. Every node class defines its own accept() so that the
// call p.visit(*this, enter) is made with *this typed as that concrete class;
// the `enter` flag is passed through and the pass's return value is returned
// unchanged.
bool node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool container_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool alu_group_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool alu_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool cf_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool fetch_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool region_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool repeat_node::accept(vpass& p, bool enter) {
return p.visit(*this, enter);
}
bool depart_node::accept(vpass& p, bool enter) {
return p.visit(*this, enter);
}
bool if_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool bb_node::accept(vpass& p, bool enter) { return p.visit(*this, enter); }
bool alu_packed_node::accept(vpass& p, bool enter) {
return p.visit(*this, enter);
}
void alu_packed_node::init_args(bool repl) {
alu_node *p = static_cast<alu_node*>(first);
assert(p->is_valid());
while (p) {
dst.insert(dst.end(), p->dst.begin(), p->dst.end());
src.insert(src.end(), p->src.begin(), p->src.end());
p = static_cast<alu_node*>(p->next);
}
value *replicated_value = NULL;
for (vvec::iterator I = dst.begin(), E = dst.end(); I != E; ++I) {
value *v = *I;
if (v) {
if (repl) {
if (replicated_value)
v->assign_source(replicated_value);
else
replicated_value = v;
}
v->def = this;
}
}
}
// Links n into this container's child list immediately before s and sets
// n->parent to this container. s is expected to already be a child of this
// container; when s has no predecessor, n becomes the new `first`.
void container_node::insert_node_before(node* s, node* n) {
	node *sp = s->prev;

	// Overwrite both of n's links unconditionally. n may still carry links
	// from an earlier position (remove_node() does not clear them), and the
	// head of the list must have prev == NULL for backward walks to stop.
	n->prev = sp;
	n->next = s;
	s->prev = n;

	if (sp)
		sp->next = n;
	else
		first = n;

	n->parent = this;
}
// Links n into this container's child list immediately after s and sets
// n->parent to this container. s is expected to already be a child of this
// container; when s has no successor, n becomes the new `last`.
void container_node::insert_node_after(node* s, node* n) {
	node *sn = s->next;

	// Overwrite both of n's links unconditionally. n may still carry links
	// from an earlier position (remove_node() does not clear them), and the
	// tail of the list must have next == NULL for forward walks to stop.
	n->prev = s;
	n->next = sn;
	s->next = n;

	if (sn)
		sn->prev = n;
	else
		last = n;

	n->parent = this;
}
// Cuts the range [b, e) out of the container that currently owns *b and
// installs it as this container's whole child list: first/last are
// overwritten and every moved node is re-parented to this container.
void container_node::move(iterator b, iterator e) {
	assert(b != e);

	container_node *donor = b->parent;
	node *head = donor->cut(b, e);

	first = last = head;
	for (node *cur = head; cur; cur = cur->next) {
		cur->parent = this;
		last = cur;
	}
}
// Unlinks the range [b, e) from this container and returns its first node.
// The detached chain keeps its internal links; its head gets prev == NULL and
// its tail gets next == NULL. Parent pointers of the cut nodes are not
// modified here.
node* container_node::cut(iterator b, iterator e) {
	assert(!*b || b->parent == this);
	assert(!*e || e->parent == this);
	assert(b != e);

	node *head = *b;
	node *stop = *e;
	node *before = head->prev;

	// Close the gap on the left side of the range.
	if (before)
		before->next = stop;
	else
		first = stop;

	// Terminate the detached chain, then close the gap on the right side.
	node *tail = stop ? stop->prev : last;
	tail->next = NULL;
	if (stop)
		stop->prev = before;
	else
		last = before;

	head->prev = NULL;
	return head;
}
unsigned container_node::count() {
unsigned c = 0;
node *t = first;
while (t) {
t = t->next;
c++;
}
return c;
}
// Unlinks n from this container's child list, updating first/last when n sits
// at either end, and clears n->parent. n's own prev/next are left as they were.
void container_node::remove_node(node *n) {
	node *before = n->prev;
	node *after = n->next;

	if (before)
		before->next = after;
	else
		first = after;

	if (after)
		after->prev = before;
	else
		last = before;

	n->parent = NULL;
}
// Replaces child container n with n's own children: they are spliced into
// this container at n's position and re-parented to this container. An empty
// n is simply removed via remove_node().
void container_node::expand(container_node *n) {
	if (n->empty()) {
		remove_node(n);
		return;
	}

	node *head = n->first;
	node *tail = n->last;

	// Attach the left end of n's children to n's predecessor.
	head->prev = n->prev;
	if (head->prev)
		head->prev->next = head;
	else
		first = head;

	// Attach the right end of n's children to n's successor.
	tail->next = n->next;
	if (tail->next)
		tail->next->prev = tail;
	else
		last = tail;

	// Re-parent every node from head through tail.
	node *stop = tail->next;
	node *cur = head;
	do {
		cur->parent = this;
		cur = cur->next;
	} while (cur != stop);
}
// Appends n as the last child of this container and sets n->parent.
void container_node::push_back(node *n) {
	node *tail = last;

	if (tail) {
		tail->next = n;
	} else {
		// An empty list must have both ends unset.
		assert(!first);
		first = n;
	}

	n->prev = tail;
	n->next = NULL;
	last = n;
	n->parent = this;
}
// Prepends n as the first child of this container and sets n->parent.
void container_node::push_front(node *n) {
	node *head = first;

	if (head) {
		head->prev = n;
	} else {
		// An empty list must have both ends unset.
		assert(!last);
		last = n;
	}

	n->next = head;
	n->prev = NULL;
	first = n;
	n->parent = this;
}
// Inserts n immediately before this node by delegating to the parent
// container's insert_node_before(). Dereferences `parent` without a check.
void node::insert_before(node* n) {
	this->parent->insert_node_before(this, n);
}
// Inserts n immediately after this node by delegating to the parent
// container's insert_node_after(). Dereferences `parent` without a check.
void node::insert_after(node* n) {
	this->parent->insert_node_after(this, n);
}
// Puts n in this node's place: n takes over this node's prev/next/parent, the
// neighbours and the parent's first/last are redirected to n where they
// referred to this node, and this node ends up with all three links NULL.
void node::replace_with(node* n) {
	// Copy this node's position onto the replacement.
	n->parent = parent;
	n->prev = prev;
	n->next = next;

	// Redirect whoever pointed at this node.
	if (prev)
		prev->next = n;
	if (next)
		next->prev = n;
	if (parent->first == this)
		parent->first = n;
	if (parent->last == this)
		parent->last = n;

	// Detach this node completely.
	prev = NULL;
	next = NULL;
	parent = NULL;
}
// Expands this container in place by asking its parent to splice this
// container's children into the parent's list. Dereferences `parent`
// without a check.
void container_node::expand() {
	this->parent->expand(this);
}
// Removes this node from its parent container via remove_node().
// Dereferences `parent` without a check.
void node::remove() {
	this->parent->remove_node(this);
}
// Commit annotation embedded by the source viewer
// (dated 2016-11-20 13:42:28 +00:00):
//
// r600/sb: Fix loop optimization related hangs on eg
//
// Make sure unused ops and their references are removed, prior to entering
// the GCM (global code motion) pass, to stop GCM from breaking the loop
// logic and thus hanging the GPU.
//
// Turns out, that sb has problems with loops and node optimizations
// regarding associative folding:
//
// - the global code motion (gcm) pass moves ops up a loop level/basic block
//   until they've fulfilled their total usage count
// - if there are ops folded into others, the usage count won't be fulfilled
//   and thus the op moved way up to the top
// - within GCM the op would be visited and their deps would be moved
//   alongside it, to fulfill the src constraints
// - in a loop, an unused op is moved out of the loop and GCM would move the
//   src value ops up as well
// - now here arises the problem: if the loop counter is one of the src
//   values it would get moved up as well, the loop break condition would
//   never get hit and the shader turn into an endless loop, resulting in the
//   GPU hanging and being reset
//
// A reduced (albeit nonsense) piglit example would be:
//
//   [require]
//   GLSL >= 1.20
//
//   [fragment shader]
//   uniform int SIZE;
//   uniform vec4 lights[512];
//   void main() {
//     float x = 0;
//     for(int i = 0; i < SIZE; i++)
//       x += lights[2*i+1].x;
//   }
//
//   [test]
//   uniform int SIZE 1
//   draw rect -1 -1 2 2
//
// Which gets optimized to:
//
//   ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN =====
//   ===== 42 dw ===== 1 gprs ===== 2 stack =========================================
//   ALU 3 @24
//     1 y: MOV R0.y, 0
//       t: MULLO_UINT R0.w, [0x00000002 2.8026e-45].x, R0.z
//   LOOP_START_DX10 @22
//   PUSH @6
//   ALU 1 @30 KC0[CB0:0-15]
//     2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x
//   JUMP @14 POP:1
//   LOOP_BREAK @20
//   POP @14 POP:1
//   ALU 2 @32
//     3 x: ADD_INT R0.x, R0.w, [0x00000002 2.8026e-45].x
//   TEX 1 @36
//     VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..]
//   ALU 1 @40
//     4 y: ADD R0.y, R0.y, R0.x
//   LOOP_END @4
//   EXPORT_DONE PIXEL 0 R0.____ EOP
//   ===== SHADER_END ===============================================================
//
// Notice R0.z being the loop counter/break condition relevant register and
// being never incremented at all. Also some of the loop content has been
// moved out of it, to fulfill the requirements for the one unused op.
//
// With a debug build of mesa this would produce an error like
//
//   error at : PRED_SETGE_INT __, __, EM.2, R1.x.2||FP@R0.z, C0.x
//   : operand value R1.x.2||FP@R0.z was not previously written to its gpr
//
// and the compilation would fail due to this. On a release build it gets
// passed to the GPU.
//
// When using this patch, the loop remains intact:
//
//   ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN =====
//   ===== 48 dw ===== 1 gprs ===== 2 stack =========================================
//   ALU 2 @24
//     1 y: MOV R0.y, 0
//       z: MOV R0.z, 0
//   LOOP_START_DX10 @22
//   PUSH @6
//   ALU 1 @28 KC0[CB0:0-15]
//     2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x
//   JUMP @14 POP:1
//   LOOP_BREAK @20
//   POP @14 POP:1
//   ALU 4 @30
//     3 t: MULLO_UINT T0.x, [0x00000002 2.8026e-45].x, R0.z
//     4 x: ADD_INT R0.x, T0.x, [0x00000002 2.8026e-45].x
//   TEX 1 @40
//     VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..]
//   ALU 2 @44
//     5 y: ADD R0.y, R0.y, R0.x
//       z: ADD_INT R0.z, R0.z, 1
//   LOOP_END @4
//   EXPORT_DONE PIXEL 0 R0.____ EOP
//   ===== SHADER_END ===============================================================
//
// Piglit: ./piglit summary console -d results/*_gpu_noglx
//
//   name:        unpatched_gpu_noglx  patched_gpu_noglx
//   ----         -------------------  -----------------
//   pass:        18016                18021
//   fail:        748                  743
//   crash:       7                    7
//   skip:        1124                 1124
//   timeout:     0                    0
//   warn:        13                   13
//   incomplete:  0                    0
//   dmesg-warn:  0                    0
//   dmesg-fail:  0                    0
//   changes:     0                    5
//   fixes:       0                    5
//   regressions: 0                    0
//   total:       19908                19908
//
// Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94900
// Tested-by: Heiko Przybyl <lil_tux@web.de>
// Tested-on: Barts PRO HD6850
// Signed-off-by: Heiko Przybyl <lil_tux@web.de>
// Signed-off-by: Marek Olšák <marek.olsak@amd.com>
// Folds the hashes of all non-null source values into the seed 12345 with
// XOR and returns the result. Null entries in src are skipped.
value_hash node::hash_src() const {
	value_hash result = 12345;

	const int total = src.size();
	int idx = 0;
	while (idx < total) {
		value *operand = src[idx++];
		if (!operand)
			continue;
		result ^= (operand->hash());
	}

	return result;
}
// Commit annotation embedded by the source viewer
// (dated 2016-11-20 13:42:28 +00:00):
//
// r600/sb: Fix loop optimization related hangs on eg
//
// Make sure unused ops and their references are removed, prior to entering
// the GCM (global code motion) pass, to stop GCM from breaking the loop
// logic and thus hanging the GPU.
//
// Turns out, that sb has problems with loops and node optimizations
// regarding associative folding:
//
// - the global code motion (gcm) pass moves ops up a loop level/basic block
//   until they've fulfilled their total usage count
// - if there are ops folded into others, the usage count won't be fulfilled
//   and thus the op moved way up to the top
// - within GCM the op would be visited and their deps would be moved
//   alongside it, to fulfill the src constraints
// - in a loop, an unused op is moved out of the loop and GCM would move the
//   src value ops up as well
// - now here arises the problem: if the loop counter is one of the src
//   values it would get moved up as well, the loop break condition would
//   never get hit and the shader turn into an endless loop, resulting in the
//   GPU hanging and being reset
//
// A reduced (albeit nonsense) piglit example would be:
//
//   [require]
//   GLSL >= 1.20
//
//   [fragment shader]
//   uniform int SIZE;
//   uniform vec4 lights[512];
//   void main() {
//     float x = 0;
//     for(int i = 0; i < SIZE; i++)
//       x += lights[2*i+1].x;
//   }
//
//   [test]
//   uniform int SIZE 1
//   draw rect -1 -1 2 2
//
// Which gets optimized to:
//
//   ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN =====
//   ===== 42 dw ===== 1 gprs ===== 2 stack =========================================
//   ALU 3 @24
//     1 y: MOV R0.y, 0
//       t: MULLO_UINT R0.w, [0x00000002 2.8026e-45].x, R0.z
//   LOOP_START_DX10 @22
//   PUSH @6
//   ALU 1 @30 KC0[CB0:0-15]
//     2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x
//   JUMP @14 POP:1
//   LOOP_BREAK @20
//   POP @14 POP:1
//   ALU 2 @32
//     3 x: ADD_INT R0.x, R0.w, [0x00000002 2.8026e-45].x
//   TEX 1 @36
//     VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..]
//   ALU 1 @40
//     4 y: ADD R0.y, R0.y, R0.x
//   LOOP_END @4
//   EXPORT_DONE PIXEL 0 R0.____ EOP
//   ===== SHADER_END ===============================================================
//
// Notice R0.z being the loop counter/break condition relevant register and
// being never incremented at all. Also some of the loop content has been
// moved out of it, to fulfill the requirements for the one unused op.
//
// With a debug build of mesa this would produce an error like
//
//   error at : PRED_SETGE_INT __, __, EM.2, R1.x.2||FP@R0.z, C0.x
//   : operand value R1.x.2||FP@R0.z was not previously written to its gpr
//
// and the compilation would fail due to this. On a release build it gets
// passed to the GPU.
//
// When using this patch, the loop remains intact:
//
//   ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN =====
//   ===== 48 dw ===== 1 gprs ===== 2 stack =========================================
//   ALU 2 @24
//     1 y: MOV R0.y, 0
//       z: MOV R0.z, 0
//   LOOP_START_DX10 @22
//   PUSH @6
//   ALU 1 @28 KC0[CB0:0-15]
//     2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x
//   JUMP @14 POP:1
//   LOOP_BREAK @20
//   POP @14 POP:1
//   ALU 4 @30
//     3 t: MULLO_UINT T0.x, [0x00000002 2.8026e-45].x, R0.z
//     4 x: ADD_INT R0.x, T0.x, [0x00000002 2.8026e-45].x
//   TEX 1 @40
//     VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..]
//   ALU 2 @44
//     5 y: ADD R0.y, R0.y, R0.x
//       z: ADD_INT R0.z, R0.z, 1
//   LOOP_END @4
//   EXPORT_DONE PIXEL 0 R0.____ EOP
//   ===== SHADER_END ===============================================================
//
// Piglit: ./piglit summary console -d results/*_gpu_noglx
//
//   name:        unpatched_gpu_noglx  patched_gpu_noglx
//   ----         -------------------  -----------------
//   pass:        18016                18021
//   fail:        748                  743
//   crash:       7                    7
//   skip:        1124                 1124
//   timeout:     0                    0
//   warn:        13                   13
//   incomplete:  0                    0
//   dmesg-warn:  0                    0
//   dmesg-fail:  0                    0
//   changes:     0                    5
//   fixes:       0                    5
//   regressions: 0                    0
//   total:       19908                19908
//
// Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94900
// Tested-by: Heiko Przybyl <lil_tux@web.de>
// Tested-on: Barts PRO HD6850
// Signed-off-by: Heiko Przybyl <lil_tux@web.de>
// Signed-off-by: Marek Olšák <marek.olsak@amd.com>
// Returns this node's hash. Any node whose parent has subtype
// NST_LOOP_PHI_CONTAINER gets the fixed value 47451; otherwise the source
// hash is mixed with the shifted subtype and type.
value_hash node::hash() const {
	const bool in_loop_phi =
		parent && parent->subtype == NST_LOOP_PHI_CONTAINER;
	if (in_loop_phi)
		return 47451;

	return hash_src() ^ (subtype << 13) ^ (type << 3);
}
// Moves all children of c to the end of this container's list, leaves c with
// first == last == NULL, and re-parents each moved node to this container.
// Does nothing when c has no children.
void r600_sb::container_node::append_from(container_node* c) {
	node *head = c->first;
	if (!head)
		return;

	if (last) {
		last->next = head;
		head->prev = last;
	} else {
		first = head;
	}
	last = c->last;

	c->first = NULL;
	c->last = NULL;

	for (node *cur = head; cur; cur = cur->next)
		cur->parent = this;
}
// Folding entry points. Each node class defines its own fold_dispatch() so
// that ex->fold(*this) is called with *this typed as that concrete class; the
// handler's return value is returned unchanged.
bool node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); }
bool container_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); }
bool alu_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); }
bool alu_packed_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); }
bool fetch_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); }
bool cf_node::fold_dispatch(expr_handler* ex) { return ex->fold(*this); }
unsigned alu_packed_node::get_slot_mask() {
unsigned mask = 0;
for (node_iterator I = begin(), E = end(); I != E; ++I)
mask |= 1 << static_cast<alu_node*>(*I)->bc.slot;
return mask;
}
// Writes this packed node's combined src/dst vectors back into its child ALU
// nodes, consuming the vectors in child order. For ops that replicate their
// output, dst is first reordered so each value sits at the index given by its
// final channel.
void alu_packed_node::update_packed_items(sb_context &ctx) {
// SI/DI are cursors into this node's src/dst; they advance across all
// children in the copy-back loop at the bottom.
vvec::iterator SI(src.begin()), DI(dst.begin());
assert(first);
// The op flags and slot flags of the first child decide whether the dst
// reordering below applies.
alu_node *c = static_cast<alu_node*>(first);
unsigned flags = c->bc.op_ptr->flags;
unsigned slot_flags = c->bc.slot_flags;
// fixup dst for instructions that replicate output: collect each non-null dst
// value into swp[] at the index returned by get_final_chan(), then rewrite
// dst from swp[] in index order (unused positions become NULL).
// NOTE(review): swp has 4 entries and the second loop indexes it once per dst
// element - assumes dst.size() <= 4 and get_final_chan() < 4; confirm.
if (((flags & AF_REPL) && slot_flags == AF_4V) ||
(ctx.is_cayman() && slot_flags == AF_S)) {
value *swp[4] = {};
unsigned chan;
for (vvec::iterator I2 = dst.begin(), E2 = dst.end();
I2 != E2; ++I2) {
value *v = *I2;
if (v) {
chan = v->get_final_chan();
// Two different values must not claim the same channel.
assert(!swp[chan] || swp[chan] == v);
swp[chan] = v;
}
}
chan = 0;
for (vvec::iterator I2 = dst.begin(), E2 = dst.end();
I2 != E2; ++I2, ++chan) {
*I2 = swp[chan];
}
}
// Copy the packed operands back: each child takes as many src and dst
// entries as its own vectors hold, in order.
for (node_iterator I = begin(), E = end(); I != E; ++I) {
alu_node *n = static_cast<alu_node*>(*I);
assert(n);
for (vvec::iterator I2 = n->src.begin(), E2 = n->src.end();
I2 != E2; ++I2, ++SI) {
*I2 = *SI;
}
for (vvec::iterator I2 = n->dst.begin(), E2 = n->dst.end();
I2 != E2; ++I2, ++DI) {
*I2 = *DI;
}
}
}
// Returns true when this node is a CF instruction whose bc.op equals op.
bool node::is_cf_op(unsigned op) {
	return is_cf_inst() && static_cast<cf_node*>(this)->bc.op == op;
}
// Returns true when this node is an ALU instruction whose bc.op equals op.
bool node::is_alu_op(unsigned op) {
	return is_alu_inst() && static_cast<alu_node*>(this)->bc.op == op;
}
// Returns true when this node is a fetch instruction whose bc.op equals op.
bool node::is_fetch_op(unsigned op) {
	return is_fetch_inst() && static_cast<fetch_node*>(this)->bc.op == op;
}
bool node::is_mova() {
if (!is_alu_inst())
return false;
alu_node *a = static_cast<alu_node*>(this);
return (a->bc.op_ptr->flags & AF_MOVA);
}
bool node::is_pred_set() {
if (!is_alu_inst())
return false;
alu_node *a = static_cast<alu_node*>(this);
return (a->bc.op_ptr->flags & AF_ANY_PRED);
}
// Returns the op flags of this node viewed as a cf_node. The node must be a
// CF instruction (asserted).
unsigned node::cf_op_flags() {
	assert(is_cf_inst());
	return static_cast<cf_node*>(this)->bc.op_ptr->flags;
}
// Returns the op flags of this node viewed as an alu_node. The node must be
// an ALU instruction (asserted).
unsigned node::alu_op_flags() {
	assert(is_alu_inst());
	return static_cast<alu_node*>(this)->bc.op_ptr->flags;
}
// Returns the op flags of this node viewed as a fetch_node. The node must be
// a fetch instruction (asserted).
unsigned node::fetch_op_flags() {
	assert(is_fetch_inst());
	return static_cast<fetch_node*>(this)->bc.op_ptr->flags;
}
// Returns bc.slot_flags of this node viewed as an alu_node. The node must be
// an ALU instruction (asserted).
unsigned node::alu_op_slot_flags() {
	assert(is_alu_inst());
	return static_cast<alu_node*>(this)->bc.slot_flags;
}
// Walks up the parent chain and returns the nearest ancestor for which
// is_region() is true, or NULL when there is none. This node itself is not
// considered.
region_node* node::get_parent_region() {
	for (node *up = parent; up; up = up->parent) {
		if (up->is_region())
			return static_cast<region_node*>(up);
	}
	return NULL;
}
unsigned container_node::real_alu_count() {
unsigned c = 0;
node *t = first;
while (t) {
if (t->is_alu_inst())
++c;
else if (t->is_alu_packed())
c += static_cast<container_node*>(t)->count();
t = t->next;
}
return c;
}
// Accumulates per-kind node counters into s for every child of this
// container, recursing into children that are themselves containers.
void container_node::collect_stats(node_stats& s) {
	node_iterator it = begin(), stop = end();

	for (; it != stop; ++it) {
		node *child = *it;

		// Descend first so nested nodes are counted too; the child itself is
		// still classified below.
		if (child->is_container())
			static_cast<container_node*>(child)->collect_stats(s);

		if (child->is_alu_inst()) {
			alu_node *alu = static_cast<alu_node*>(child);
			++s.alu_count;

			// Kill takes precedence over copy-mov in the breakdown.
			if (alu->bc.op_ptr->flags & AF_KILL)
				++s.alu_kill_count;
			else if (alu->is_copy_mov())
				++s.alu_copy_mov_count;

			if (alu->uses_ar())
				s.uses_ar = true;
			continue;
		}

		if (child->is_fetch_inst()) {
			++s.fetch_count;
			continue;
		}

		if (child->is_cf_inst()) {
			++s.cf_count;
			continue;
		}

		if (child->is_region()) {
			region_node *region = static_cast<region_node*>(child);
			++s.region_count;

			if (region->is_loop())
				++s.loop_count;
			if (region->phi)
				s.phi_count += region->phi->count();
			if (region->loop_phi)
				s.loop_phi_count += region->loop_phi->count();
			continue;
		}

		if (child->is_depart())
			++s.depart_count;
		else if (child->is_repeat())
			++s.repeat_count;
		else if (child->is_if())
			++s.if_count;
	}
}
// Drops d from this region's departs list (at index d->dep_id), decrements
// dep_id of every depart stored after it, then expands d in place.
void region_node::expand_depart(depart_node *d) {
	// Continue from the iterator erase() hands back, i.e. the entries that
	// followed d.
	depart_vec::iterator it = departs.erase(departs.begin() + d->dep_id);

	for (depart_vec::iterator stop = departs.end(); it != stop; ++it)
		--(*it)->dep_id;

	d->expand();
}
// Drops r from this region's repeats list (at index r->rep_id - 1),
// decrements rep_id of every repeat stored after it, then expands r in place.
void region_node::expand_repeat(repeat_node *r) {
	// Continue from the iterator erase() hands back, i.e. the entries that
	// followed r.
	repeat_vec::iterator it = repeats.erase(repeats.begin() + r->rep_id - 1);

	for (repeat_vec::iterator stop = repeats.end(); it != stop; ++it)
		--(*it)->rep_id;

	r->expand();
}
// Writes every counter of this node_stats to sblog, one "name : value" line
// per counter.
void node_stats::dump() {
	sblog
		<< " alu_count : " << alu_count << "\n"
		<< " alu_kill_count : " << alu_kill_count << "\n"
		<< " alu_copy_mov_count : " << alu_copy_mov_count << "\n"
		<< " cf_count : " << cf_count << "\n"
		<< " fetch_count : " << fetch_count << "\n"
		<< " region_count : " << region_count << "\n"
		<< " loop_count : " << loop_count << "\n"
		<< " phi_count : " << phi_count << "\n"
		<< " loop_phi_count : " << loop_phi_count << "\n"
		<< " depart_count : " << depart_count << "\n"
		<< " repeat_count : " << repeat_count << "\n"
		<< " if_count : " << if_count << "\n";
}
// Returns 0 when the op lacks the AF_INTERP flag. Otherwise returns
// select.sel() + 1 of the relevant source operand: src[1] for ops with
// exactly two sources, src[0] for all others.
unsigned alu_node::interp_param() {
	if (!(bc.op_ptr->flags & AF_INTERP))
		return 0;

	const unsigned operand = (bc.op_ptr->src_count == 2) ? 1 : 0;
	unsigned param = src[operand]->select.sel();
	return param + 1;
}
// Returns the node's parent cast to alu_group_node, or NULL when there is no
// parent. When the direct parent is a packed ALU node, the result is that
// packed node's own parent, which is asserted to have subtype NST_ALU_GROUP.
alu_group_node* alu_node::get_alu_group_node() {
	node *owner = parent;
	if (!owner)
		return NULL;

	if (owner->subtype == NST_ALU_PACKED_INST) {
		assert(owner->parent && owner->parent->subtype == NST_ALU_GROUP);
		owner = owner->parent;
	}

	return static_cast<alu_group_node*>(owner);
}
} // namespace r600_sb