mesa/src/gallium/drivers/r600/sb/sb_ir.h

1190 lines
26 KiB
C
Raw Permalink Normal View History

/*
* Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Vadim Girlin
*/
#ifndef R600_SB_IR_H_
#define R600_SB_IR_H_
#include <algorithm>
#include <stdint.h>
#include <vector>
#include <set>
#include <algorithm>
#include "sb_bc.h"
namespace r600_sb {
enum special_regs {
SV_ALU_PRED = 128,
SV_EXEC_MASK,
SV_AR_INDEX,
SV_VALID_MASK,
SV_GEOMETRY_EMIT,
SV_LDS_RW,
SV_LDS_OQA,
SV_LDS_OQB,
SV_SCRATCH
};
class node;
class value;
class shader;
struct sel_chan
{
unsigned id;
sel_chan(unsigned id = 0) : id(id) {}
sel_chan(unsigned sel, unsigned chan) : id(((sel << 2) | chan) + 1) {}
unsigned sel() const { return sel(id); }
unsigned chan() const {return chan(id); }
operator unsigned() const {return id;}
static unsigned sel(unsigned idx) { return (idx-1) >> 2; }
static unsigned chan(unsigned idx) { return (idx-1) & 3; }
sel_chan(unsigned bank, unsigned index,
unsigned chan, alu_kcache_index_mode index_mode)
: id(sel_chan((bank << 12) | index | ((unsigned)index_mode << 28), chan).id) {}
unsigned kcache_index_mode() const { return sel() >> 28; }
unsigned kcache_sel() const { return sel() & 0x0fffffffu; }
unsigned kcache_bank() const { return kcache_sel() >> 12; }
};
inline sb_ostream& operator <<(sb_ostream& o, sel_chan r) {
static const char * ch = "xyzw";
o << r.sel() << "." << ch[r.chan()];
return o;
}
typedef std::vector<value*> vvec;
class sb_pool {
protected:
static const unsigned SB_POOL_ALIGN = 8;
static const unsigned SB_POOL_DEFAULT_BLOCK_SIZE = (1 << 16);
typedef std::vector<void*> block_vector;
unsigned block_size;
block_vector blocks;
unsigned total_size;
public:
sb_pool(unsigned block_size = SB_POOL_DEFAULT_BLOCK_SIZE)
: block_size(block_size), blocks(), total_size() {}
virtual ~sb_pool() { free_all(); }
void* allocate(unsigned sz);
protected:
void free_all();
};
template <typename V, typename Comp = std::less<V> >
class sb_set {
typedef std::vector<V> data_vector;
data_vector vec;
public:
typedef typename data_vector::iterator iterator;
typedef typename data_vector::const_iterator const_iterator;
sb_set() : vec() {}
~sb_set() { }
iterator begin() { return vec.begin(); }
iterator end() { return vec.end(); }
const_iterator begin() const { return vec.begin(); }
const_iterator end() const { return vec.end(); }
void add_set(const sb_set& s) {
data_vector t;
t.reserve(vec.size() + s.vec.size());
std::set_union(vec.begin(), vec.end(), s.vec.begin(), s.vec.end(),
std::inserter(t, t.begin()), Comp());
vec.swap(t);
}
iterator lower_bound(const V& v) {
return std::lower_bound(vec.begin(), vec.end(), v, Comp());
}
std::pair<iterator, bool> insert(const V& v) {
iterator P = lower_bound(v);
if (P != vec.end() && is_equal(*P, v))
return std::make_pair(P, false);
return std::make_pair(vec.insert(P, v), true);
}
unsigned erase(const V& v) {
iterator P = lower_bound(v);
if (P == vec.end() || !is_equal(*P, v))
return 0;
vec.erase(P);
return 1;
}
void clear() { vec.clear(); }
bool empty() { return vec.empty(); }
bool is_equal(const V& v1, const V& v2) {
return !Comp()(v1, v2) && !Comp()(v2, v1);
}
iterator find(const V& v) {
iterator P = lower_bound(v);
return (P != vec.end() && is_equal(*P, v)) ? P : vec.end();
}
unsigned size() { return vec.size(); }
void erase(iterator I) { vec.erase(I); }
};
template <typename K, typename V, typename KComp = std::less<K> >
class sb_map {
typedef std::pair<K, V> datatype;
struct Comp {
bool operator()(const datatype &v1, const datatype &v2) {
return KComp()(v1.first, v2.first);
}
};
typedef sb_set<datatype, Comp> dataset;
dataset set;
public:
sb_map() : set() {}
typedef typename dataset::iterator iterator;
iterator begin() { return set.begin(); }
iterator end() { return set.end(); }
void clear() { set.clear(); }
V& operator[](const K& key) {
datatype P = std::make_pair(key, V());
iterator F = set.find(P);
if (F == set.end()) {
return (*(set.insert(P).first)).second;
} else {
return (*F).second;
}
}
std::pair<iterator, bool> insert(const datatype& d) {
return set.insert(d);
}
iterator find(const K& key) {
return set.find(std::make_pair(key, V()));
}
unsigned erase(const K& key) {
return set.erase(std::make_pair(key, V()));
}
void erase(iterator I) {
set.erase(I);
}
};
class sb_bitset {
typedef uint32_t basetype;
static const unsigned bt_bits = sizeof(basetype) << 3;
std::vector<basetype> data;
unsigned bit_size;
public:
sb_bitset() : data(), bit_size() {}
bool get(unsigned id);
void set(unsigned id, bool bit = true);
bool set_chk(unsigned id, bool bit = true);
void clear();
void resize(unsigned size);
unsigned size() { return bit_size; }
unsigned find_bit(unsigned start = 0);
void swap(sb_bitset & bs2);
bool operator==(const sb_bitset &bs2);
bool operator!=(const sb_bitset &bs2) { return !(*this == bs2); }
sb_bitset& operator|=(const sb_bitset &bs2) {
if (bit_size < bs2.bit_size) {
resize(bs2.bit_size);
}
for (unsigned i = 0, c = std::min(data.size(), bs2.data.size()); i < c;
++i) {
data[i] |= bs2.data[i];
}
return *this;
}
sb_bitset& operator&=(const sb_bitset &bs2);
sb_bitset& mask(const sb_bitset &bs2);
friend sb_bitset operator|(const sb_bitset &b1, const sb_bitset &b2) {
sb_bitset nbs(b1);
nbs |= b2;
return nbs;
}
};
enum value_kind {
VLK_REG,
VLK_REL_REG,
VLK_SPECIAL_REG,
VLK_TEMP,
VLK_CONST,
VLK_KCACHE,
VLK_PARAM,
VLK_SPECIAL_CONST,
VLK_UNDEF
};
class sb_value_pool : protected sb_pool {
unsigned aligned_elt_size;
public:
sb_value_pool(unsigned elt_size, unsigned block_elts = 256)
: sb_pool(block_elts * (aligned_elt_size = ((elt_size +
SB_POOL_ALIGN - 1) & ~(SB_POOL_ALIGN - 1)))) {}
virtual ~sb_value_pool() { delete_all(); }
value* create(value_kind k, sel_chan regid, unsigned ver);
value* operator[](unsigned id) {
unsigned offset = id * aligned_elt_size;
unsigned block_id;
if (offset < block_size) {
block_id = 0;
} else {
block_id = offset / block_size;
offset = offset % block_size;
}
return (value*)((char*)blocks[block_id] + offset);
}
unsigned size() { return total_size / aligned_elt_size; }
protected:
void delete_all();
};
class sb_value_set {
sb_bitset bs;
public:
sb_value_set() : bs() {}
class iterator {
sb_value_pool &vp;
sb_value_set *s;
unsigned nb;
public:
iterator(shader &sh, sb_value_set *s, unsigned nb = 0);
iterator& operator++() {
if (nb + 1 < s->bs.size())
nb = s->bs.find_bit(nb + 1);
else
nb = s->bs.size();
return *this;
}
bool operator !=(const iterator &i) {
return s != i.s || nb != i.nb;
}
bool operator ==(const iterator &i) { return !(*this != i); }
value* operator *() {
return vp[nb];
}
};
iterator begin(shader &sh) {
return iterator(sh, this, bs.size() ? bs.find_bit(0) : 0);
}
iterator end(shader &sh) { return iterator(sh, this, bs.size()); }
bool add_set_checked(sb_value_set & s2);
void add_set(sb_value_set & s2) {
if (bs.size() < s2.bs.size())
bs.resize(s2.bs.size());
bs |= s2.bs;
}
void remove_set(sb_value_set & s2);
bool add_vec(vvec &vv);
bool add_val(value *v);
bool contains(value *v);
bool remove_val(value *v);
bool remove_vec(vvec &vv);
void clear();
bool empty();
};
typedef sb_value_set val_set;
struct gpr_array {
sel_chan base_gpr; // original gpr
sel_chan gpr; // assigned by regalloc
unsigned array_size;
gpr_array(sel_chan base_gpr, unsigned array_size) : base_gpr(base_gpr),
array_size(array_size) {}
unsigned hash() { return (base_gpr << 10) * array_size; }
val_set interferences;
vvec refs;
bool is_dead();
};
typedef std::vector<gpr_array*> regarray_vec;
enum value_flags {
VLF_UNDEF = (1 << 0),
VLF_READONLY = (1 << 1),
VLF_DEAD = (1 << 2),
VLF_PIN_REG = (1 << 3),
VLF_PIN_CHAN = (1 << 4),
// opposite to alu clause local value - goes through alu clause boundary
// (can't use temp gpr, can't recolor in the alu scheduler, etc)
VLF_GLOBAL = (1 << 5),
VLF_FIXED = (1 << 6),
VLF_PVPS = (1 << 7),
VLF_PREALLOC = (1 << 8)
};
inline value_flags operator |(value_flags l, value_flags r) {
return (value_flags)((unsigned)l|(unsigned)r);
}
inline value_flags operator &(value_flags l, value_flags r) {
return (value_flags)((unsigned)l&(unsigned)r);
}
inline value_flags operator ~(value_flags l) {
return (value_flags)(~(unsigned)l);
}
inline value_flags& operator |=(value_flags &l, value_flags r) {
l = l | r;
return l;
}
inline value_flags& operator &=(value_flags &l, value_flags r) {
l = l & r;
return l;
}
sb_ostream& operator << (sb_ostream &o, value &v);
typedef uint32_t value_hash;
typedef std::list< node * > uselist;
r600/sb: Fix loop optimization related hangs on eg Make sure unused ops and their references are removed, prior to entering the GCM (global code motion) pass, to stop GCM from breaking the loop logic and thus hanging the GPU. Turns out, that sb has problems with loops and node optimizations regarding associative folding: - the global code motion (gcm) pass moves ops up a loop level/basic block until they've fulfilled their total usage count - if there are ops folded into others, the usage count won't be fulfilled and thus the op moved way up to the top - within GCM the op would be visited and their deps would be moved alongside it, to fulfill the src constaints - in a loop, an unused op is moved out of the loop and GCM would move the src value ops up as well - now here arises the problem: if the loop counter is one of the src values it would get moved up as well, the loop break condition would never get hit and the shader turn into an endless loop, resulting in the GPU hanging and being reset A reduced (albeit nonsense) piglit example would be: [require] GLSL >= 1.20 [fragment shader] uniform int SIZE; uniform vec4 lights[512]; void main() { float x = 0; for(int i = 0; i < SIZE; i++) x += lights[2*i+1].x; } [test] uniform int SIZE 1 draw rect -1 -1 2 2 Which gets optimized to: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 42 dw ===== 1 gprs ===== 2 stack ========================================= ALU 3 @24 1 y: MOV R0.y, 0 t: MULLO_UINT R0.w, [0x00000002 2.8026e-45].x, R0.z LOOP_START_DX10 @22 PUSH @6 ALU 1 @30 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 2 @32 3 x: ADD_INT R0.x, R0.w, [0x00000002 2.8026e-45].x TEX 1 @36 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 1 @40 4 y: ADD R0.y, R0.y, R0.x LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Notice R0.z being the loop counter/break condition relevant register and being never incremented at all. Also some of the loop content has been moved out of it, to fulfill the requirements for the one unused op. With a debug build of mesa this would produce an error like error at : PRED_SETGE_INT __, __, EM.2, R1.x.2||FP@R0.z, C0.x : operand value R1.x.2||FP@R0.z was not previously written to its gpr and the compilation would fail due to this. On a release build it gets passed to the GPU. When using this patch, the loop remains intact: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 48 dw ===== 1 gprs ===== 2 stack ========================================= ALU 2 @24 1 y: MOV R0.y, 0 z: MOV R0.z, 0 LOOP_START_DX10 @22 PUSH @6 ALU 1 @28 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 4 @30 3 t: MULLO_UINT T0.x, [0x00000002 2.8026e-45].x, R0.z 4 x: ADD_INT R0.x, T0.x, [0x00000002 2.8026e-45].x TEX 1 @40 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 2 @44 5 y: ADD R0.y, R0.y, R0.x z: ADD_INT R0.z, R0.z, 1 LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Piglit: ./piglit summary console -d results/*_gpu_noglx name: unpatched_gpu_noglx patched_gpu_noglx ---- ------------------- ----------------- pass: 18016 18021 fail: 748 743 crash: 7 7 skip: 1124 1124 timeout: 0 0 warn: 13 13 incomplete: 0 0 dmesg-warn: 0 0 dmesg-fail: 0 0 changes: 0 5 fixes: 0 5 regressions: 0 0 total: 19908 19908 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94900 Tested-by: Heiko Przybyl <lil_tux@web.de> Tested-on: Barts PRO HD6850 Signed-off-by: Heiko Przybyl <lil_tux@web.de> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
2016-11-20 13:42:28 +00:00
enum constraint_kind {
CK_SAME_REG,
CK_PACKED_BS,
CK_PHI
};
class shader;
class sb_value_pool;
struct ra_chunk;
class ra_constraint;
class value {
protected:
value(unsigned sh_id, value_kind k, sel_chan select, unsigned ver = 0)
: kind(k), flags(),
rel(), array(),
version(ver), select(select), pin_gpr(select), gpr(),
gvn_source(), ghash(),
def(), adef(), uses(), constraint(), chunk(),
literal_value(), uid(sh_id) {}
~value() { delete_uses(); }
friend class sb_value_pool;
public:
value_kind kind;
value_flags flags;
vvec mdef;
vvec muse;
value *rel;
gpr_array *array;
unsigned version;
sel_chan select;
sel_chan pin_gpr;
sel_chan gpr;
value *gvn_source;
value_hash ghash;
node *def, *adef;
r600/sb: Fix loop optimization related hangs on eg Make sure unused ops and their references are removed, prior to entering the GCM (global code motion) pass, to stop GCM from breaking the loop logic and thus hanging the GPU. Turns out, that sb has problems with loops and node optimizations regarding associative folding: - the global code motion (gcm) pass moves ops up a loop level/basic block until they've fulfilled their total usage count - if there are ops folded into others, the usage count won't be fulfilled and thus the op moved way up to the top - within GCM the op would be visited and their deps would be moved alongside it, to fulfill the src constaints - in a loop, an unused op is moved out of the loop and GCM would move the src value ops up as well - now here arises the problem: if the loop counter is one of the src values it would get moved up as well, the loop break condition would never get hit and the shader turn into an endless loop, resulting in the GPU hanging and being reset A reduced (albeit nonsense) piglit example would be: [require] GLSL >= 1.20 [fragment shader] uniform int SIZE; uniform vec4 lights[512]; void main() { float x = 0; for(int i = 0; i < SIZE; i++) x += lights[2*i+1].x; } [test] uniform int SIZE 1 draw rect -1 -1 2 2 Which gets optimized to: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 42 dw ===== 1 gprs ===== 2 stack ========================================= ALU 3 @24 1 y: MOV R0.y, 0 t: MULLO_UINT R0.w, [0x00000002 2.8026e-45].x, R0.z LOOP_START_DX10 @22 PUSH @6 ALU 1 @30 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 2 @32 3 x: ADD_INT R0.x, R0.w, [0x00000002 2.8026e-45].x TEX 1 @36 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 1 @40 4 y: ADD R0.y, R0.y, R0.x LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Notice R0.z being the loop counter/break condition relevant register and being never incremented at all. Also some of the loop content has been moved out of it, to fulfill the requirements for the one unused op. With a debug build of mesa this would produce an error like error at : PRED_SETGE_INT __, __, EM.2, R1.x.2||FP@R0.z, C0.x : operand value R1.x.2||FP@R0.z was not previously written to its gpr and the compilation would fail due to this. On a release build it gets passed to the GPU. When using this patch, the loop remains intact: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 48 dw ===== 1 gprs ===== 2 stack ========================================= ALU 2 @24 1 y: MOV R0.y, 0 z: MOV R0.z, 0 LOOP_START_DX10 @22 PUSH @6 ALU 1 @28 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 4 @30 3 t: MULLO_UINT T0.x, [0x00000002 2.8026e-45].x, R0.z 4 x: ADD_INT R0.x, T0.x, [0x00000002 2.8026e-45].x TEX 1 @40 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 2 @44 5 y: ADD R0.y, R0.y, R0.x z: ADD_INT R0.z, R0.z, 1 LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Piglit: ./piglit summary console -d results/*_gpu_noglx name: unpatched_gpu_noglx patched_gpu_noglx ---- ------------------- ----------------- pass: 18016 18021 fail: 748 743 crash: 7 7 skip: 1124 1124 timeout: 0 0 warn: 13 13 incomplete: 0 0 dmesg-warn: 0 0 dmesg-fail: 0 0 changes: 0 5 fixes: 0 5 regressions: 0 0 total: 19908 19908 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94900 Tested-by: Heiko Przybyl <lil_tux@web.de> Tested-on: Barts PRO HD6850 Signed-off-by: Heiko Przybyl <lil_tux@web.de> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
2016-11-20 13:42:28 +00:00
uselist uses;
ra_constraint *constraint;
ra_chunk *chunk;
literal literal_value;
bool is_const() { return kind == VLK_CONST || kind == VLK_UNDEF; }
bool is_AR() {
return is_special_reg() && select == sel_chan(SV_AR_INDEX, 0);
}
bool is_geometry_emit() {
return is_special_reg() && select == sel_chan(SV_GEOMETRY_EMIT, 0);
}
bool is_lds_access() {
return is_special_reg() && select == sel_chan(SV_LDS_RW, 0);
}
bool is_lds_oq() {
return is_special_reg() && (select == sel_chan(SV_LDS_OQA, 0) || select == sel_chan(SV_LDS_OQB, 0));
}
node* any_def() {
assert(!(def && adef));
return def ? def : adef;
}
value* gvalue() {
value *v = this;
while (v->gvn_source && v != v->gvn_source)
// FIXME we really shouldn't have such chains
v = v->gvn_source;
return v;
}
bool is_scratch() {
return is_special_reg() && select == sel_chan(SV_SCRATCH, 0);
}
bool is_float_0_or_1() {
value *v = gvalue();
return v->is_const() && (v->literal_value == literal(0)
|| v->literal_value == literal(1.0f));
}
bool is_undef() { return gvalue()->kind == VLK_UNDEF; }
bool is_any_gpr() {
return (kind == VLK_REG || kind == VLK_TEMP);
}
bool is_agpr() {
return array && is_any_gpr();
}
// scalar gpr, as opposed to element of gpr array
bool is_sgpr() {
return !array && is_any_gpr();
}
bool is_special_reg() { return kind == VLK_SPECIAL_REG; }
bool is_any_reg() { return is_any_gpr() || is_special_reg(); }
bool is_kcache() { return kind == VLK_KCACHE; }
bool is_rel() { return kind == VLK_REL_REG; }
bool is_readonly() { return flags & VLF_READONLY; }
bool is_chan_pinned() { return flags & VLF_PIN_CHAN; }
bool is_reg_pinned() { return flags & VLF_PIN_REG; }
bool is_global();
void set_global();
void set_prealloc();
bool is_prealloc();
bool is_fixed();
void fix();
bool is_dead() { return flags & VLF_DEAD; }
literal & get_const_value() {
value *v = gvalue();
assert(v->is_const());
return v->literal_value;
}
// true if needs to be encoded as literal in alu
bool is_literal() {
return is_const()
&& literal_value != literal(0)
&& literal_value != literal(1)
&& literal_value != literal(-1)
&& literal_value != literal(0.5)
&& literal_value != literal(1.0);
}
void add_use(node *n);
r600/sb: Fix loop optimization related hangs on eg Make sure unused ops and their references are removed, prior to entering the GCM (global code motion) pass, to stop GCM from breaking the loop logic and thus hanging the GPU. Turns out, that sb has problems with loops and node optimizations regarding associative folding: - the global code motion (gcm) pass moves ops up a loop level/basic block until they've fulfilled their total usage count - if there are ops folded into others, the usage count won't be fulfilled and thus the op moved way up to the top - within GCM the op would be visited and their deps would be moved alongside it, to fulfill the src constaints - in a loop, an unused op is moved out of the loop and GCM would move the src value ops up as well - now here arises the problem: if the loop counter is one of the src values it would get moved up as well, the loop break condition would never get hit and the shader turn into an endless loop, resulting in the GPU hanging and being reset A reduced (albeit nonsense) piglit example would be: [require] GLSL >= 1.20 [fragment shader] uniform int SIZE; uniform vec4 lights[512]; void main() { float x = 0; for(int i = 0; i < SIZE; i++) x += lights[2*i+1].x; } [test] uniform int SIZE 1 draw rect -1 -1 2 2 Which gets optimized to: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 42 dw ===== 1 gprs ===== 2 stack ========================================= ALU 3 @24 1 y: MOV R0.y, 0 t: MULLO_UINT R0.w, [0x00000002 2.8026e-45].x, R0.z LOOP_START_DX10 @22 PUSH @6 ALU 1 @30 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 2 @32 3 x: ADD_INT R0.x, R0.w, [0x00000002 2.8026e-45].x TEX 1 @36 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 1 @40 4 y: ADD R0.y, R0.y, R0.x LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Notice R0.z being the loop counter/break condition relevant register and being never incremented at all. Also some of the loop content has been moved out of it, to fulfill the requirements for the one unused op. With a debug build of mesa this would produce an error like error at : PRED_SETGE_INT __, __, EM.2, R1.x.2||FP@R0.z, C0.x : operand value R1.x.2||FP@R0.z was not previously written to its gpr and the compilation would fail due to this. On a release build it gets passed to the GPU. When using this patch, the loop remains intact: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 48 dw ===== 1 gprs ===== 2 stack ========================================= ALU 2 @24 1 y: MOV R0.y, 0 z: MOV R0.z, 0 LOOP_START_DX10 @22 PUSH @6 ALU 1 @28 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 4 @30 3 t: MULLO_UINT T0.x, [0x00000002 2.8026e-45].x, R0.z 4 x: ADD_INT R0.x, T0.x, [0x00000002 2.8026e-45].x TEX 1 @40 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 2 @44 5 y: ADD R0.y, R0.y, R0.x z: ADD_INT R0.z, R0.z, 1 LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Piglit: ./piglit summary console -d results/*_gpu_noglx name: unpatched_gpu_noglx patched_gpu_noglx ---- ------------------- ----------------- pass: 18016 18021 fail: 748 743 crash: 7 7 skip: 1124 1124 timeout: 0 0 warn: 13 13 incomplete: 0 0 dmesg-warn: 0 0 dmesg-fail: 0 0 changes: 0 5 fixes: 0 5 regressions: 0 0 total: 19908 19908 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94900 Tested-by: Heiko Przybyl <lil_tux@web.de> Tested-on: Barts PRO HD6850 Signed-off-by: Heiko Przybyl <lil_tux@web.de> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
2016-11-20 13:42:28 +00:00
void remove_use(const node *n);
value_hash hash();
value_hash rel_hash();
void assign_source(value *v) {
assert(!gvn_source || gvn_source == this);
gvn_source = v->gvalue();
}
bool v_equal(value *v) { return gvalue() == v->gvalue(); }
unsigned use_count();
void delete_uses();
sel_chan get_final_gpr() {
if (array && array->gpr) {
int reg_offset = select.sel() - array->base_gpr.sel();
if (rel && rel->is_const())
reg_offset += rel->get_const_value().i;
return array->gpr + (reg_offset << 2);
} else {
return gpr;
}
}
unsigned get_final_chan() {
if (array) {
assert(array->gpr);
return array->gpr.chan();
} else {
assert(gpr);
return gpr.chan();
}
}
/* Check whether copy-propagation of src into this would create an access
* conflict with relative addressing, i.e. an operation that tries to access
* array elements with different address register values.
*/
bool no_reladdr_conflict_with(value *src);
val_set interferences;
unsigned uid;
};
class expr_handler;
class value_table {
typedef std::vector<value*> vt_item;
typedef std::vector<vt_item> vt_table;
expr_handler &ex;
unsigned size_bits;
unsigned size;
unsigned size_mask;
vt_table hashtable;
unsigned cnt;
public:
value_table(expr_handler &ex, unsigned size_bits = 10)
: ex(ex), size_bits(size_bits), size(1u << size_bits),
size_mask(size - 1), hashtable(size), cnt() {}
~value_table() {}
void add_value(value* v);
bool expr_equal(value* l, value* r);
unsigned count() { return cnt; }
void get_values(vvec & v);
};
class sb_context;
enum node_type {
NT_UNKNOWN,
NT_LIST,
NT_OP,
NT_REGION,
NT_REPEAT,
NT_DEPART,
NT_IF,
};
enum node_subtype {
NST_UNKNOWN,
NST_LIST,
NST_ALU_GROUP,
NST_ALU_CLAUSE,
NST_ALU_INST,
NST_ALU_PACKED_INST,
NST_CF_INST,
NST_FETCH_INST,
NST_TEX_CLAUSE,
NST_VTX_CLAUSE,
NST_GDS_CLAUSE,
NST_BB,
NST_PHI,
NST_PSI,
NST_COPY,
NST_LOOP_PHI_CONTAINER,
NST_LOOP_CONTINUE,
NST_LOOP_BREAK
};
enum node_flags {
NF_EMPTY = 0,
NF_DEAD = (1 << 0),
NF_REG_CONSTRAINT = (1 << 1),
NF_CHAN_CONSTRAINT = (1 << 2),
NF_ALU_4SLOT = (1 << 3),
NF_CONTAINER = (1 << 4),
NF_COPY_MOV = (1 << 5),
NF_DONT_KILL = (1 << 6),
NF_DONT_HOIST = (1 << 7),
NF_DONT_MOVE = (1 << 8),
// for KILLxx - we want to schedule them as early as possible
NF_SCHEDULE_EARLY = (1 << 9),
// for ALU_PUSH_BEFORE - when set, replace with PUSH + ALU
NF_ALU_STACK_WORKAROUND = (1 << 10),
NF_ALU_2SLOT = (1 << 11),
};
inline node_flags operator |(node_flags l, node_flags r) {
return (node_flags)((unsigned)l|(unsigned)r);
}
inline node_flags& operator |=(node_flags &l, node_flags r) {
l = l | r;
return l;
}
inline node_flags& operator &=(node_flags &l, node_flags r) {
l = (node_flags)((unsigned)l & (unsigned)r);
return l;
}
inline node_flags operator ~(node_flags r) {
return (node_flags)~(unsigned)r;
}
struct node_stats {
unsigned alu_count;
unsigned alu_kill_count;
unsigned alu_copy_mov_count;
unsigned cf_count;
unsigned fetch_count;
unsigned region_count;
unsigned loop_count;
unsigned phi_count;
unsigned loop_phi_count;
unsigned depart_count;
unsigned repeat_count;
unsigned if_count;
bool uses_ar;
node_stats() : alu_count(), alu_kill_count(), alu_copy_mov_count(),
cf_count(), fetch_count(), region_count(),
loop_count(), phi_count(), loop_phi_count(), depart_count(),
repeat_count(), if_count(), uses_ar(false) {}
void dump();
};
class shader;
class vpass;
class container_node;
class region_node;
class node {
protected:
node(node_type nt, node_subtype nst, node_flags flags = NF_EMPTY)
: prev(), next(), parent(),
type(nt), subtype(nst), flags(flags),
pred(), dst(), src() {}
virtual ~node() {};
public:
node *prev, *next;
container_node *parent;
node_type type;
node_subtype subtype;
node_flags flags;
value *pred;
vvec dst;
vvec src;
virtual bool is_valid() { return true; }
virtual bool accept(vpass &p, bool enter);
void insert_before(node *n);
void insert_after(node *n);
void replace_with(node *n);
void remove();
r600/sb: Fix loop optimization related hangs on eg Make sure unused ops and their references are removed, prior to entering the GCM (global code motion) pass, to stop GCM from breaking the loop logic and thus hanging the GPU. Turns out, that sb has problems with loops and node optimizations regarding associative folding: - the global code motion (gcm) pass moves ops up a loop level/basic block until they've fulfilled their total usage count - if there are ops folded into others, the usage count won't be fulfilled and thus the op moved way up to the top - within GCM the op would be visited and their deps would be moved alongside it, to fulfill the src constaints - in a loop, an unused op is moved out of the loop and GCM would move the src value ops up as well - now here arises the problem: if the loop counter is one of the src values it would get moved up as well, the loop break condition would never get hit and the shader turn into an endless loop, resulting in the GPU hanging and being reset A reduced (albeit nonsense) piglit example would be: [require] GLSL >= 1.20 [fragment shader] uniform int SIZE; uniform vec4 lights[512]; void main() { float x = 0; for(int i = 0; i < SIZE; i++) x += lights[2*i+1].x; } [test] uniform int SIZE 1 draw rect -1 -1 2 2 Which gets optimized to: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 42 dw ===== 1 gprs ===== 2 stack ========================================= ALU 3 @24 1 y: MOV R0.y, 0 t: MULLO_UINT R0.w, [0x00000002 2.8026e-45].x, R0.z LOOP_START_DX10 @22 PUSH @6 ALU 1 @30 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 2 @32 3 x: ADD_INT R0.x, R0.w, [0x00000002 2.8026e-45].x TEX 1 @36 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 1 @40 4 y: ADD R0.y, R0.y, R0.x LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Notice R0.z being the loop counter/break condition relevant register and being never incremented at all. Also some of the loop content has been moved out of it, to fulfill the requirements for the one unused op. With a debug build of mesa this would produce an error like error at : PRED_SETGE_INT __, __, EM.2, R1.x.2||FP@R0.z, C0.x : operand value R1.x.2||FP@R0.z was not previously written to its gpr and the compilation would fail due to this. On a release build it gets passed to the GPU. When using this patch, the loop remains intact: ===== SHADER #12 OPT ================================== PS/BARTS/EVERGREEN ===== ===== 48 dw ===== 1 gprs ===== 2 stack ========================================= ALU 2 @24 1 y: MOV R0.y, 0 z: MOV R0.z, 0 LOOP_START_DX10 @22 PUSH @6 ALU 1 @28 KC0[CB0:0-15] 2 M x: PRED_SETGE_INT __.x, R0.z, KC0[0].x JUMP @14 POP:1 LOOP_BREAK @20 POP @14 POP:1 ALU 4 @30 3 t: MULLO_UINT T0.x, [0x00000002 2.8026e-45].x, R0.z 4 x: ADD_INT R0.x, T0.x, [0x00000002 2.8026e-45].x TEX 1 @40 VFETCH R0.x___, R0.x, RID:0 MFC:16 UCF:0 FMT[..] ALU 2 @44 5 y: ADD R0.y, R0.y, R0.x z: ADD_INT R0.z, R0.z, 1 LOOP_END @4 EXPORT_DONE PIXEL 0 R0.____ EOP ===== SHADER_END =============================================================== Piglit: ./piglit summary console -d results/*_gpu_noglx name: unpatched_gpu_noglx patched_gpu_noglx ---- ------------------- ----------------- pass: 18016 18021 fail: 748 743 crash: 7 7 skip: 1124 1124 timeout: 0 0 warn: 13 13 incomplete: 0 0 dmesg-warn: 0 0 dmesg-fail: 0 0 changes: 0 5 fixes: 0 5 regressions: 0 0 total: 19908 19908 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94900 Tested-by: Heiko Przybyl <lil_tux@web.de> Tested-on: Barts PRO HD6850 Signed-off-by: Heiko Przybyl <lil_tux@web.de> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
2016-11-20 13:42:28 +00:00
virtual value_hash hash() const;
value_hash hash_src() const;
virtual bool fold_dispatch(expr_handler *ex);
bool is_container() { return flags & NF_CONTAINER; }
bool is_alu_packed() { return subtype == NST_ALU_PACKED_INST; }
bool is_alu_inst() { return subtype == NST_ALU_INST; }
bool is_alu_group() { return subtype == NST_ALU_GROUP; }
bool is_alu_clause() { return subtype == NST_ALU_CLAUSE; }
bool is_fetch_clause() {
return subtype == NST_TEX_CLAUSE || subtype == NST_VTX_CLAUSE || subtype == NST_GDS_CLAUSE;
}
bool is_copy() { return subtype == NST_COPY; }
bool is_copy_mov() { return flags & NF_COPY_MOV; }
bool is_any_alu() { return is_alu_inst() || is_alu_packed() || is_copy(); }
bool is_fetch_inst() { return subtype == NST_FETCH_INST; }
bool is_cf_inst() { return subtype == NST_CF_INST; }
bool is_region() { return type == NT_REGION; }
bool is_depart() { return type == NT_DEPART; }
bool is_repeat() { return type == NT_REPEAT; }
bool is_if() { return type == NT_IF; }
bool is_bb() { return subtype == NST_BB; }
bool is_phi() { return subtype == NST_PHI; }
bool is_dead() { return flags & NF_DEAD; }
bool is_cf_op(unsigned op);
bool is_alu_op(unsigned op);
bool is_fetch_op(unsigned op);
unsigned cf_op_flags();
unsigned alu_op_flags();
unsigned alu_op_slot_flags();
unsigned fetch_op_flags();
bool is_mova();
bool is_pred_set();
bool vec_uses_ar(vvec &vv) {
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
value *v = *I;
if (v && v->rel && !v->rel->is_const())
return true;
}
return false;
}
bool uses_ar() {
return vec_uses_ar(dst) || vec_uses_ar(src);
}
bool vec_uses_lds_oq(vvec &vv) {
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
value *v = *I;
if (v && v->is_lds_oq())
return true;
}
return false;
}
bool consumes_lds_oq() {
return vec_uses_lds_oq(src);
}
bool produces_lds_oq() {
return vec_uses_lds_oq(dst);
}
region_node* get_parent_region();
friend class shader;
};
class container_node : public node {
public:
container_node(node_type nt = NT_LIST, node_subtype nst = NST_LIST,
node_flags flags = NF_EMPTY)
: node(nt, nst, flags | NF_CONTAINER), first(), last(),
live_after(), live_before() {}
// child items list
node *first, *last;
val_set live_after;
val_set live_before;
class iterator {
node *p;
public:
iterator(node *pp = NULL) : p(pp) {}
iterator & operator ++() { p = p->next; return *this;}
iterator & operator --() { p = p->prev; return *this;}
node* operator *() { return p; }
node* operator ->() { return p; }
const iterator advance(int n) {
if (!n) return *this;
iterator I(p);
if (n > 0) while (n--) ++I;
else while (n++) --I;
return I;
}
const iterator operator +(int n) { return advance(n); }
const iterator operator -(int n) { return advance(-n); }
bool operator !=(const iterator &i) { return p != i.p; }
bool operator ==(const iterator &i) { return p == i.p; }
};
class riterator {
iterator i;
public:
riterator(node *p = NULL) : i(p) {}
riterator & operator ++() { --i; return *this;}
riterator & operator --() { ++i; return *this;}
node* operator *() { return *i; }
node* operator ->() { return *i; }
bool operator !=(const riterator &r) { return i != r.i; }
bool operator ==(const riterator &r) { return i == r.i; }
};
iterator begin() { return first; }
iterator end() { return NULL; }
riterator rbegin() { return last; }
riterator rend() { return NULL; }
bool empty() { assert(first != NULL || first == last); return !first; }
unsigned count();
// used with node containers that represent scheduling queues
// ignores copies and takes into account alu_packed_node items
unsigned real_alu_count();
void push_back(node *n);
void push_front(node *n);
void insert_node_before(node *s, node *n);
void insert_node_after(node *s, node *n);
void append_from(container_node *c);
// remove range [b..e) from some container and assign to this container
void move(iterator b, iterator e);
void expand();
void expand(container_node *n);
void remove_node(node *n);
node *cut(iterator b, iterator e);
void clear() { first = last = NULL; }
virtual bool is_valid() { return true; }
virtual bool accept(vpass &p, bool enter);
virtual bool fold_dispatch(expr_handler *ex);
node* front() { return first; }
node* back() { return last; }
void collect_stats(node_stats &s);
friend class shader;
};
typedef container_node::iterator node_iterator;
typedef container_node::riterator node_riterator;
class alu_group_node : public container_node {
protected:
alu_group_node() : container_node(NT_LIST, NST_ALU_GROUP), literals() {}
public:
std::vector<literal> literals;
virtual bool is_valid() { return subtype == NST_ALU_GROUP; }
virtual bool accept(vpass &p, bool enter);
unsigned literal_chan(literal l) {
std::vector<literal>::iterator F =
std::find(literals.begin(), literals.end(), l);
assert(F != literals.end());
return F - literals.begin();
}
friend class shader;
};
class cf_node : public container_node {
protected:
cf_node() : container_node(NT_OP, NST_CF_INST), jump_target(),
jump_after_target() { memset(&bc, 0, sizeof(bc_cf)); };
public:
bc_cf bc;
cf_node *jump_target;
bool jump_after_target;
virtual bool is_valid() { return subtype == NST_CF_INST; }
virtual bool accept(vpass &p, bool enter);
virtual bool fold_dispatch(expr_handler *ex);
void jump(cf_node *c) { jump_target = c; jump_after_target = false; }
void jump_after(cf_node *c) { jump_target = c; jump_after_target = true; }
friend class shader;
};
class alu_node : public node {
protected:
alu_node() : node(NT_OP, NST_ALU_INST) { }
public:
bc_alu bc;
virtual bool is_valid() { return subtype == NST_ALU_INST; }
virtual bool accept(vpass &p, bool enter);
virtual bool fold_dispatch(expr_handler *ex);
unsigned forced_bank_swizzle() {
return ((bc.op_ptr->flags & AF_INTERP) &&
((bc.slot_flags == AF_4V) ||
(bc.slot_flags == AF_2V))) ? VEC_210 : 0;
}
// return param index + 1 if instruction references interpolation param,
// otherwise 0
unsigned interp_param();
alu_group_node *get_alu_group_node();
friend class shader;
};
// for multi-slot instrs - DOT/INTERP/... (maybe useful for 64bit pairs later)
class alu_packed_node : public container_node {
protected:
alu_packed_node() : container_node(NT_OP, NST_ALU_PACKED_INST) {}
public:
const alu_op_info* op_ptr() {
return static_cast<alu_node*>(first)->bc.op_ptr;
}
unsigned op() { return static_cast<alu_node*>(first)->bc.op; }
void init_args(bool repl);
virtual bool is_valid() { return subtype == NST_ALU_PACKED_INST; }
virtual bool accept(vpass &p, bool enter);
virtual bool fold_dispatch(expr_handler *ex);
unsigned get_slot_mask();
void update_packed_items(sb_context &ctx);
friend class shader;
};
class fetch_node : public node {
protected:
fetch_node() : node(NT_OP, NST_FETCH_INST) { memset(&bc, 0, sizeof(bc_fetch)); };
public:
bc_fetch bc;
virtual bool is_valid() { return subtype == NST_FETCH_INST; }
virtual bool accept(vpass &p, bool enter);
virtual bool fold_dispatch(expr_handler *ex);
bool uses_grad() { return bc.op_ptr->flags & FF_USEGRAD; }
friend class shader;
};
class region_node;
class repeat_node : public container_node {
protected:
repeat_node(region_node *target, unsigned id)
: container_node(NT_REPEAT, NST_LIST), target(target), rep_id(id) {}
public:
region_node *target;
unsigned rep_id;
virtual bool accept(vpass &p, bool enter);
friend class shader;
};
class depart_node : public container_node {
protected:
depart_node(region_node *target, unsigned id)
: container_node(NT_DEPART, NST_LIST), target(target), dep_id(id) {}
public:
region_node *target;
unsigned dep_id;
virtual bool accept(vpass &p, bool enter);
friend class shader;
};
class if_node : public container_node {
protected:
if_node() : container_node(NT_IF, NST_LIST), cond() {};
public:
value *cond; // glued to pseudo output (dst[2]) of the PRED_SETxxx
virtual bool accept(vpass &p, bool enter);
friend class shader;
};
typedef std::vector<depart_node*> depart_vec;
typedef std::vector<repeat_node*> repeat_vec;
class region_node : public container_node {
protected:
region_node(unsigned id) : container_node(NT_REGION, NST_LIST), region_id(id),
loop_phi(), phi(), vars_defined(), departs(), repeats(), src_loop()
{}
public:
unsigned region_id;
container_node *loop_phi;
container_node *phi;
val_set vars_defined;
depart_vec departs;
repeat_vec repeats;
// true if region was created for loop in the parser, sometimes repeat_node
// may be optimized away so we need to remember this information
bool src_loop;
virtual bool accept(vpass &p, bool enter);
unsigned dep_count() { return departs.size(); }
unsigned rep_count() { return repeats.size() + 1; }
bool is_loop() { return src_loop || !repeats.empty(); }
container_node* get_entry_code_location() {
node *p = first;
while (p && (p->is_depart() || p->is_repeat()))
p = static_cast<container_node*>(p)->first;
container_node *c = static_cast<container_node*>(p);
if (c->is_bb())
return c;
else
return c->parent;
}
void expand_depart(depart_node *d);
void expand_repeat(repeat_node *r);
friend class shader;
};
class bb_node : public container_node {
protected:
bb_node(unsigned id, unsigned loop_level)
: container_node(NT_LIST, NST_BB), id(id), loop_level(loop_level) {}
public:
unsigned id;
unsigned loop_level;
virtual bool accept(vpass &p, bool enter);
friend class shader;
};
typedef std::vector<region_node*> regions_vec;
typedef std::vector<bb_node*> bbs_vec;
typedef std::list<node*> sched_queue;
typedef sched_queue::iterator sq_iterator;
typedef std::vector<node*> node_vec;
typedef std::list<node*> node_list;
typedef std::set<node*> node_set;
} // namespace r600_sb
#endif /* R600_SB_IR_H_ */