aco: add notion of subdword registers to register allocator

To avoid having to split the register file into single bytes,
we maintain a map of the registers which contain subdword variables.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-By: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4002>
This commit is contained in:
Daniel Schürmann 2020-04-02 18:27:50 +01:00
parent 90811554da
commit aca2bbf975
1 changed files with 75 additions and 24 deletions

View File

@ -56,6 +56,7 @@ public:
RegisterFile() {regs.fill(0);}
std::array<uint32_t, 512> regs;
std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;
const uint32_t& operator [] (unsigned index) const {
return regs[index];
@ -72,10 +73,17 @@ public:
return res;
}
bool test(PhysReg start, unsigned size) {
for (unsigned i = 0; i < size; i++) {
if (regs[start + i])
/* Returns true if any byte in the range [start, start + num_bytes) is in use.
 *
 * A dword counts as in use when any of its low 28 bits is set (for example a
 * temp id, or the 0xFFFFFFFF written by block()). A dword holding the
 * 0xF0000000 marker is tracked per byte in subdword_regs; for such a dword
 * only the bytes that fall inside the requested range are inspected.
 */
bool test(PhysReg start, unsigned num_bytes) {
   /* one-past-the-end byte offset of the tested range */
   const unsigned end_byte = start.reg_b + num_bytes;
   for (PhysReg i = start; i.reg_b < end_byte; i = PhysReg(i + 1)) {
      if (regs[i] & 0x0FFFFFFF)
         return true;
      if (regs[i] == 0xF0000000) {
         /* Look the entry up once. Unlike operator[], find() never inserts
          * an entry into the map when the key is missing. */
         auto entry = subdword_regs.find(i);
         assert(entry != subdword_regs.end());
         if (entry == subdword_regs.end())
            continue;
         const std::array<uint32_t, 4>& sub = entry->second;
         for (unsigned j = i.byte(); i * 4 + j < end_byte && j < 4; j++) {
            if (sub[j])
               return true;
         }
      }
   }
   return false;
}
@ -85,24 +93,66 @@ public:
regs[start + i] = val;
}
void clear(PhysReg start, unsigned size) {
fill(start, size, 0);
/* Writes 'val' into every byte of [start, start + num_bytes).
 *
 * The covered dwords are first tagged with 0xF0000000 in regs, then the
 * per-byte values are stored in subdword_regs. A dword whose four bytes all
 * end up zero is dropped from the map again and its regs entry is reset to 0.
 */
void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) {
   fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);

   /* one-past-the-end byte offset of the written range */
   const unsigned end_byte = start.reg_b + num_bytes;
   for (PhysReg cur = start; cur.reg_b < end_byte; cur = PhysReg(cur + 1)) {
      const unsigned reg = cur;

      /* emplace() leaves an existing entry untouched and returns it */
      auto slot = subdword_regs.emplace(reg, std::array<uint32_t, 4>{0, 0, 0, 0}).first;
      std::array<uint32_t, 4>& bytes = slot->second;

      for (unsigned b = cur.byte(); b < 4 && reg * 4 + b < end_byte; b++)
         bytes[b] = val;

      /* nothing left in this dword: stop tracking it */
      if ((bytes[0] | bytes[1] | bytes[2] | bytes[3]) == 0) {
         subdword_regs.erase(reg);
         regs[reg] = 0;
      }
   }
}
/* Marks [start, start + num_bytes) with the value 0xFFFFFFFF.
 *
 * Ranges that start on a dword boundary and cover whole dwords are written
 * directly into regs; everything else goes through the per-byte path.
 */
void block(PhysReg start, unsigned num_bytes) {
   const bool whole_dwords = start.byte() == 0 && num_bytes % 4 == 0;
   if (whole_dwords) {
      fill(start, num_bytes / 4, 0xFFFFFFFF);
      return;
   }
   fill_subdword(start, num_bytes, 0xFFFFFFFF);
}
/* Returns true if the dword at 'start' holds 0xFFFFFFFF, or, when the dword
 * carries the 0xF0000000 subdword marker, if any byte from start.byte() up to
 * the end of that dword holds 0xFFFFFFFF.
 */
bool is_blocked(PhysReg start) {
   if (regs[start] == 0xFFFFFFFF)
      return true;
   if (regs[start] != 0xF0000000)
      return false;

   /* A pure query must not modify the map: find() does a single lookup and,
    * unlike operator[], never default-inserts a missing entry. */
   auto entry = subdword_regs.find(start);
   if (entry == subdword_regs.end())
      return false;

   for (unsigned i = start.byte(); i < 4; i++) {
      if (entry->second[i] == 0xFFFFFFFF)
         return true;
   }
   return false;
}
/* Resets the registers occupied by a variable of class 'rc' located at
 * 'start' to 0, using the per-byte path for subdword register classes.
 */
void clear(PhysReg start, RegClass rc) {
   if (!rc.is_subdword()) {
      fill(start, rc.size(), 0);
      return;
   }
   fill_subdword(start, rc.bytes(), 0);
}
void fill(Operand op) {
fill(op.physReg(), op.size(), op.tempId());
if (op.regClass().is_subdword())
fill_subdword(op.physReg(), op.bytes(), op.tempId());
else
fill(op.physReg(), op.size(), op.tempId());
}
void clear(Operand op) {
fill(op.physReg(), op.size(), 0);
clear(op.physReg(), op.regClass());
}
void fill(Definition def) {
fill(def.physReg(), def.size(), def.tempId());
if (def.regClass().is_subdword())
fill_subdword(def.physReg(), def.bytes(), def.tempId());
else
fill(def.physReg(), def.size(), def.tempId());
}
void clear(Definition def) {
fill(def.physReg(), def.size(), 0);
clear(def.physReg(), def.regClass());
}
};
@ -212,8 +262,8 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
// FIXME: if a definition got moved, change the target location and remove the parallelcopy
copy.second.setTemp(Temp(ctx.program->allocateId(), copy.second.regClass()));
ctx.assignments[copy.second.tempId()] = {copy.second.physReg(), copy.second.regClass()};
for (unsigned i = copy.second.physReg().reg(); i < copy.second.physReg() + copy.second.size(); i++)
reg_file[i] = copy.second.tempId();
reg_file.fill(copy.second);
/* check if we moved an operand */
for (Operand& op : instr->operands) {
if (!op.isTemp())
@ -365,7 +415,8 @@ bool get_regs_for_copies(ra_ctx& ctx,
if (res.second) {
/* mark the area as blocked */
reg_file.fill(res.first, size, 0xFFFFFFFF);
reg_file.block(res.first, var.second.bytes());
/* create parallelcopy pair (without definition id) */
Temp tmp = Temp(id, var.second);
Operand pc_op = Operand(tmp);
@ -397,8 +448,7 @@ bool get_regs_for_copies(ra_ctx& ctx,
if (reg_file[j] == 0 || reg_file[j] == last_var)
continue;
/* 0xFFFFFFFF signals that this area is already blocked! */
if (reg_file[j] == 0xFFFFFFFF || k > num_moves) {
if (reg_file.is_blocked(PhysReg{j}) || k > num_moves) {
found = false;
break;
}
@ -449,12 +499,12 @@ bool get_regs_for_copies(ra_ctx& ctx,
unsigned size = ctx.assignments[reg_file[j]].second.size();
unsigned id = reg_file[j];
new_vars.emplace(size, id);
reg_file.clear(ctx.assignments[id].first, size);
reg_file.clear(ctx.assignments[id].first, ctx.assignments[id].second);
}
}
/* mark the area as blocked */
reg_file.fill(PhysReg{reg_lo}, size, 0xFFFFFFFF);
reg_file.block(PhysReg{reg_lo}, size * 4);
if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
return false;
@ -492,8 +542,8 @@ std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
instr->operands[j].physReg() >= lb &&
instr->operands[j].physReg() < ub) {
assert(instr->operands[j].isFixed());
assert(reg_file[instr->operands[j].physReg()] == 0);
reg_file.fill(instr->operands[j].physReg(), instr->operands[j].size(), 0xFFFFFFFF);
assert(!reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes()));
reg_file.block(instr->operands[j].physReg(), instr->operands[j].bytes());
killed_ops += instr->operands[j].getTemp().size();
}
}
@ -533,7 +583,7 @@ std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
continue;
/* dead operands effectively reduce the number of estimated moves */
if (remaining_op_moves && reg_file[j] == 0xFFFFFFFF) {
if (remaining_op_moves && reg_file.is_blocked(PhysReg{j})) {
k--;
remaining_op_moves--;
continue;
@ -591,7 +641,7 @@ std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
for (unsigned j = best_pos; j < best_pos + size; j++) {
if (reg_file[j] != 0xFFFFFFFF && reg_file[j] != 0)
vars.emplace(ctx.assignments[reg_file[j]].second.size(), reg_file[j]);
reg_file[j] = 0;
reg_file.clear(ctx.assignments[reg_file[j]].first, ctx.assignments[reg_file[j]].second);
}
if (instr->opcode == aco_opcode::p_create_vector) {
@ -637,7 +687,7 @@ std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
/* we set the definition regs == 0. the actual caller is responsible for correct setting */
reg_file.clear(PhysReg{best_pos}, size);
reg_file.clear(PhysReg{best_pos}, rc);
update_renames(ctx, reg_file, parallelcopies, instr);
@ -884,8 +934,9 @@ bool get_reg_specified(ra_ctx& ctx,
if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
return false;
if (reg_file.test(reg, size))
if (reg_file.test(reg, rc.bytes()))
return false;
adjust_max_used_regs(ctx, rc, reg_lo);
return true;
}
@ -1251,7 +1302,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
continue;
assert(definition.physReg() == exec);
assert(!register_file.test(definition.physReg(), definition.size()));
assert(!register_file.test(definition.physReg(), definition.bytes()));
register_file.fill(definition);
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
}
@ -1283,7 +1334,7 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
continue;
}
/* only assign if register is still free */
if (!register_file.test(reg, definition.size())) {
if (!register_file.test(reg, definition.bytes())) {
definition.setFixed(reg);
register_file.fill(definition);
ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};