aco: use non-sequential addressing
fossil-db (GFX10.3):
Totals from 70493 (50.57% of 139391) affected shaders:
SGPRs: 4232624 -> 4231808 (-0.02%); split: -0.09%, +0.07%
VGPRs: 2831772 -> 2764740 (-2.37%); split: -2.53%, +0.17%
CodeSize: 225584412 -> 225048740 (-0.24%); split: -0.44%, +0.21%
MaxWaves: 875319 -> 878837 (+0.40%); split: +0.44%, -0.04%
Instrs: 43157803 -> 42496421 (-1.53%); split: -1.54%, +0.01%
Cycles: 1656380132 -> 1641532056 (-0.90%); split: -0.94%, +0.04%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8523>
This commit is contained in:
parent
faf3e9a27f
commit
c353895c92
|
@ -428,6 +428,15 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
break;
|
||||
}
|
||||
case Format::MIMG: {
|
||||
unsigned use_nsa = false;
|
||||
unsigned addr_dwords = instr->operands.size() - 3;
|
||||
for (unsigned i = 1; i < addr_dwords; i++) {
|
||||
if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
|
||||
use_nsa = true;
|
||||
}
|
||||
assert(!use_nsa || ctx.chip_class >= GFX10);
|
||||
unsigned nsa_dwords = use_nsa ? DIV_ROUND_UP(addr_dwords - 1, 4) : 0;
|
||||
|
||||
MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
|
||||
uint32_t encoding = (0b111100 << 26);
|
||||
encoding |= mimg->slc ? 1 << 25 : 0;
|
||||
|
@ -443,6 +452,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
encoding |= mimg->da ? 1 << 14 : 0;
|
||||
} else {
|
||||
encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
|
||||
encoding |= nsa_dwords << 1;
|
||||
encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */
|
||||
encoding |= mimg->dlc ? 1 << 7 : 0;
|
||||
}
|
||||
|
@ -465,6 +475,13 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
}
|
||||
|
||||
out.push_back(encoding);
|
||||
|
||||
if (nsa_dwords) {
|
||||
out.resize(out.size() + nsa_dwords);
|
||||
std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);
|
||||
for (unsigned i = 0; i < addr_dwords - 1; i++)
|
||||
nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Format::FLAT:
|
||||
|
|
|
@ -5841,38 +5841,52 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op,
|
|||
Definition dst,
|
||||
Temp rsrc,
|
||||
Operand samp,
|
||||
const std::vector<Temp>& coords,
|
||||
std::vector<Temp> coords,
|
||||
unsigned num_wqm_coords=0,
|
||||
Operand vdata=Operand(v1))
|
||||
{
|
||||
Temp coord = coords[0];
|
||||
if (coords.size() > 1) {
|
||||
coord = bld.tmp(RegType::vgpr, coords.size());
|
||||
if (bld.program->chip_class < GFX10) {
|
||||
Temp coord = coords[0];
|
||||
if (coords.size() > 1) {
|
||||
coord = bld.tmp(RegType::vgpr, coords.size());
|
||||
|
||||
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
|
||||
for (unsigned i = 0; i < coords.size(); i++)
|
||||
vec->operands[i] = Operand(coords[i]);
|
||||
vec->definitions[0] = Definition(coord);
|
||||
bld.insert(std::move(vec));
|
||||
} else if (coord.type() == RegType::sgpr) {
|
||||
coord = bld.copy(bld.def(v1), coord);
|
||||
}
|
||||
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
|
||||
for (unsigned i = 0; i < coords.size(); i++)
|
||||
vec->operands[i] = Operand(coords[i]);
|
||||
vec->definitions[0] = Definition(coord);
|
||||
bld.insert(std::move(vec));
|
||||
} else if (coord.type() == RegType::sgpr) {
|
||||
coord = bld.copy(bld.def(v1), coord);
|
||||
}
|
||||
|
||||
if (num_wqm_coords) {
|
||||
/* We don't need the bias, sample index, compare value or offset to be
|
||||
* computed in WQM but if the p_create_vector copies the coordinates, then it
|
||||
* needs to be in WQM. */
|
||||
coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
|
||||
if (num_wqm_coords) {
|
||||
/* We don't need the bias, sample index, compare value or offset to be
|
||||
* computed in WQM but if the p_create_vector copies the coordinates, then it
|
||||
* needs to be in WQM. */
|
||||
coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
|
||||
}
|
||||
|
||||
coords[0] = coord;
|
||||
coords.resize(1);
|
||||
} else {
|
||||
for (unsigned i = 0; i < num_wqm_coords; i++)
|
||||
coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
|
||||
|
||||
for (Temp& coord : coords) {
|
||||
if (coord.type() == RegType::sgpr)
|
||||
coord = bld.copy(bld.def(v1), coord);
|
||||
}
|
||||
}
|
||||
|
||||
aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(
|
||||
op, Format::MIMG, 4, dst.isTemp())};
|
||||
op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
|
||||
if (dst.isTemp())
|
||||
mimg->definitions[0] = dst;
|
||||
mimg->operands[0] = Operand(rsrc);
|
||||
mimg->operands[1] = samp;
|
||||
mimg->operands[2] = vdata;
|
||||
mimg->operands[3] = Operand(coord);
|
||||
for (unsigned i = 0; i < coords.size(); i++)
|
||||
mimg->operands[3 + i] = Operand(coords[i]);
|
||||
|
||||
MIMG_instruction *res = mimg.get();
|
||||
bld.insert(std::move(mimg));
|
||||
|
|
|
@ -436,7 +436,7 @@ bool validate_ir(Program* program)
|
|||
break;
|
||||
}
|
||||
case Format::MIMG: {
|
||||
check(instr->operands.size() == 4, "MIMG instructions must have 4 operands", instr.get());
|
||||
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
|
||||
check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
|
||||
"MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
|
||||
if (instr->operands[1].hasRegClass())
|
||||
|
@ -447,8 +447,15 @@ bool validate_ir(Program* program)
|
|||
check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
|
||||
"MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
|
||||
}
|
||||
check(instr->operands[3].hasRegClass() && instr->operands[3].regClass().type() == RegType::vgpr,
|
||||
"MIMG operands[3] (VADDR) must be VGPR", instr.get());
|
||||
check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
|
||||
for (unsigned i = 3; i < instr->operands.size(); i++) {
|
||||
if (instr->operands.size() == 4) {
|
||||
check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
|
||||
"MIMG operands[3] (VADDR) must be VGPR", instr.get());
|
||||
} else {
|
||||
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
|
||||
}
|
||||
}
|
||||
check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
|
||||
"MIMG definitions[0] (VDATA) must be VGPR", instr.get());
|
||||
break;
|
||||
|
|
|
@ -149,18 +149,18 @@ BEGIN_TEST(isel.sparse.clause)
|
|||
};
|
||||
void main() {
|
||||
//>> v5: (noCSE)%zero0 = p_create_vector 0, 0, 0, 0, 0
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> v5: (noCSE)%zero1 = p_create_vector 0, 0, 0, 0, 0
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> v5: (noCSE)%zero2 = p_create_vector 0, 0, 0, 0, 0
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> v5: (noCSE)%zero3 = p_create_vector 0, 0, 0, 0, 0
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||
//>> s_clause 0x3
|
||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||
code[0] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(1, 0), res[0]);
|
||||
code[1] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(2, 0), res[1]);
|
||||
code[2] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(3, 0), res[2]);
|
||||
|
|
Loading…
Reference in New Issue