aco: use non-sequential addressing
fossil-db (GFX10.3):
Totals from 70493 (50.57% of 139391) affected shaders:
SGPRs: 4232624 -> 4231808 (-0.02%); split: -0.09%, +0.07%
VGPRs: 2831772 -> 2764740 (-2.37%); split: -2.53%, +0.17%
CodeSize: 225584412 -> 225048740 (-0.24%); split: -0.44%, +0.21%
MaxWaves: 875319 -> 878837 (+0.40%); split: +0.44%, -0.04%
Instrs: 43157803 -> 42496421 (-1.53%); split: -1.54%, +0.01%
Cycles: 1656380132 -> 1641532056 (-0.90%); split: -0.94%, +0.04%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8523>
This commit is contained in:
parent
faf3e9a27f
commit
c353895c92
|
@ -428,6 +428,15 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case Format::MIMG: {
|
case Format::MIMG: {
|
||||||
|
unsigned use_nsa = false;
|
||||||
|
unsigned addr_dwords = instr->operands.size() - 3;
|
||||||
|
for (unsigned i = 1; i < addr_dwords; i++) {
|
||||||
|
if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
|
||||||
|
use_nsa = true;
|
||||||
|
}
|
||||||
|
assert(!use_nsa || ctx.chip_class >= GFX10);
|
||||||
|
unsigned nsa_dwords = use_nsa ? DIV_ROUND_UP(addr_dwords - 1, 4) : 0;
|
||||||
|
|
||||||
MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
|
MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
|
||||||
uint32_t encoding = (0b111100 << 26);
|
uint32_t encoding = (0b111100 << 26);
|
||||||
encoding |= mimg->slc ? 1 << 25 : 0;
|
encoding |= mimg->slc ? 1 << 25 : 0;
|
||||||
|
@ -443,6 +452,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
||||||
encoding |= mimg->da ? 1 << 14 : 0;
|
encoding |= mimg->da ? 1 << 14 : 0;
|
||||||
} else {
|
} else {
|
||||||
encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
|
encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
|
||||||
|
encoding |= nsa_dwords << 1;
|
||||||
encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */
|
encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */
|
||||||
encoding |= mimg->dlc ? 1 << 7 : 0;
|
encoding |= mimg->dlc ? 1 << 7 : 0;
|
||||||
}
|
}
|
||||||
|
@ -465,6 +475,13 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
||||||
}
|
}
|
||||||
|
|
||||||
out.push_back(encoding);
|
out.push_back(encoding);
|
||||||
|
|
||||||
|
if (nsa_dwords) {
|
||||||
|
out.resize(out.size() + nsa_dwords);
|
||||||
|
std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);
|
||||||
|
for (unsigned i = 0; i < addr_dwords - 1; i++)
|
||||||
|
nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case Format::FLAT:
|
case Format::FLAT:
|
||||||
|
|
|
@ -5841,10 +5841,11 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op,
|
||||||
Definition dst,
|
Definition dst,
|
||||||
Temp rsrc,
|
Temp rsrc,
|
||||||
Operand samp,
|
Operand samp,
|
||||||
const std::vector<Temp>& coords,
|
std::vector<Temp> coords,
|
||||||
unsigned num_wqm_coords=0,
|
unsigned num_wqm_coords=0,
|
||||||
Operand vdata=Operand(v1))
|
Operand vdata=Operand(v1))
|
||||||
{
|
{
|
||||||
|
if (bld.program->chip_class < GFX10) {
|
||||||
Temp coord = coords[0];
|
Temp coord = coords[0];
|
||||||
if (coords.size() > 1) {
|
if (coords.size() > 1) {
|
||||||
coord = bld.tmp(RegType::vgpr, coords.size());
|
coord = bld.tmp(RegType::vgpr, coords.size());
|
||||||
|
@ -5865,14 +5866,27 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op,
|
||||||
coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
|
coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
coords[0] = coord;
|
||||||
|
coords.resize(1);
|
||||||
|
} else {
|
||||||
|
for (unsigned i = 0; i < num_wqm_coords; i++)
|
||||||
|
coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
|
||||||
|
|
||||||
|
for (Temp& coord : coords) {
|
||||||
|
if (coord.type() == RegType::sgpr)
|
||||||
|
coord = bld.copy(bld.def(v1), coord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(
|
aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(
|
||||||
op, Format::MIMG, 4, dst.isTemp())};
|
op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
|
||||||
if (dst.isTemp())
|
if (dst.isTemp())
|
||||||
mimg->definitions[0] = dst;
|
mimg->definitions[0] = dst;
|
||||||
mimg->operands[0] = Operand(rsrc);
|
mimg->operands[0] = Operand(rsrc);
|
||||||
mimg->operands[1] = samp;
|
mimg->operands[1] = samp;
|
||||||
mimg->operands[2] = vdata;
|
mimg->operands[2] = vdata;
|
||||||
mimg->operands[3] = Operand(coord);
|
for (unsigned i = 0; i < coords.size(); i++)
|
||||||
|
mimg->operands[3 + i] = Operand(coords[i]);
|
||||||
|
|
||||||
MIMG_instruction *res = mimg.get();
|
MIMG_instruction *res = mimg.get();
|
||||||
bld.insert(std::move(mimg));
|
bld.insert(std::move(mimg));
|
||||||
|
|
|
@ -436,7 +436,7 @@ bool validate_ir(Program* program)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case Format::MIMG: {
|
case Format::MIMG: {
|
||||||
check(instr->operands.size() == 4, "MIMG instructions must have 4 operands", instr.get());
|
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
|
||||||
check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
|
check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
|
||||||
"MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
|
"MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
|
||||||
if (instr->operands[1].hasRegClass())
|
if (instr->operands[1].hasRegClass())
|
||||||
|
@ -447,8 +447,15 @@ bool validate_ir(Program* program)
|
||||||
check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
|
check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
|
||||||
"MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
|
"MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
|
||||||
}
|
}
|
||||||
check(instr->operands[3].hasRegClass() && instr->operands[3].regClass().type() == RegType::vgpr,
|
check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
|
||||||
|
for (unsigned i = 3; i < instr->operands.size(); i++) {
|
||||||
|
if (instr->operands.size() == 4) {
|
||||||
|
check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
|
||||||
"MIMG operands[3] (VADDR) must be VGPR", instr.get());
|
"MIMG operands[3] (VADDR) must be VGPR", instr.get());
|
||||||
|
} else {
|
||||||
|
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
|
check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
|
||||||
"MIMG definitions[0] (VDATA) must be VGPR", instr.get());
|
"MIMG definitions[0] (VDATA) must be VGPR", instr.get());
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -149,18 +149,18 @@ BEGIN_TEST(isel.sparse.clause)
|
||||||
};
|
};
|
||||||
void main() {
|
void main() {
|
||||||
//>> v5: (noCSE)%zero0 = p_create_vector 0, 0, 0, 0, 0
|
//>> v5: (noCSE)%zero0 = p_create_vector 0, 0, 0, 0, 0
|
||||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
//>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||||
//>> v5: (noCSE)%zero1 = p_create_vector 0, 0, 0, 0, 0
|
//>> v5: (noCSE)%zero1 = p_create_vector 0, 0, 0, 0, 0
|
||||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
//>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||||
//>> v5: (noCSE)%zero2 = p_create_vector 0, 0, 0, 0, 0
|
//>> v5: (noCSE)%zero2 = p_create_vector 0, 0, 0, 0, 0
|
||||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
//>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||||
//>> v5: (noCSE)%zero3 = p_create_vector 0, 0, 0, 0, 0
|
//>> v5: (noCSE)%zero3 = p_create_vector 0, 0, 0, 0, 0
|
||||||
//>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
//>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
|
||||||
//>> s_clause 0x3
|
//>> s_clause 0x3
|
||||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||||
//! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
//! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
|
||||||
code[0] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(1, 0), res[0]);
|
code[0] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(1, 0), res[0]);
|
||||||
code[1] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(2, 0), res[1]);
|
code[1] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(2, 0), res[1]);
|
||||||
code[2] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(3, 0), res[2]);
|
code[2] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(3, 0), res[2]);
|
||||||
|
|
Loading…
Reference in New Issue