radv,aco: lower image descriptor loads in NIR

fossil-db (Sienna Cichlid):
Totals from 2926 (1.80% of 162293) affected shaders:
Instrs: 2315110 -> 2306644 (-0.37%); split: -0.37%, +0.00%
CodeSize: 12581592 -> 12546588 (-0.28%); split: -0.28%, +0.00%
VGPRs: 130216 -> 130208 (-0.01%)
SpillSGPRs: 477 -> 474 (-0.63%); split: -5.03%, +4.40%
Latency: 29686188 -> 29678804 (-0.02%); split: -0.05%, +0.02%
InvThroughput: 6926545 -> 6926286 (-0.00%); split: -0.02%, +0.02%
SClause: 73761 -> 72996 (-1.04%); split: -1.16%, +0.12%
Copies: 144068 -> 137279 (-4.71%); split: -4.78%, +0.07%
Branches: 47466 -> 47483 (+0.04%); split: -0.01%, +0.04%
PreSGPRs: 118042 -> 117377 (-0.56%); split: -1.34%, +0.77%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12773>
This commit is contained in:
Rhys Perry 2020-10-21 18:12:35 +01:00 committed by Marge Bot
parent 15640e58d9
commit 543a5487cf
5 changed files with 119 additions and 291 deletions

View File

@ -5388,21 +5388,6 @@ visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
emit_split_vector(ctx, tess_coord, 3);
}
Temp
load_desc_ptr(isel_context* ctx, unsigned desc_set)
{
const struct radv_userdata_locations *user_sgprs_locs = &ctx->program->info->user_sgprs_locs;
if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
Builder bld(ctx->program, ctx->block);
Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));
return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off); //, false, false, false);
}
return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
}
void
load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
@ -5581,16 +5566,6 @@ visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
}
enum aco_descriptor_type {
ACO_DESC_IMAGE,
ACO_DESC_FMASK,
ACO_DESC_SAMPLER,
ACO_DESC_BUFFER,
ACO_DESC_PLANE_0,
ACO_DESC_PLANE_1,
ACO_DESC_PLANE_2,
};
static bool
should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
{
@ -5601,189 +5576,6 @@ should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool
dim == ac_image_2darraymsaa;
}
Temp
get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,
enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)
{
/* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type <<
32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second;
*/
Temp index = Temp();
bool index_set = false;
unsigned constant_index = 0;
unsigned descriptor_set;
unsigned base_index;
Builder bld(ctx->program, ctx->block);
if (!deref_instr) {
assert(tex_instr);
descriptor_set = 0;
base_index = tex_instr->sampler_index;
} else {
while (deref_instr->deref_type != nir_deref_type_var) {
unsigned array_size = glsl_get_aoa_size(deref_instr->type);
if (!array_size)
array_size = 1;
assert(deref_instr->deref_type == nir_deref_type_array);
nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);
if (const_value) {
constant_index += array_size * const_value->u32;
} else {
Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
if (indirect.type() == RegType::vgpr)
indirect = bld.as_uniform(indirect);
if (array_size != 1)
indirect =
bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);
if (!index_set) {
index = indirect;
index_set = true;
} else {
index =
bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
}
}
deref_instr = nir_src_as_deref(deref_instr->parent);
}
descriptor_set = deref_instr->var->data.descriptor_set;
base_index = deref_instr->var->data.binding;
}
Temp list = load_desc_ptr(ctx, descriptor_set);
list = convert_pointer_to_64_bit(ctx, list);
struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;
struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;
unsigned offset = binding->offset;
unsigned stride = binding->size;
aco_opcode opcode;
RegClass type;
assert(base_index < layout->binding_count);
switch (desc_type) {
case ACO_DESC_IMAGE:
type = s8;
opcode = aco_opcode::s_load_dwordx8;
break;
case ACO_DESC_FMASK:
type = s8;
opcode = aco_opcode::s_load_dwordx8;
offset += 32;
break;
case ACO_DESC_SAMPLER:
type = s4;
opcode = aco_opcode::s_load_dwordx4;
if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
offset += radv_combined_image_descriptor_sampler_offset(binding);
break;
case ACO_DESC_BUFFER:
type = s4;
opcode = aco_opcode::s_load_dwordx4;
break;
case ACO_DESC_PLANE_0:
case ACO_DESC_PLANE_1:
type = s8;
opcode = aco_opcode::s_load_dwordx8;
offset += 32 * (desc_type - ACO_DESC_PLANE_0);
break;
case ACO_DESC_PLANE_2:
type = s4;
opcode = aco_opcode::s_load_dwordx4;
offset += 64;
break;
default: unreachable("invalid desc_type\n");
}
offset += constant_index * stride;
if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
(!index_set || binding->immutable_samplers_equal)) {
if (binding->immutable_samplers_equal)
constant_index = 0;
const uint32_t* samplers = radv_immutable_samplers(layout, binding);
uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;
return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),
Operand::c32(samplers[constant_index * 4 + 1]),
Operand::c32(samplers[constant_index * 4 + 2]),
Operand::c32(samplers[constant_index * 4 + 3]));
}
Operand off;
if (!index_set) {
off = bld.copy(bld.def(s1), Operand::c32(offset));
} else {
off = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);
if (offset)
off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),
off);
}
Temp res = bld.smem(opcode, bld.def(type), list, off);
if (desc_type == ACO_DESC_PLANE_2) {
Temp components[8];
for (unsigned i = 0; i < 8; i++)
components[i] = bld.tmp(s1);
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
Definition(components[2]), Definition(components[3]), res);
Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);
bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
Definition(components[4]), Definition(components[5]), Definition(components[6]),
Definition(components[7]), desc2);
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
components[2], components[3], components[4], components[5], components[6],
components[7]);
} else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&
!write) {
Temp components[8];
for (unsigned i = 0; i < 8; i++)
components[i] = bld.tmp(s1);
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
Definition(components[2]), Definition(components[3]), Definition(components[4]),
Definition(components[5]), Definition(components[6]), Definition(components[7]),
res);
/* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a
* hardware bug.
*/
components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],
bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
components[2], components[3], components[4], components[5], components[6],
components[7]);
} else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {
Temp components[4];
for (unsigned i = 0; i < 4; i++)
components[i] = bld.tmp(s1);
bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
Definition(components[2]), Definition(components[3]), res);
/* We want to always use the linear filtering truncation behaviour for
* nir_texop_tg4, even if the sampler uses nearest/point filtering.
*/
components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],
Operand::c32(C_008F30_TRUNC_COORD));
res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],
components[2], components[3]);
}
return res;
}
static int
image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
{
@ -5925,10 +5717,10 @@ get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
coords[i] = emit_extract_vector(ctx, src0, i, v1);
}
if (instr->intrinsic == nir_intrinsic_image_deref_load ||
instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
instr->intrinsic == nir_intrinsic_image_deref_store) {
int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;
if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
instr->intrinsic == nir_intrinsic_bindless_image_store) {
int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
bool level_zero =
nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
@ -5980,15 +5772,13 @@ void
visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
{
Builder bld(ctx->program, ctx->block);
const nir_variable* var =
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
bool is_array = nir_intrinsic_image_array(instr);
bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
unsigned access = var->data.access | nir_intrinsic_access(instr);
unsigned access = nir_intrinsic_access(instr);
unsigned result_size = instr->dest.ssa.num_components - is_sparse;
unsigned expand_mask =
@ -6012,9 +5802,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
else
tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
nullptr, false);
Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
@ -6075,8 +5863,7 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
void
visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
{
const nir_variable* var =
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
Builder bld(ctx->program, ctx->block);
const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
bool is_array = nir_intrinsic_image_array(instr);
Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
@ -6087,15 +5874,14 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
data = as_vgpr(ctx, data);
memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
unsigned access = var->data.access | nir_intrinsic_access(instr);
unsigned access = nir_intrinsic_access(instr);
bool glc = ctx->options->chip_class == GFX6 ||
access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)
? 1
: 0;
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
ACO_DESC_BUFFER, nullptr, true);
Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
aco_opcode opcode;
switch (data.size()) {
@ -6123,13 +5909,11 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
assert(data.type() == RegType::vgpr);
std::vector<Temp> coords = get_image_coords(ctx, instr);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
ACO_DESC_IMAGE, nullptr, true);
Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
Builder bld(ctx->program, ctx->block);
MIMG_instruction* store =
emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));
store->glc = glc;
@ -6156,75 +5940,75 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
bool is_64bit = data.bytes() == 8;
assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
if (instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap)
data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
get_ssa_temp(ctx, instr->src[4].ssa), data);
aco_opcode buf_op, buf_op64, image_op;
switch (instr->intrinsic) {
case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_bindless_image_atomic_add:
buf_op = aco_opcode::buffer_atomic_add;
buf_op64 = aco_opcode::buffer_atomic_add_x2;
image_op = aco_opcode::image_atomic_add;
break;
case nir_intrinsic_image_deref_atomic_umin:
case nir_intrinsic_bindless_image_atomic_umin:
buf_op = aco_opcode::buffer_atomic_umin;
buf_op64 = aco_opcode::buffer_atomic_umin_x2;
image_op = aco_opcode::image_atomic_umin;
break;
case nir_intrinsic_image_deref_atomic_imin:
case nir_intrinsic_bindless_image_atomic_imin:
buf_op = aco_opcode::buffer_atomic_smin;
buf_op64 = aco_opcode::buffer_atomic_smin_x2;
image_op = aco_opcode::image_atomic_smin;
break;
case nir_intrinsic_image_deref_atomic_umax:
case nir_intrinsic_bindless_image_atomic_umax:
buf_op = aco_opcode::buffer_atomic_umax;
buf_op64 = aco_opcode::buffer_atomic_umax_x2;
image_op = aco_opcode::image_atomic_umax;
break;
case nir_intrinsic_image_deref_atomic_imax:
case nir_intrinsic_bindless_image_atomic_imax:
buf_op = aco_opcode::buffer_atomic_smax;
buf_op64 = aco_opcode::buffer_atomic_smax_x2;
image_op = aco_opcode::image_atomic_smax;
break;
case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_bindless_image_atomic_and:
buf_op = aco_opcode::buffer_atomic_and;
buf_op64 = aco_opcode::buffer_atomic_and_x2;
image_op = aco_opcode::image_atomic_and;
break;
case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_bindless_image_atomic_or:
buf_op = aco_opcode::buffer_atomic_or;
buf_op64 = aco_opcode::buffer_atomic_or_x2;
image_op = aco_opcode::image_atomic_or;
break;
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_bindless_image_atomic_xor:
buf_op = aco_opcode::buffer_atomic_xor;
buf_op64 = aco_opcode::buffer_atomic_xor_x2;
image_op = aco_opcode::image_atomic_xor;
break;
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_bindless_image_atomic_exchange:
buf_op = aco_opcode::buffer_atomic_swap;
buf_op64 = aco_opcode::buffer_atomic_swap_x2;
image_op = aco_opcode::image_atomic_swap;
break;
case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_comp_swap:
buf_op = aco_opcode::buffer_atomic_cmpswap;
buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
image_op = aco_opcode::image_atomic_cmpswap;
break;
case nir_intrinsic_image_deref_atomic_fmin:
case nir_intrinsic_bindless_image_atomic_fmin:
buf_op = aco_opcode::buffer_atomic_fmin;
buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
image_op = aco_opcode::image_atomic_fmin;
break;
case nir_intrinsic_image_deref_atomic_fmax:
case nir_intrinsic_bindless_image_atomic_fmax:
buf_op = aco_opcode::buffer_atomic_fmax;
buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
image_op = aco_opcode::image_atomic_fmax;
break;
default:
unreachable("visit_image_atomic should only be called with "
"nir_intrinsic_image_deref_atomic_* instructions.");
"nir_intrinsic_bindless_image_atomic_* instructions.");
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
@ -6232,8 +6016,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
ACO_DESC_BUFFER, nullptr, true);
Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
// assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet
// implemented.");
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
@ -6256,8 +6039,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
}
std::vector<Temp> coords = get_image_coords(ctx, instr);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
ACO_DESC_IMAGE, nullptr, true);
Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
Definition def = return_previous ? Definition(dst) : Definition();
MIMG_instruction* mimg =
emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));
@ -6314,8 +6096,7 @@ visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
Builder bld(ctx->program, ctx->block);
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
ACO_DESC_BUFFER, NULL, false);
Temp desc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
}
@ -6324,8 +6105,7 @@ visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};
/* Resource */
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
ACO_DESC_IMAGE, NULL, false);
Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
@ -6377,8 +6157,7 @@ visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
{
Builder bld(ctx->program, ctx->block);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
ACO_DESC_IMAGE, NULL, false);
Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
get_image_samples(ctx, Definition(dst), resource);
}
@ -8006,23 +7785,23 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
case nir_intrinsic_shared_atomic_fadd:
case nir_intrinsic_shared_atomic_fmin:
case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break;
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;
case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;
case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_image_deref_atomic_umin:
case nir_intrinsic_image_deref_atomic_imin:
case nir_intrinsic_image_deref_atomic_umax:
case nir_intrinsic_image_deref_atomic_imax:
case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_image_deref_atomic_fmin:
case nir_intrinsic_image_deref_atomic_fmax: visit_image_atomic(ctx, instr); break;
case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;
case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
case nir_intrinsic_bindless_image_atomic_add:
case nir_intrinsic_bindless_image_atomic_umin:
case nir_intrinsic_bindless_image_atomic_imin:
case nir_intrinsic_bindless_image_atomic_umax:
case nir_intrinsic_bindless_image_atomic_imax:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
case nir_intrinsic_bindless_image_atomic_exchange:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_fmin:
case nir_intrinsic_bindless_image_atomic_fmax: visit_image_atomic(ctx, instr); break;
case nir_intrinsic_bindless_image_size: visit_image_size(ctx, instr); break;
case nir_intrinsic_bindless_image_samples: visit_image_samples(ctx, instr); break;
case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
case nir_intrinsic_load_global_constant:

View File

@ -616,7 +616,7 @@ init_context(isel_context* ctx, nir_shader* shader)
case nir_intrinsic_load_ring_tess_offchip_offset_amd:
case nir_intrinsic_load_ring_esgs_amd:
case nir_intrinsic_load_ring_es2gs_offset_amd:
case nir_intrinsic_image_deref_samples:
case nir_intrinsic_bindless_image_samples:
case nir_intrinsic_has_input_vertex_amd:
case nir_intrinsic_has_input_primitive_amd:
case nir_intrinsic_load_workgroup_num_input_vertices_amd:
@ -686,19 +686,19 @@ init_context(isel_context* ctx, nir_shader* shader)
case nir_intrinsic_global_atomic_comp_swap:
case nir_intrinsic_global_atomic_fmin:
case nir_intrinsic_global_atomic_fmax:
case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_image_deref_atomic_umin:
case nir_intrinsic_image_deref_atomic_imin:
case nir_intrinsic_image_deref_atomic_umax:
case nir_intrinsic_image_deref_atomic_imax:
case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_image_deref_atomic_fmin:
case nir_intrinsic_image_deref_atomic_fmax:
case nir_intrinsic_image_deref_size:
case nir_intrinsic_bindless_image_atomic_add:
case nir_intrinsic_bindless_image_atomic_umin:
case nir_intrinsic_bindless_image_atomic_imin:
case nir_intrinsic_bindless_image_atomic_umax:
case nir_intrinsic_bindless_image_atomic_imax:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_xor:
case nir_intrinsic_bindless_image_atomic_exchange:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_fmin:
case nir_intrinsic_bindless_image_atomic_fmax:
case nir_intrinsic_bindless_image_size:
case nir_intrinsic_shared_atomic_add:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_shared_atomic_umin:

View File

@ -32,6 +32,7 @@ typedef struct {
enum chip_class chip_class;
uint32_t address32_hi;
bool disable_aniso_single_level;
bool has_image_load_dcc_bug;
const struct radv_shader_args *args;
const struct radv_shader_info *info;
@ -222,7 +223,8 @@ visit_get_ssbo_size(nir_builder *b, apply_layout_state *state, nir_intrinsic_ins
static nir_ssa_def *
get_sampler_desc(nir_builder *b, apply_layout_state *state, nir_deref_instr *deref,
enum ac_descriptor_type desc_type, bool non_uniform, nir_tex_instr *tex)
enum ac_descriptor_type desc_type, bool non_uniform, nir_tex_instr *tex,
bool write)
{
nir_variable *var = nir_deref_instr_get_variable(deref);
assert(var);
@ -311,7 +313,8 @@ get_sampler_desc(nir_builder *b, apply_layout_state *state, nir_deref_instr *der
* use the tail from plane 1 so that we can store only the first 16 bytes
* of the last plane. */
if (desc_type == AC_DESC_PLANE_2) {
nir_ssa_def *desc2 = get_sampler_desc(b, state, deref, AC_DESC_PLANE_1, non_uniform, tex);
nir_ssa_def *desc2 =
get_sampler_desc(b, state, deref, AC_DESC_PLANE_1, non_uniform, tex, write);
nir_ssa_def *comp[8];
for (unsigned i = 0; i < 4; i++)
@ -319,6 +322,17 @@ get_sampler_desc(nir_builder *b, apply_layout_state *state, nir_deref_instr *der
for (unsigned i = 4; i < 8; i++)
comp[i] = nir_channel(b, desc2, i);
return nir_vec(b, comp, 8);
} else if (desc_type == AC_DESC_IMAGE && state->has_image_load_dcc_bug && !tex && !write) {
nir_ssa_def *comp[8];
for (unsigned i = 0; i < 8; i++)
comp[i] = nir_channel(b, desc, i);
/* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a
* hardware bug.
*/
comp[6] = nir_iand_imm(b, comp[6], C_00A018_WRITE_COMPRESS_ENABLE);
return nir_vec(b, comp, 8);
} else if (desc_type == AC_DESC_SAMPLER && tex->op == nir_texop_tg4) {
nir_ssa_def *comp[4];
@ -336,6 +350,20 @@ get_sampler_desc(nir_builder *b, apply_layout_state *state, nir_deref_instr *der
return desc;
}
static void
update_image_intrinsic(nir_builder *b, apply_layout_state *state, nir_intrinsic_instr *intrin)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
const enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type);
bool is_load = intrin->intrinsic == nir_intrinsic_image_deref_load ||
intrin->intrinsic == nir_intrinsic_image_deref_sparse_load;
nir_ssa_def *desc = get_sampler_desc(
b, state, deref, dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE,
nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM, NULL, !is_load);
nir_rewrite_image_intrinsic(intrin, desc, true);
}
static void
apply_layout_to_intrin(nir_builder *b, apply_layout_state *state, nir_intrinsic_instr *intrin)
{
@ -376,6 +404,28 @@ apply_layout_to_intrin(nir_builder *b, apply_layout_state *state, nir_intrinsic_
case nir_intrinsic_get_ssbo_size:
visit_get_ssbo_size(b, state, intrin);
break;
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_sparse_load:
case nir_intrinsic_image_deref_store:
case nir_intrinsic_image_deref_atomic_add:
case nir_intrinsic_image_deref_atomic_imin:
case nir_intrinsic_image_deref_atomic_umin:
case nir_intrinsic_image_deref_atomic_fmin:
case nir_intrinsic_image_deref_atomic_imax:
case nir_intrinsic_image_deref_atomic_umax:
case nir_intrinsic_image_deref_atomic_fmax:
case nir_intrinsic_image_deref_atomic_and:
case nir_intrinsic_image_deref_atomic_or:
case nir_intrinsic_image_deref_atomic_xor:
case nir_intrinsic_image_deref_atomic_exchange:
case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_image_deref_atomic_fadd:
case nir_intrinsic_image_deref_atomic_inc_wrap:
case nir_intrinsic_image_deref_atomic_dec_wrap:
case nir_intrinsic_image_deref_size:
case nir_intrinsic_image_deref_samples:
update_image_intrinsic(b, state, intrin);
break;
default:
break;
}
@ -412,22 +462,22 @@ apply_layout_to_tex(nir_builder *b, apply_layout_state *state, nir_tex_instr *te
assert(tex->op != nir_texop_txf_ms && tex->op != nir_texop_samples_identical);
assert(tex->sampler_dim != GLSL_SAMPLER_DIM_BUF);
image = get_sampler_desc(b, state, texture_deref_instr, AC_DESC_PLANE_0 + plane,
tex->texture_non_uniform, tex);
tex->texture_non_uniform, tex, false);
} else if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
image = get_sampler_desc(b, state, texture_deref_instr, AC_DESC_BUFFER,
tex->texture_non_uniform, tex);
tex->texture_non_uniform, tex, false);
} else if (tex->op == nir_texop_fragment_mask_fetch_amd ||
tex->op == nir_texop_samples_identical) {
image = get_sampler_desc(b, state, texture_deref_instr, AC_DESC_FMASK,
tex->texture_non_uniform, tex);
tex->texture_non_uniform, tex, false);
} else {
image = get_sampler_desc(b, state, texture_deref_instr, AC_DESC_IMAGE,
tex->texture_non_uniform, tex);
tex->texture_non_uniform, tex, false);
}
if (sampler_deref_instr) {
sampler = get_sampler_desc(b, state, sampler_deref_instr, AC_DESC_SAMPLER,
tex->sampler_non_uniform, tex);
tex->sampler_non_uniform, tex, false);
if (state->disable_aniso_single_level && tex->sampler_dim < GLSL_SAMPLER_DIM_RECT &&
state->chip_class < GFX8) {
@ -479,6 +529,7 @@ radv_nir_apply_pipeline_layout(nir_shader *shader, struct radv_device *device,
.chip_class = device->physical_device->rad_info.chip_class,
.address32_hi = device->physical_device->rad_info.address32_hi,
.disable_aniso_single_level = device->instance->disable_aniso_single_level,
.has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug,
.args = args,
.info = info,
.pipeline_layout = layout,

View File

@ -1889,7 +1889,6 @@ shader_compile(struct radv_device *device, struct vk_shader_module *module,
options->enable_mrt_output_nan_fixup =
module && !is_meta_shader(module->nir) && options->key.ps.enable_mrt_output_nan_fixup;
options->adjust_frag_coord_z = options->key.adjust_frag_coord_z;
options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug;
options->debug.func = radv_compiler_debug;
options->debug.private_data = &debug_data;

View File

@ -122,7 +122,6 @@ struct radv_nir_compiler_options {
bool record_stats;
bool check_ir;
bool has_ls_vgpr_init_bug;
bool has_image_load_dcc_bug;
bool enable_mrt_output_nan_fixup;
bool wgp_mode;
enum radeon_family family;