nir: add load_shared2_amd and store_shared2_amd
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13778>
This commit is contained in:
parent
bbdf22ce13
commit
8ff122f8b8
|
@ -331,6 +331,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
|
|||
case nir_intrinsic_byte_permute_amd:
|
||||
case nir_intrinsic_load_deref:
|
||||
case nir_intrinsic_load_shared:
|
||||
case nir_intrinsic_load_shared2_amd:
|
||||
case nir_intrinsic_load_global:
|
||||
case nir_intrinsic_load_global_constant:
|
||||
case nir_intrinsic_load_global_amd:
|
||||
|
|
|
@ -226,6 +226,14 @@ index("bool", "is_swizzled")
|
|||
# The SLC ("system level coherent") bit of load_buffer_amd/store_buffer_amd
|
||||
index("bool", "slc_amd")
|
||||
|
||||
# Offsets for load_shared2_amd/store_shared2_amd
|
||||
index("uint8_t", "offset0")
|
||||
index("uint8_t", "offset1")
|
||||
|
||||
# If true, both offsets have an additional stride of 64 dwords (i.e. they are multiplied by 256 bytes
|
||||
# in hardware, instead of 4).
|
||||
index("bool", "st64")
|
||||
|
||||
# Separate source/dest access flags for copies
|
||||
index("enum gl_access_qualifier", "dst_access")
|
||||
index("enum gl_access_qualifier", "src_access")
|
||||
|
@ -1346,6 +1354,12 @@ intrinsic("load_smem_amd", src_comp=[1, 1], dest_comp=0, bit_sizes=[32],
|
|||
indices=[ALIGN_MUL, ALIGN_OFFSET],
|
||||
flags=[CAN_ELIMINATE, CAN_REORDER])
|
||||
|
||||
# src[] = { offset }.
|
||||
intrinsic("load_shared2_amd", [1], dest_comp=2, indices=[OFFSET0, OFFSET1, ST64], flags=[CAN_ELIMINATE])
|
||||
|
||||
# src[] = { value, offset }.
|
||||
intrinsic("store_shared2_amd", [2, 1], indices=[OFFSET0, OFFSET1, ST64])
|
||||
|
||||
# V3D-specific intrinsic for tile buffer color reads.
|
||||
#
|
||||
# The hardware requires that we read the samples and components of a pixel
|
||||
|
|
|
@ -77,6 +77,7 @@ shader_writes_to_memory(nir_shader *shader)
|
|||
case nir_intrinsic_ssbo_atomic_exchange:
|
||||
case nir_intrinsic_ssbo_atomic_comp_swap:
|
||||
case nir_intrinsic_store_shared:
|
||||
case nir_intrinsic_store_shared2_amd:
|
||||
case nir_intrinsic_shared_atomic_add:
|
||||
case nir_intrinsic_shared_atomic_imin:
|
||||
case nir_intrinsic_shared_atomic_umin:
|
||||
|
|
|
@ -262,6 +262,7 @@ node_is_dead(nir_cf_node *node)
|
|||
return false;
|
||||
|
||||
case nir_intrinsic_load_shared:
|
||||
case nir_intrinsic_load_shared2_amd:
|
||||
case nir_intrinsic_load_output:
|
||||
case nir_intrinsic_load_per_vertex_output:
|
||||
/* Same as above loads. */
|
||||
|
|
|
@ -143,6 +143,39 @@ try_fold_load_store(nir_builder *b,
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
try_fold_shared2(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin,
|
||||
opt_offsets_state *state,
|
||||
unsigned offset_src_idx)
|
||||
{
|
||||
unsigned comp_size = (intrin->intrinsic == nir_intrinsic_load_shared2_amd ?
|
||||
intrin->dest.ssa.bit_size : intrin->src[0].ssa->bit_size) / 8;
|
||||
unsigned stride = (nir_intrinsic_st64(intrin) ? 64 : 1) * comp_size;
|
||||
unsigned offset0 = nir_intrinsic_offset0(intrin) * stride;
|
||||
unsigned offset1 = nir_intrinsic_offset1(intrin) * stride;
|
||||
nir_src *off_src = &intrin->src[offset_src_idx];
|
||||
|
||||
if (!nir_src_is_const(*off_src))
|
||||
return false;
|
||||
|
||||
unsigned const_offset = nir_src_as_uint(*off_src);
|
||||
offset0 += const_offset;
|
||||
offset1 += const_offset;
|
||||
bool st64 = offset0 % (64 * comp_size) == 0 && offset1 % (64 * comp_size) == 0;
|
||||
stride = (st64 ? 64 : 1) * comp_size;
|
||||
if (const_offset % stride || offset0 > 255 * stride || offset1 > 255 * stride)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
nir_instr_rewrite_src(&intrin->instr, off_src, nir_src_for_ssa(nir_imm_zero(b, 1, 32)));
|
||||
nir_intrinsic_set_offset0(intrin, offset0 / stride);
|
||||
nir_intrinsic_set_offset1(intrin, offset1 / stride);
|
||||
nir_intrinsic_set_st64(intrin, st64);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
process_instr(nir_builder *b, nir_instr *instr, void *s)
|
||||
{
|
||||
|
@ -163,6 +196,10 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
|
|||
case nir_intrinsic_store_shared:
|
||||
case nir_intrinsic_store_shared_ir3:
|
||||
return try_fold_load_store(b, intrin, state, 1, state->options->shared_max);
|
||||
case nir_intrinsic_load_shared2_amd:
|
||||
return try_fold_shared2(b, intrin, state, 0);
|
||||
case nir_intrinsic_store_shared2_amd:
|
||||
return try_fold_shared2(b, intrin, state, 1);
|
||||
case nir_intrinsic_load_buffer_amd:
|
||||
return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max);
|
||||
case nir_intrinsic_store_buffer_amd:
|
||||
|
|
|
@ -391,6 +391,7 @@ nir_schedule_intrinsic_deps(nir_deps_state *state,
|
|||
break;
|
||||
|
||||
case nir_intrinsic_load_shared:
|
||||
case nir_intrinsic_load_shared2_amd:
|
||||
/* Don't move load_shared beyond a following store_shared, as it could
|
||||
* change their value
|
||||
*/
|
||||
|
@ -398,6 +399,7 @@ nir_schedule_intrinsic_deps(nir_deps_state *state,
|
|||
break;
|
||||
|
||||
case nir_intrinsic_store_shared:
|
||||
case nir_intrinsic_store_shared2_amd:
|
||||
add_write_dep(state, &state->store_shared, n);
|
||||
break;
|
||||
|
||||
|
|
Loading…
Reference in New Issue