radeonsi: minor simplifications of clear/copy_buffer shaders

- always use L2_LRU (never ACCESS_NON_TEMPORAL) for better performance
- never use ACCESS_COHERENT because the address might not be aligned to
  a cache line
- assume the wave size is always 64
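
In terms of the SSBO access qualifiers the shaders emit, the change collapses
a computed value into a constant (schematic in C, mirroring the hunks below;
the "before" names are the ones this patch removes):

   /* Before: qualifiers were derived from the destination cache policy. */
   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
   if (dst_stream_cache_policy)
      store_qualifier |= ACCESS_NON_TEMPORAL;
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   /* After: all accesses go through L2 as LRU; only the no-alias hint remains. */
   unsigned qualifier = ACCESS_RESTRICT;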

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
Authored by Marek Olšák on 2024-04-26 19:44:11 -04:00; committed by Marge Bot
parent 81c90cded0
commit 92497d1c8f
4 changed files with 21 additions and 45 deletions


@@ -192,7 +192,7 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                    unsigned writeable_bitmask)
 {
    if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      sctx->flags |= si_get_flush_flags(sctx, coher, L2_LRU);
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }
@@ -351,16 +351,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
    sb[0].buffer_offset = dst_offset;
    sb[0].buffer_size = size;
 
-   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
-
    if (src) {
       sb[1].buffer = src;
       sb[1].buffer_offset = src_offset;
       sb[1].buffer_size = size;
 
       if (!sctx->cs_copy_buffer) {
-         sctx->cs_copy_buffer = si_create_dma_compute_shader(
-            sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+         sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD,
+                                                             true);
       }
 
       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@@ -373,8 +371,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
          sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
 
       if (!sctx->cs_clear_buffer) {
-         sctx->cs_clear_buffer = si_create_dma_compute_shader(
-            sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+         sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD,
+                                                              false);
       }
 
       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
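
Since SI_COMPUTE_DST_CACHE_POLICY was #defined as L2_LRU (see the header hunk
below), the first hunk above is pure constant folding. The two lazily created
shader variants now differ only in dwords-per-thread and clear vs. copy; a
sketch of the resulting call shape:

   /* No cache-policy parameter left in the constructor. */
   if (!sctx->cs_copy_buffer)
      sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD, true);
   if (!sctx->cs_clear_buffer)
      sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, false);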


@@ -49,8 +49,6 @@ struct ac_llvm_compiler;
 /* Tunables for compute-based clear_buffer and copy_buffer: */
 #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
 #define SI_COMPUTE_COPY_DW_PER_THREAD  4
-/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
-#define SI_COMPUTE_DST_CACHE_POLICY    L2_LRU
 
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
@@ -1729,7 +1727,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy);
+                                   bool is_copy);
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
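
With the wave size fixed at 64 and the tunables above unchanged, the sizing
math of the clear/copy shaders is easy to read off (a worked example, not
code from the patch):

   /* 4 dwords per thread, 4 bytes per dword, 64 threads per workgroup. */
   unsigned bytes_per_thread    = SI_COMPUTE_CLEAR_DW_PER_THREAD * 4; /* 16 bytes */
   unsigned bytes_per_workgroup = 64 * bytes_per_thread;              /* 1024 bytes */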


@@ -231,9 +231,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
    /* data |= clear_value_masked; */
    data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
 
-   nir_store_ssbo(&b, data, zero, address,
-                  .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0,
-                  .align_mul = 4);
+   nir_store_ssbo(&b, data, zero, address, .align_mul = 4);
 
    return create_shader_state(sctx, b.shader);
 }
@@ -645,25 +643,16 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
    nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12);
    nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3);
 
-   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset,
-                  .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0);
+   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset);
 
    return create_shader_state(sctx, b.shader);
 }
 
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
 {
-   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
-
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                   "ubyte_to_ushort");
 
-   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
-
-   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[0] = 64;
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
    b.shader->info.num_ssbos = 2;
@@ -672,32 +661,24 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
    nir_def *store_address = nir_imul_imm(&b, load_address, 2);
    nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
-                                        load_address, .access = load_qualifier);
-   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
-                  store_address, .access = store_qualifier);
+                                        load_address, .access = ACCESS_RESTRICT);
+   nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0),
+                  store_address, .access = ACCESS_RESTRICT);
 
    return create_shader_state(sctx, b.shader);
 }
 
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy)
+                                   bool is_copy)
 {
    assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
 
-   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
-   if (dst_stream_cache_policy)
-      store_qualifier |= ACCESS_NON_TEMPORAL;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
-
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                   "create_dma_compute");
+   unsigned wg_size = 64;
 
-   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
-
-   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[0] = wg_size;
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
    b.shader->info.num_ssbos = 1;
@@ -717,7 +698,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
     */
    nir_def *store_address =
       nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0),
-                                default_wave_size * num_mem_ops),
+                                wg_size * num_mem_ops),
               nir_channel(&b, nir_load_local_invocation_id(&b), 0));
 
    /* Convert from a "store size unit" into bytes. */
@@ -741,19 +722,19 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
       if (is_copy && i < num_mem_ops) {
          if (i) {
             load_address = nir_iadd(&b, load_address,
-                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
+                                    nir_imm_int(&b, 4 * inst_dwords[i] * wg_size));
          }
          values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address,
-                                   .access = load_qualifier);
+                                   .access = ACCESS_RESTRICT);
       }
 
      if (d >= 0) {
         if (d) {
            store_address = nir_iadd(&b, store_address,
-                                    nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
+                                    nir_imm_int(&b, 4 * inst_dwords[d] * wg_size));
        }
        nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
-                      .access = store_qualifier);
+                      .access = ACCESS_RESTRICT);
    }
 }
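
The address math in the generated shader is unchanged apart from wg_size now
being the constant 64; a hypothetical scalar helper (not part of the patch,
and assuming a uniform inst_dwords across memory ops) that mirrors it:

   /* Byte offset of the first store issued by one lane. Each further memory
    * op then advances by 4 * inst_dwords * wg_size bytes, so the 64 lanes of
    * a wave always write one contiguous, coalesced block. */
   static unsigned first_store_offset_bytes(unsigned wg_id, unsigned lane,
                                            unsigned num_mem_ops, unsigned inst_dwords)
   {
      const unsigned wg_size = 64; /* fixed wave size after this change */
      unsigned store_unit = wg_id * wg_size * num_mem_ops + lane;
      return store_unit * 4 * inst_dwords; /* "store size unit" -> bytes */
   }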


@@ -120,8 +120,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
             void *compute_shader = NULL;
             if (test_cs) {
-               compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
-                                                             cache_policy == L2_STREAM, is_copy);
+               compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread, is_copy);
             }
 
             double score = 0;
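
For the DMA performance test this means the cache_policy axis no longer
selects a distinct compute shader; only (dwords per thread, clear vs. copy)
does. A hypothetical reduced creation loop (loop bounds are illustrative;
the real test iterates its own parameter table):

   for (unsigned is_copy = 0; is_copy <= 1; is_copy++) {
      /* Power-of-two dwords per thread, as required by the assert in
       * si_create_dma_compute_shader(). */
      for (unsigned dw = 1; dw <= 16; dw *= 2) {
         void *cs = si_create_dma_compute_shader(sctx, dw, is_copy);
         /* ... run timed dispatches with cs, then destroy it ... */
      }
   }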