mirror of https://gitlab.freedesktop.org/mesa/mesa
radeonsi: minor simplifications of clear/copy_buffer shaders
- always use L2_LRU (never use ACCESS_NON_TEMPORAL) - for better perf - never use ACCESS_COHERENT because the address might not be aligned to a cache line - assume the wave size is always 64 Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
This commit is contained in:
parent
81c90cded0
commit
92497d1c8f
|
@ -192,7 +192,7 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
|
|||
unsigned writeable_bitmask)
|
||||
{
|
||||
if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
|
||||
sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
|
||||
sctx->flags |= si_get_flush_flags(sctx, coher, L2_LRU);
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
|
||||
}
|
||||
|
||||
|
@ -351,16 +351,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
|
|||
sb[0].buffer_offset = dst_offset;
|
||||
sb[0].buffer_size = size;
|
||||
|
||||
bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
|
||||
|
||||
if (src) {
|
||||
sb[1].buffer = src;
|
||||
sb[1].buffer_offset = src_offset;
|
||||
sb[1].buffer_size = size;
|
||||
|
||||
if (!sctx->cs_copy_buffer) {
|
||||
sctx->cs_copy_buffer = si_create_dma_compute_shader(
|
||||
sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
|
||||
sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD,
|
||||
true);
|
||||
}
|
||||
|
||||
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
|
||||
|
@ -373,8 +371,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
|
|||
sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
|
||||
|
||||
if (!sctx->cs_clear_buffer) {
|
||||
sctx->cs_clear_buffer = si_create_dma_compute_shader(
|
||||
sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
|
||||
sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD,
|
||||
false);
|
||||
}
|
||||
|
||||
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
|
||||
|
|
|
@ -49,8 +49,6 @@ struct ac_llvm_compiler;
|
|||
/* Tunables for compute-based clear_buffer and copy_buffer: */
|
||||
#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
|
||||
#define SI_COMPUTE_COPY_DW_PER_THREAD 4
|
||||
/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
|
||||
#define SI_COMPUTE_DST_CACHE_POLICY L2_LRU
|
||||
|
||||
/* Pipeline & streamout query controls. */
|
||||
#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
|
||||
|
@ -1729,7 +1727,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
|||
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
||||
unsigned num_layers);
|
||||
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||
bool dst_stream_cache_policy, bool is_copy);
|
||||
bool is_copy);
|
||||
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
|
||||
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
||||
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
||||
|
|
|
@ -231,9 +231,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
|
|||
/* data |= clear_value_masked; */
|
||||
data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
|
||||
|
||||
nir_store_ssbo(&b, data, zero, address,
|
||||
.access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0,
|
||||
.align_mul = 4);
|
||||
nir_store_ssbo(&b, data, zero, address, .align_mul = 4);
|
||||
|
||||
return create_shader_state(sctx, b.shader);
|
||||
}
|
||||
|
@ -645,25 +643,16 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
|
|||
nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12);
|
||||
nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3);
|
||||
|
||||
nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset,
|
||||
.access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0);
|
||||
nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset);
|
||||
|
||||
return create_shader_state(sctx, b.shader);
|
||||
}
|
||||
|
||||
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
|
||||
{
|
||||
unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
|
||||
|
||||
/* Don't cache loads, because there is no reuse. */
|
||||
unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
|
||||
|
||||
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
|
||||
"ubyte_to_ushort");
|
||||
|
||||
unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
|
||||
|
||||
b.shader->info.workgroup_size[0] = default_wave_size;
|
||||
b.shader->info.workgroup_size[0] = 64;
|
||||
b.shader->info.workgroup_size[1] = 1;
|
||||
b.shader->info.workgroup_size[2] = 1;
|
||||
b.shader->info.num_ssbos = 2;
|
||||
|
@ -672,32 +661,24 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
|
|||
nir_def *store_address = nir_imul_imm(&b, load_address, 2);
|
||||
|
||||
nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
|
||||
load_address, .access = load_qualifier);
|
||||
nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
|
||||
store_address, .access = store_qualifier);
|
||||
load_address, .access = ACCESS_RESTRICT);
|
||||
nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0),
|
||||
store_address, .access = ACCESS_RESTRICT);
|
||||
|
||||
return create_shader_state(sctx, b.shader);
|
||||
}
|
||||
|
||||
/* Create a compute shader implementing clear_buffer or copy_buffer. */
|
||||
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||
bool dst_stream_cache_policy, bool is_copy)
|
||||
bool is_copy)
|
||||
{
|
||||
assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
|
||||
|
||||
unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
|
||||
if (dst_stream_cache_policy)
|
||||
store_qualifier |= ACCESS_NON_TEMPORAL;
|
||||
|
||||
/* Don't cache loads, because there is no reuse. */
|
||||
unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
|
||||
|
||||
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
|
||||
"create_dma_compute");
|
||||
unsigned wg_size = 64;
|
||||
|
||||
unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
|
||||
|
||||
b.shader->info.workgroup_size[0] = default_wave_size;
|
||||
b.shader->info.workgroup_size[0] = wg_size;
|
||||
b.shader->info.workgroup_size[1] = 1;
|
||||
b.shader->info.workgroup_size[2] = 1;
|
||||
b.shader->info.num_ssbos = 1;
|
||||
|
@ -717,7 +698,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_
|
|||
*/
|
||||
nir_def *store_address =
|
||||
nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0),
|
||||
default_wave_size * num_mem_ops),
|
||||
wg_size * num_mem_ops),
|
||||
nir_channel(&b, nir_load_local_invocation_id(&b), 0));
|
||||
|
||||
/* Convert from a "store size unit" into bytes. */
|
||||
|
@ -741,19 +722,19 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_
|
|||
if (is_copy && i < num_mem_ops) {
|
||||
if (i) {
|
||||
load_address = nir_iadd(&b, load_address,
|
||||
nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
|
||||
nir_imm_int(&b, 4 * inst_dwords[i] * wg_size));
|
||||
}
|
||||
values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address,
|
||||
.access = load_qualifier);
|
||||
.access = ACCESS_RESTRICT);
|
||||
}
|
||||
|
||||
if (d >= 0) {
|
||||
if (d) {
|
||||
store_address = nir_iadd(&b, store_address,
|
||||
nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
|
||||
nir_imm_int(&b, 4 * inst_dwords[d] * wg_size));
|
||||
}
|
||||
nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
|
||||
.access = store_qualifier);
|
||||
.access = ACCESS_RESTRICT);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -120,8 +120,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
|||
|
||||
void *compute_shader = NULL;
|
||||
if (test_cs) {
|
||||
compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
|
||||
cache_policy == L2_STREAM, is_copy);
|
||||
compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread, is_copy);
|
||||
}
|
||||
|
||||
double score = 0;
|
||||
|
|
Loading…
Reference in New Issue