radeonsi: implement DCC MSAA 4x/8x fast clear using DCC equations on gfx9

MSAA 4x and 8x should only clear the first 2 samples because other samples
are uncompressed. The compute shader only clears that subset of DCC.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10003>
This commit is contained in:
Marek Olšák 2021-04-04 16:58:29 -04:00 committed by Marge Bot
parent 8b95f51ef1
commit 3120113ee7
6 changed files with 158 additions and 4 deletions

View File

@ -255,6 +255,13 @@ static bool one_dcc_address_test(const char *name, const char *test, ADDR_HANDLE
addr = gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
dout.metaBlkDepth, dout.pitch, dout.height,
in.x, in.y, in.slice, in.sample, in.pipeXor);
if (in.sample == 1) {
/* Sample 0 should be one byte before sample 1. The DCC MSAA clear relies on it. */
assert(addr - 1 ==
gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
dout.metaBlkDepth, dout.pitch, dout.height,
in.x, in.y, in.slice, 0, in.pipeXor));
}
} else {
addr = gfx10_dcc_addr_from_coord(info, dout.equation.gfx10_bits,
in.bpp, dout.metaBlkWidth, dout.metaBlkHeight,

View File

@ -43,6 +43,7 @@ void si_init_buffer_clear(struct si_clear_info *info,
info->size = size;
info->clear_value = clear_value;
info->writemask = 0xffffffff;
info->is_dcc_msaa = false;
}
static void si_init_buffer_clear_rmw(struct si_clear_info *info,
@ -75,6 +76,12 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
/* Execute clears. */
for (unsigned i = 0; i < num_clears; i++) {
if (info[i].is_dcc_msaa) {
gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value,
SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
continue;
}
assert(info[i].size > 0);
if (info[i].writemask != 0xffffffff) {
@ -328,10 +335,13 @@ bool vi_dcc_get_clear_info(struct si_context *sctx, struct si_texture *tex, unsi
if (tex->buffer.b.b.last_level > 0)
return false;
/* 4x and 8x MSAA needs a sophisticated compute shader for
* the clear. See AMDVLK. */
if (tex->buffer.b.b.nr_storage_samples >= 4)
return false;
/* 4x and 8x MSAA need to clear only sample 0 and 1 in a compute shader and leave other
* samples untouched. (only the first 2 samples are compressed) */
if (tex->buffer.b.b.nr_storage_samples >= 4) {
si_init_buffer_clear(out, dcc_buffer, 0, 0, clear_value);
out->is_dcc_msaa = true;
return true;
}
clear_size = tex->surface.meta_size;
} else {

View File

@ -725,6 +725,78 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
pipe_resource_reference(&saved_sb.buffer, NULL);
}
void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
unsigned flags, enum si_coherency coher)
{
struct pipe_context *ctx = &sctx->b;
struct si_texture *tex = (struct si_texture*)res;
if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
/* Save states. */
void *saved_cs = sctx->cs_shader_state.program;
struct pipe_shader_buffer saved_sb = {};
si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
unsigned saved_writable_mask = 0;
if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
(1u << si_get_shaderbuf_slot(0)))
saved_writable_mask |= 1 << 0;
/* Set the DCC buffer. */
assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX);
assert(tex->buffer.bo_size <= UINT_MAX);
struct pipe_shader_buffer sb = {};
sb.buffer = &tex->buffer.b.b;
sb.buffer_offset = tex->surface.meta_offset;
sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset;
ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
sctx->cs_user_data[0] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) |
(tex->surface.u.gfx9.color.dcc_height << 16);
sctx->cs_user_data[1] = (clear_value & 0xffff) |
((uint32_t)tex->surface.tile_swizzle << 16);
/* These variables identify the shader variant. */
unsigned swizzle_mode = tex->surface.u.gfx9.swizzle_mode;
unsigned bpe_log2 = util_logbase2(tex->surface.bpe);
bool samples8 = tex->buffer.b.b.nr_storage_samples == 8;
bool is_array = tex->buffer.b.b.array_size > 1;
void **shader = &sctx->cs_clear_dcc_msaa[swizzle_mode][bpe_log2][samples8][is_array];
if (!*shader)
*shader = gfx9_create_clear_dcc_msaa_cs(sctx, tex);
ctx->bind_compute_state(ctx, *shader);
/* Dispatch compute. */
unsigned width = DIV_ROUND_UP(tex->buffer.b.b.width0, tex->surface.u.gfx9.color.dcc_block_width);
unsigned height = DIV_ROUND_UP(tex->buffer.b.b.height0, tex->surface.u.gfx9.color.dcc_block_height);
unsigned depth = DIV_ROUND_UP(tex->buffer.b.b.array_size, tex->surface.u.gfx9.color.dcc_block_depth);
struct pipe_grid_info info = {};
info.block[0] = 8;
info.block[1] = 8;
info.block[2] = 1;
info.last_block[0] = width % info.block[0];
info.last_block[1] = height % info.block[1];
info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
info.grid[2] = depth;
si_launch_grid_internal(sctx, &info, saved_cs, flags);
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, tex->surface.meta_size);
if (flags & SI_OP_SYNC_AFTER)
sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;
/* Restore states. */
ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
pipe_resource_reference(&saved_sb.buffer, NULL);
}
/* Expand FMASK to make it identity, so that image stores can ignore it. */
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
{

View File

@ -268,6 +268,17 @@ static void si_destroy_context(struct pipe_context *context)
}
}
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) {
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) {
for (unsigned l = 0; l < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j][k]); l++) {
if (sctx->cs_clear_dcc_msaa[i][j][k][l])
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_dcc_msaa[i][j][k][l]);
}
}
}
}
if (sctx->blitter)
util_blitter_destroy(sctx->blitter);

View File

@ -1316,6 +1316,10 @@ struct si_context {
bool thread_trace_enabled;
unsigned context_flags;
/* Shaders. */
/* TODO: move other shaders here too */
void *cs_clear_dcc_msaa[32][5][2][2]; /* [swizzle_mode][log2(bpe)][samples == 8][is_array] */
};
/* si_blit.c */
@ -1368,6 +1372,7 @@ struct si_clear_info {
uint32_t size;
uint32_t clear_value;
uint32_t writemask;
bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */
};
enum pipe_format si_simplify_cb_format(enum pipe_format format);
@ -1423,6 +1428,8 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
unsigned dsty, unsigned width, unsigned height,
bool render_condition_enabled);
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
unsigned flags, enum si_coherency coher);
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
void si_init_compute_blit_functions(struct si_context *sctx);
@ -1539,6 +1546,7 @@ void si_resume_queries(struct si_context *sctx);
/* si_shaderlib_nir.c */
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
/* si_shaderlib_tgsi.c */
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,

View File

@ -100,3 +100,49 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
return create_nir_cs(sctx, &b);
}
/**
 * Build the NIR compute shader that clears DCC for an MSAA texture.
 *
 * The shader uses an 8x8x1 workgroup, two user-data dwords and one SSBO.
 * Each invocation computes the DCC address of sample 0 for its coordinate
 * and stores one 16-bit clear value there.
 *
 * The texture's bpe, DCC equation, DCC block dimensions and "array_size > 1"
 * are baked into the generated code, so the result is specific to those
 * properties of \p tex.
 *
 * \param sctx  context providing the compiler options and screen info
 * \param tex   texture whose surface parameters are baked into the shader
 * \return      whatever create_nir_cs() returns for the built shader
 */
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
{
const nir_shader_compiler_options *options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_dcc_msaa");
/* 8x8x1 workgroup, 2 user-data dwords, 1 SSBO. */
b.shader->info.cs.local_size[0] = 8;
b.shader->info.cs.local_size[1] = 8;
b.shader->info.cs.local_size[2] = 1;
b.shader->info.cs.user_data_components_amd = 2;
b.shader->info.num_ssbos = 1;
/* Get user data SGPRs.
 * Dword 0 is unpacked into (dcc_pitch, dcc_height), dword 1 into
 * (clear_value, pipe_xor); each dword holds two 16-bit halves.
 */
nir_ssa_def *user_sgprs = nir_load_user_data_amd(&b);
nir_ssa_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
/* Narrow the clear value to 16 bits so the store below writes 2 bytes. */
clear_value = nir_u2u16(&b, clear_value);
/* Get the 3-component coordinates (x, y and z are all used below).
 * NOTE(review): get_global_ids() is defined elsewhere; presumably it returns
 * the global invocation ID - confirm.
 */
nir_ssa_def *coord = get_global_ids(&b, 3);
nir_ssa_def *zero = nir_imm_int(&b, 0);
/* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
coord = nir_imul(&b, coord,
nir_channels(&b, nir_imm_ivec4(&b, tex->surface.u.gfx9.color.dcc_block_width,
tex->surface.u.gfx9.color.dcc_block_height,
tex->surface.u.gfx9.color.dcc_block_depth, 0), 0x7));
/* Address of the DCC element for sample 0 at this coordinate. z is forced to
 * zero when the texture is not an array.
 */
nir_ssa_def *offset =
ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
&tex->surface.u.gfx9.color.dcc_equation,
dcc_pitch, dcc_height, zero, /* DCC slice size */
nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
zero, pipe_xor); /* sample, pipe_xor */
/* The trick here is that DCC elements for an even and the next odd sample are next to each other
 * in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
 * sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
 */
nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2);
return create_nir_cs(sctx, &b);
}