radeonsi: implement DCC MSAA 4x/8x fast clear using DCC equations on gfx9
MSAA 4x and 8x should only clear the first 2 samples because the other samples are uncompressed. The compute shader clears only that subset of DCC.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10003>
This commit is contained in:
parent
8b95f51ef1
commit
3120113ee7
|
@ -255,6 +255,13 @@ static bool one_dcc_address_test(const char *name, const char *test, ADDR_HANDLE
|
|||
addr = gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
|
||||
dout.metaBlkDepth, dout.pitch, dout.height,
|
||||
in.x, in.y, in.slice, in.sample, in.pipeXor);
|
||||
if (in.sample == 1) {
|
||||
/* Sample 0 should be one byte before sample 1. The DCC MSAA clear relies on it. */
|
||||
assert(addr - 1 ==
|
||||
gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
|
||||
dout.metaBlkDepth, dout.pitch, dout.height,
|
||||
in.x, in.y, in.slice, 0, in.pipeXor));
|
||||
}
|
||||
} else {
|
||||
addr = gfx10_dcc_addr_from_coord(info, dout.equation.gfx10_bits,
|
||||
in.bpp, dout.metaBlkWidth, dout.metaBlkHeight,
|
||||
|
|
|
@ -43,6 +43,7 @@ void si_init_buffer_clear(struct si_clear_info *info,
|
|||
info->size = size;
|
||||
info->clear_value = clear_value;
|
||||
info->writemask = 0xffffffff;
|
||||
info->is_dcc_msaa = false;
|
||||
}
|
||||
|
||||
static void si_init_buffer_clear_rmw(struct si_clear_info *info,
|
||||
|
@ -75,6 +76,12 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
|
|||
|
||||
/* Execute clears. */
|
||||
for (unsigned i = 0; i < num_clears; i++) {
|
||||
if (info[i].is_dcc_msaa) {
|
||||
gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value,
|
||||
SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(info[i].size > 0);
|
||||
|
||||
if (info[i].writemask != 0xffffffff) {
|
||||
|
@ -328,10 +335,13 @@ bool vi_dcc_get_clear_info(struct si_context *sctx, struct si_texture *tex, unsi
|
|||
if (tex->buffer.b.b.last_level > 0)
|
||||
return false;
|
||||
|
||||
/* 4x and 8x MSAA needs a sophisticated compute shader for
|
||||
* the clear. See AMDVLK. */
|
||||
if (tex->buffer.b.b.nr_storage_samples >= 4)
|
||||
return false;
|
||||
/* 4x and 8x MSAA need to clear only sample 0 and 1 in a compute shader and leave other
|
||||
* samples untouched. (only the first 2 samples are compressed) */
|
||||
if (tex->buffer.b.b.nr_storage_samples >= 4) {
|
||||
si_init_buffer_clear(out, dcc_buffer, 0, 0, clear_value);
|
||||
out->is_dcc_msaa = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
clear_size = tex->surface.meta_size;
|
||||
} else {
|
||||
|
|
|
@ -725,6 +725,78 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
|
|||
pipe_resource_reference(&saved_sb.buffer, NULL);
|
||||
}
|
||||
|
||||
/* Clear the DCC metadata of an MSAA texture by dispatching the "clear_dcc_msaa" compute shader
 * (created by gfx9_create_clear_dcc_msaa_cs).
 *
 * sctx         context used for the dispatch; its compute shader-buffer slot 0 is saved on entry
 *              and restored before returning.
 * res          the texture; it is cast to struct si_texture.
 * clear_value  DCC clear value; only the low 16 bits are passed to the shader.
 * flags        SI_OP_* flags. Unless SI_OP_SKIP_CACHE_INV_BEFORE is set, flush flags derived from
 *              "coher" are added to sctx->flags before the dispatch. If SI_OP_SYNC_AFTER is set and
 *              the cache policy is L2_BYPASS, SI_CONTEXT_WB_L2 is added afterwards. The flags are
 *              also forwarded to si_launch_grid_internal.
 * coher        coherency mode passed to si_get_flush_flags and get_cache_policy.
 */
void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
|
||||
unsigned flags, enum si_coherency coher)
|
||||
{
|
||||
struct pipe_context *ctx = &sctx->b;
|
||||
struct si_texture *tex = (struct si_texture*)res;
|
||||
|
||||
if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
|
||||
sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
|
||||
|
||||
/* Save states. */
|
||||
/* saved_cs is handed to si_launch_grid_internal below; presumably that helper rebinds it. */
void *saved_cs = sctx->cs_shader_state.program;
|
||||
struct pipe_shader_buffer saved_sb = {};
|
||||
si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
|
||||
|
||||
/* Remember whether compute shader buffer 0 was bound as writable so the restore can match it. */
unsigned saved_writable_mask = 0;
|
||||
if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
|
||||
(1u << si_get_shaderbuf_slot(0)))
|
||||
saved_writable_mask |= 1 << 0;
|
||||
|
||||
/* Set the DCC buffer. */
|
||||
/* A zero meta_offset is rejected here; offset and BO size must fit the 32-bit range checked
 * against UINT_MAX. */
assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX);
|
||||
assert(tex->buffer.bo_size <= UINT_MAX);
|
||||
|
||||
/* Bind the texture's buffer from meta_offset to the end of the BO as compute shader buffer 0.
 * The final 0x1 argument sits in the same position as saved_writable_mask in the restore call. */
struct pipe_shader_buffer sb = {};
|
||||
sb.buffer = &tex->buffer.b.b;
|
||||
sb.buffer_offset = tex->surface.meta_offset;
|
||||
sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset;
|
||||
ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
|
||||
|
||||
/* Two user-data dwords, each packing two 16-bit fields:
 *   [0] = DCC pitch (dcc_pitch_max + 1) | DCC height << 16
 *   [1] = clear value (low 16 bits)     | tile_swizzle << 16 (the shader reads it as pipe_xor)
 */
sctx->cs_user_data[0] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) |
|
||||
(tex->surface.u.gfx9.color.dcc_height << 16);
|
||||
sctx->cs_user_data[1] = (clear_value & 0xffff) |
|
||||
((uint32_t)tex->surface.tile_swizzle << 16);
|
||||
|
||||
/* These variables identify the shader variant. */
|
||||
unsigned swizzle_mode = tex->surface.u.gfx9.swizzle_mode;
|
||||
unsigned bpe_log2 = util_logbase2(tex->surface.bpe);
|
||||
bool samples8 = tex->buffer.b.b.nr_storage_samples == 8;
|
||||
bool is_array = tex->buffer.b.b.array_size > 1;
|
||||
void **shader = &sctx->cs_clear_dcc_msaa[swizzle_mode][bpe_log2][samples8][is_array];
|
||||
|
||||
/* Variants are created on first use and cached in sctx->cs_clear_dcc_msaa. */
if (!*shader)
|
||||
*shader = gfx9_create_clear_dcc_msaa_cs(sctx, tex);
|
||||
ctx->bind_compute_state(ctx, *shader);
|
||||
|
||||
/* Dispatch compute. */
|
||||
/* width/height/depth are counted in DCC blocks, rounded up; the shader scales its IDs back up
 * by the DCC block size. */
unsigned width = DIV_ROUND_UP(tex->buffer.b.b.width0, tex->surface.u.gfx9.color.dcc_block_width);
|
||||
unsigned height = DIV_ROUND_UP(tex->buffer.b.b.height0, tex->surface.u.gfx9.color.dcc_block_height);
|
||||
unsigned depth = DIV_ROUND_UP(tex->buffer.b.b.array_size, tex->surface.u.gfx9.color.dcc_block_depth);
|
||||
|
||||
/* 8x8x1 thread blocks over the width x height x depth grid of DCC blocks. */
struct pipe_grid_info info = {};
|
||||
info.block[0] = 8;
|
||||
info.block[1] = 8;
|
||||
info.block[2] = 1;
|
||||
/* Remainder of the last block in X/Y; this is 0 when the size is a multiple of 8.
 * NOTE(review): presumably 0 means "last block is full" - confirm against pipe_grid_info. */
info.last_block[0] = width % info.block[0];
|
||||
info.last_block[1] = height % info.block[1];
|
||||
info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
|
||||
info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
|
||||
info.grid[2] = depth;
|
||||
|
||||
si_launch_grid_internal(sctx, &info, saved_cs, flags);
|
||||
|
||||
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, tex->surface.meta_size);
|
||||
|
||||
/* With SI_OP_SYNC_AFTER, request an L2 writeback only if the chosen cache policy is L2_BYPASS. */
if (flags & SI_OP_SYNC_AFTER)
|
||||
sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;
|
||||
|
||||
/* Restore states. */
|
||||
/* Rebind the saved shader buffer 0, then clear saved_sb.buffer (presumably dropping the
 * reference taken by si_get_shader_buffers). */
ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
|
||||
pipe_resource_reference(&saved_sb.buffer, NULL);
|
||||
}
|
||||
|
||||
/* Expand FMASK to make it identity, so that image stores can ignore it. */
|
||||
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
|
||||
{
|
||||
|
|
|
@ -268,6 +268,17 @@ static void si_destroy_context(struct pipe_context *context)
|
|||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) {
|
||||
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) {
|
||||
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) {
|
||||
for (unsigned l = 0; l < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j][k]); l++) {
|
||||
if (sctx->cs_clear_dcc_msaa[i][j][k][l])
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_dcc_msaa[i][j][k][l]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sctx->blitter)
|
||||
util_blitter_destroy(sctx->blitter);
|
||||
|
||||
|
|
|
@ -1316,6 +1316,10 @@ struct si_context {
|
|||
bool thread_trace_enabled;
|
||||
|
||||
unsigned context_flags;
|
||||
|
||||
/* Shaders. */
|
||||
/* TODO: move other shaders here too */
|
||||
void *cs_clear_dcc_msaa[32][5][2][2]; /* [swizzle_mode][log2(bpe)][samples == 8][is_array] */
|
||||
};
|
||||
|
||||
/* si_blit.c */
|
||||
|
@ -1368,6 +1372,7 @@ struct si_clear_info {
|
|||
uint32_t size;
|
||||
uint32_t clear_value;
|
||||
uint32_t writemask;
|
||||
bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */
|
||||
};
|
||||
|
||||
enum pipe_format si_simplify_cb_format(enum pipe_format format);
|
||||
|
@ -1423,6 +1428,8 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
|
|||
unsigned dsty, unsigned width, unsigned height,
|
||||
bool render_condition_enabled);
|
||||
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
|
||||
void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
|
||||
unsigned flags, enum si_coherency coher);
|
||||
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
|
||||
void si_init_compute_blit_functions(struct si_context *sctx);
|
||||
|
||||
|
@ -1539,6 +1546,7 @@ void si_resume_queries(struct si_context *sctx);
|
|||
|
||||
/* si_shaderlib_nir.c */
|
||||
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
|
||||
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
|
||||
|
||||
/* si_shaderlib_tgsi.c */
|
||||
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
||||
|
|
|
@ -100,3 +100,49 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
|
|||
|
||||
return create_nir_cs(sctx, &b);
|
||||
}
|
||||
|
||||
/* Build the NIR compute shader "clear_dcc_msaa" and return the result of create_nir_cs.
 *
 * The shader uses an 8x8x1 workgroup, 2 user-data dwords and 1 SSBO. Each invocation computes the
 * DCC offset of sample 0 for its coordinates and stores one 16-bit clear value there.
 *
 * The following properties of "tex" are baked into the shader as constants or build-time choices:
 * surface.bpe, the DCC block width/height/depth, the DCC equation, and whether array_size > 1.
 * The DCC pitch, DCC height, clear value and pipe XOR are read from user data at run time.
 */
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
|
||||
{
|
||||
const nir_shader_compiler_options *options =
|
||||
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
|
||||
|
||||
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_dcc_msaa");
|
||||
b.shader->info.cs.local_size[0] = 8;
|
||||
b.shader->info.cs.local_size[1] = 8;
|
||||
b.shader->info.cs.local_size[2] = 1;
|
||||
b.shader->info.cs.user_data_components_amd = 2;
|
||||
b.shader->info.num_ssbos = 1;
|
||||
|
||||
/* Get user data SGPRs. */
|
||||
/* Each user-data dword is split into two 16-bit fields:
 * dword 0 -> dcc_pitch, dcc_height; dword 1 -> clear_value, pipe_xor. */
nir_ssa_def *user_sgprs = nir_load_user_data_amd(&b);
|
||||
nir_ssa_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
|
||||
unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
|
||||
unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
|
||||
/* Convert to a 16-bit value; the final store writes it as the 2-byte clear value. */
clear_value = nir_u2u16(&b, clear_value);
|
||||
|
||||
/* Get the 3-component coordinates from get_global_ids (x, y and, for arrays, z are used below). */
|
||||
nir_ssa_def *coord = get_global_ids(&b, 3);
|
||||
nir_ssa_def *zero = nir_imm_int(&b, 0);
|
||||
|
||||
/* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
|
||||
coord = nir_imul(&b, coord,
|
||||
nir_channels(&b, nir_imm_ivec4(&b, tex->surface.u.gfx9.color.dcc_block_width,
|
||||
tex->surface.u.gfx9.color.dcc_block_height,
|
||||
tex->surface.u.gfx9.color.dcc_block_depth, 0), 0x7));
|
||||
|
||||
/* Compute the offset of the DCC element for sample 0. z is forced to 0 for non-array textures. */
nir_ssa_def *offset =
|
||||
ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
|
||||
&tex->surface.u.gfx9.color.dcc_equation,
|
||||
dcc_pitch, dcc_height, zero, /* DCC slice size */
|
||||
nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
|
||||
tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
|
||||
zero, pipe_xor); /* sample, pipe_xor */
|
||||
|
||||
/* The trick here is that DCC elements for an even and the next odd sample are next to each other
|
||||
* in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
|
||||
* sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
|
||||
*/
|
||||
nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2);
|
||||
|
||||
return create_nir_cs(sctx, &b);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue