radeonsi: move smoothing to the main shader part to remove 1 live VGPR

The samplemask VGPR that we had to pass to the epilog increased VGPR usage
by 1 for all shaders. Apply smoothing in the main function instead, keyed
on the mono key structure. This causes on-demand compilation and a stall,
but it saves the VGPR.

57794 shaders in 35145 tests
Totals:
SGPRS: 2715856 -> 2716272 (0.02 %)
VGPRS: 1776168 -> 1718432 (-3.25 %)
Spilled SGPRs: 3704 -> 3630 (-2.00 %)
Spilled VGPRs: 1727 -> 1733 (0.35 %)
Private memory VGPRs: 256 -> 256 (0.00 %)
Scratch size: 2008 -> 2016 (0.40 %) dwords per thread
Code Size: 61429584 -> 61393288 (-0.06 %) bytes
Max Waves: 838645 -> 840484 (0.22 %)

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14266>
This commit is contained in:
Marek Olšák 2021-12-13 14:36:37 -05:00 committed by Marge Bot
parent 12b942bd16
commit 198ad7e4dc
6 changed files with 26 additions and 48 deletions

View File

@ -701,8 +701,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
shader->selector->info.writes_z + shader->selector->info.writes_stencil +
shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
for (i = 0; i < num_return_sgprs; i++)
ac_add_return(&ctx->args, AC_ARG_SGPR);
for (; i < num_returns; i++)
@ -1249,9 +1247,8 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
fprintf(f, " epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
fprintf(f, " epilog.poly_line_smoothing = %u\n",
key->ps.part.epilog.poly_line_smoothing);
fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
fprintf(f, " mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
fprintf(f, " mono.interpolate_at_sample_force_center = %u\n",
key->ps.mono.interpolate_at_sample_force_center);
fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
@ -1986,12 +1983,6 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_
assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
}
/* The sample mask input is always enabled, because the API shader always
* passes it through to the epilog. Disable it here if it's unused.
*/
if (!shader->key.ps.part.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
return true;
}

View File

@ -578,7 +578,6 @@ struct si_ps_epilog_bits {
unsigned last_cbuf : 3;
unsigned alpha_func : 3;
unsigned alpha_to_one : 1;
unsigned poly_line_smoothing : 1;
unsigned clamp_color : 1;
};
@ -708,6 +707,7 @@ struct si_shader_key_ps {
/* Flags for monolithic compilation only. */
struct {
unsigned poly_line_smoothing : 1;
unsigned interpolate_at_sample_force_center : 1;
unsigned fbfetch_msaa : 1;
unsigned fbfetch_is_1D : 1;

View File

@ -30,11 +30,6 @@
struct pipe_debug_callback;
/* Ideally pass the sample mask input to the PS epilog as v14, which
* is its usual location, so that the shader doesn't have to add v_mov.
*/
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
struct si_shader_output_values {
LLVMValueRef values[4];
ubyte vertex_stream[4];

View File

@ -221,23 +221,17 @@ static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
}
}
static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, LLVMValueRef alpha,
unsigned samplemask_param)
static LLVMValueRef si_get_coverage_from_sample_mask(struct si_shader_context *ctx)
{
LLVMValueRef coverage;
/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
coverage = LLVMGetParam(ctx->main_fn, samplemask_param);
coverage = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
coverage = ac_build_bit_count(&ctx->ac, ac_to_integer(&ctx->ac, coverage));
coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, ctx->ac.f32, "");
coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
if (LLVMTypeOf(alpha) == ctx->ac.f16)
coverage = LLVMBuildFPTrunc(ctx->ac.builder, coverage, ctx->ac.f16, "");
return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
return LLVMBuildFMul(ctx->ac.builder, coverage,
LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
}
struct si_ps_exports {
@ -407,8 +401,8 @@ static bool si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue
}
static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
unsigned first_color_export, unsigned samplemask_param,
unsigned color_type, struct si_ps_exports *exp)
unsigned first_color_export, unsigned color_type,
struct si_ps_exports *exp)
{
int i;
@ -425,10 +419,6 @@ static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col
if (index == 0 && ctx->shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS)
si_alpha_test(ctx, color[3]);
/* Line & polygon smoothing */
if (ctx->shader->key.ps.part.epilog.poly_line_smoothing)
color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], samplemask_param);
/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
if (ctx->shader->key.ps.part.epilog.last_cbuf > 0) {
assert(exp->num == first_color_export);
@ -470,7 +460,7 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
struct si_shader *shader = ctx->shader;
struct si_shader_info *info = &shader->selector->info;
LLVMBuilderRef builder = ctx->ac.builder;
unsigned i, j, first_vgpr, vgpr;
unsigned i, j, vgpr;
LLVMValueRef *addrs = abi->outputs;
LLVMValueRef color[8][4] = {};
@ -507,6 +497,10 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
}
}
LLVMValueRef smoothing_coverage = NULL;
if (ctx->shader->key.ps.mono.poly_line_smoothing)
smoothing_coverage = si_get_coverage_from_sample_mask(ctx);
/* Fill the return structure. */
ret = ctx->return_value;
@ -516,12 +510,17 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
SI_SGPR_ALPHA_REF, "");
/* Set VGPRs */
first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
vgpr = SI_SGPR_ALPHA_REF + 1;
for (i = 0; i < ARRAY_SIZE(color); i++) {
if (!color[i][0])
continue;
if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
if (smoothing_coverage) {
color[i][3] = LLVMBuildFMul(builder, color[i][3],
LLVMBuildFPTrunc(builder, smoothing_coverage, ctx->ac.f16, ""), "");
}
for (j = 0; j < 2; j++) {
LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
@ -529,6 +528,9 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
}
vgpr += 2;
} else {
if (smoothing_coverage)
color[i][3] = LLVMBuildFMul(builder, color[i][3], smoothing_coverage, "");
for (j = 0; j < 4; j++)
ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
}
@ -540,12 +542,6 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
if (samplemask)
ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
/* Add the input sample mask for smoothing at the end. */
if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
ret = LLVMBuildInsertValue(builder, ret, LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE),
vgpr++, "");
ctx->return_value = ret;
}
@ -860,9 +856,6 @@ void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part
ctx->args.num_sgprs_used + util_bitcount(key->ps_epilog.colors_written) * 4 +
key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask;
required_num_params =
MAX2(required_num_params, ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
while (ctx->args.arg_count < required_num_params)
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
@ -911,8 +904,7 @@ void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part
color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
}
si_export_mrt_color(ctx, color, output_index, first_color_export,
ctx->args.arg_count - 1, color_type, &exp);
si_export_mrt_color(ctx, color, output_index, first_color_export, color_type, &exp);
}
if (exp.num) {

View File

@ -275,8 +275,8 @@ static bool si_update_shaders(struct si_context *sctx)
si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
if (sctx->smoothing_enabled !=
sctx->shader.ps.current->key.ps.part.epilog.poly_line_smoothing) {
sctx->smoothing_enabled = sctx->shader.ps.current->key.ps.part.epilog.poly_line_smoothing;
sctx->shader.ps.current->key.ps.mono.poly_line_smoothing) {
sctx->smoothing_enabled = sctx->shader.ps.current->key.ps.mono.poly_line_smoothing;
si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
/* NGG cull state uses smoothing_enabled. */

View File

@ -2196,7 +2196,7 @@ static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_co
bool is_line = util_prim_is_lines(sctx->current_rast_prim);
key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
key->ps.part.epilog.poly_line_smoothing =
key->ps.mono.poly_line_smoothing =
((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) &&
sctx->framebuffer.nr_samples <= 1;
}