aco: add support for compiling PS epilogs

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17485>
This commit is contained in:
Samuel Pitoiset 2022-07-14 18:53:46 +02:00 committed by Marge Bot
parent 8d13392969
commit 270cc39648
6 changed files with 247 additions and 12 deletions

View File

@ -11046,10 +11046,16 @@ struct mrt_color_export {
unsigned write_mask;
Operand values[4];
uint8_t col_format;
/* Fields below are only used for PS epilogs. */
bool is_int8;
bool is_int10;
bool enable_mrt_output_nan_fixup;
};
static bool
export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out)
export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out,
bool is_ps_epilog)
{
Builder bld(ctx->program, ctx->block);
Operand values[4];
@ -11060,10 +11066,24 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out)
unsigned target;
unsigned enabled_channels = 0;
aco_opcode compr_op = aco_opcode::num_opcodes;
bool compr = false;
target = V_008DFC_SQ_EXP_MRT + out->slot;
/* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
if (out->enable_mrt_output_nan_fixup &&
(out->col_format == V_028714_SPI_SHADER_32_R || out->col_format == V_028714_SPI_SHADER_32_GR ||
out->col_format == V_028714_SPI_SHADER_32_AR || out->col_format == V_028714_SPI_SHADER_32_ABGR ||
out->col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
u_foreach_bit(i, out->write_mask) {
Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.def(bld.lm), values[i],
bld.copy(bld.def(v1), Operand::c32(3u)));
values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
bld.copy(bld.def(v1), Operand::zero()), isnan);
}
}
switch (out->col_format) {
case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
@ -11081,21 +11101,116 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out)
break;
case V_028714_SPI_SHADER_FP16_ABGR:
case V_028714_SPI_SHADER_UNORM16_ABGR:
case V_028714_SPI_SHADER_SNORM16_ABGR:
case V_028714_SPI_SHADER_UINT16_ABGR:
case V_028714_SPI_SHADER_SINT16_ABGR:
enabled_channels = util_widen_mask(out->write_mask, 2);
if (is_ps_epilog) {
for (int i = 0; i < 2; i++) {
bool enabled = (out->write_mask >> (i * 2)) & 0x3;
if (enabled) {
enabled_channels |= 0x3 << (i * 2);
if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
values[i] =
bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
} else {
values[i] =
bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
}
} else {
values[i] = Operand(v1);
}
}
values[2] = Operand(v1);
values[3] = Operand(v1);
} else {
enabled_channels = util_widen_mask(out->write_mask, 2);
}
compr = true;
break;
case V_028714_SPI_SHADER_UNORM16_ABGR:
if (is_ps_epilog) {
compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
} else {
enabled_channels = util_widen_mask(out->write_mask, 2);
compr = true;
}
break;
case V_028714_SPI_SHADER_SNORM16_ABGR:
if (is_ps_epilog) {
compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
} else {
enabled_channels = util_widen_mask(out->write_mask, 2);
compr = true;
}
break;
case V_028714_SPI_SHADER_UINT16_ABGR:
if (is_ps_epilog) {
compr_op = aco_opcode::v_cvt_pk_u16_u32;
if (out->is_int8 || out->is_int10) {
/* clamp */
uint32_t max_rgb = out->is_int8 ? 255 : out->is_int10 ? 1023 : 0;
u_foreach_bit(i, out->write_mask) {
uint32_t max = i == 3 && out->is_int10 ? 3 : max_rgb;
values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
}
}
} else {
enabled_channels = util_widen_mask(out->write_mask, 2);
compr = true;
}
break;
case V_028714_SPI_SHADER_SINT16_ABGR:
if (is_ps_epilog) {
compr_op = aco_opcode::v_cvt_pk_i16_i32;
if (out->is_int8 || out->is_int10) {
/* clamp */
uint32_t max_rgb = out->is_int8 ? 127 : out->is_int10 ? 511 : 0;
uint32_t min_rgb = out->is_int8 ? -128 : out->is_int10 ? -512 : 0;
u_foreach_bit(i, out->write_mask) {
uint32_t max = i == 3 && out->is_int10 ? 1 : max_rgb;
uint32_t min = i == 3 && out->is_int10 ? -2u : min_rgb;
values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
}
}
} else {
enabled_channels = util_widen_mask(out->write_mask, 2);
compr = true;
}
break;
case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
case V_028714_SPI_SHADER_ZERO:
default: return false;
}
if (!compr) {
if (compr_op != aco_opcode::num_opcodes) {
for (int i = 0; i < 2; i++) {
/* check if at least one of the values to be compressed is enabled */
bool enabled = (out->write_mask >> (i * 2)) & 0x3;
if (enabled) {
enabled_channels |= 0x3 << (i * 2);
values[i] = bld.vop3(
compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
} else {
values[i] = Operand(v1);
}
}
values[2] = Operand(v1);
values[3] = Operand(v1);
compr = true;
} else if (!compr) {
for (int i = 0; i < 4; i++)
values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
}
@ -11210,7 +11325,7 @@ create_fs_exports(isel_context* ctx)
}
}
exported |= export_fs_mrt_color(ctx, &out);
exported |= export_fs_mrt_color(ctx, &out, false);
}
if (!exported)
@ -11683,7 +11798,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
const struct aco_shader_info* info,
const struct radv_shader_args* args)
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, args, false);
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, args, false, false);
if_context ic_merged_wave_info;
bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
@ -11812,7 +11927,7 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_
const struct aco_shader_info* info,
const struct radv_shader_args* args)
{
isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, options, info, args, true);
isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, options, info, args, true, false);
ctx.block->fp_mode = program->next_fp_mode;
@ -12293,4 +12408,58 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
}
void
select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key, ac_shader_config* config,
const struct aco_compiler_options* options,
const struct aco_shader_info* info,
const struct radv_shader_args* args)
{
isel_context ctx = setup_isel_context(program, 0, NULL, config, options, info, args, false, true);
ctx.block->fp_mode = program->next_fp_mode;
add_startpgm(&ctx);
append_logical_start(ctx.block);
Builder bld(ctx.program, ctx.block);
/* Export all color render targets */
bool exported = false;
for (unsigned i = 0; i < 8; i++) {
unsigned col_format = (key->spi_shader_col_format >> (i * 4)) & 0xf;
if (col_format == V_028714_SPI_SHADER_ZERO)
continue;
struct mrt_color_export out;
out.slot = i;
out.write_mask = 0xf;
out.col_format = col_format;
out.is_int8 = (key->color_is_int8 >> i) & 1;
out.is_int10 = (key->color_is_int10 >> i) & 1;
out.enable_mrt_output_nan_fixup = (key->enable_mrt_output_nan_fixup >> i) & 1;
Temp inputs = get_arg(&ctx, ctx.args->ps_epilog_inputs[i]);
for (unsigned c = 0; c < 4; ++c) {
out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1));
}
exported |= export_fs_mrt_color(&ctx, &out, true);
}
if (!exported)
create_fs_null_export(&ctx);
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_export_end;
bld.reset(ctx.block);
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);
}
} // namespace aco

View File

@ -128,7 +128,8 @@ isel_context setup_isel_context(Program* program, unsigned shader_count,
struct nir_shader* const* shaders, ac_shader_config* config,
const struct aco_compiler_options* options,
const struct aco_shader_info* info,
const struct radv_shader_args* args, bool is_gs_copy_shader);
const struct radv_shader_args* args,
bool is_gs_copy_shader, bool is_ps_epilog);
} // namespace aco

View File

@ -825,7 +825,8 @@ isel_context
setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
ac_shader_config* config, const struct aco_compiler_options* options,
const struct aco_shader_info* info,
const struct radv_shader_args* args, bool is_gs_copy_shader)
const struct radv_shader_args* args, bool is_gs_copy_shader,
bool is_ps_epilog)
{
SWStage sw_stage = SWStage::None;
for (unsigned i = 0; i < shader_count; i++) {
@ -843,6 +844,12 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
default: unreachable("Shader stage not implemented");
}
}
if (is_ps_epilog) {
assert(shader_count == 0 && !shaders);
sw_stage = SWStage::FS;
}
bool gfx9_plus = options->gfx_level >= GFX9;
bool ngg = info->is_ngg && options->gfx_level >= GFX10;
HWStage hw_stage{};

View File

@ -314,3 +314,48 @@ aco_compile_vs_prolog(const struct aco_compiler_options* options,
disasm.data(),
disasm.size());
}
void
aco_compile_ps_epilog(const struct aco_compiler_options* options,
const struct aco_shader_info* info,
const struct aco_ps_epilog_key* key,
const struct radv_shader_args* args,
aco_shader_part_callback* build_epilog,
void** binary)
{
aco::init();
ac_shader_config config = {0};
std::unique_ptr<aco::Program> program{new aco::Program};
program->collect_statistics = options->record_stats;
if (program->collect_statistics)
memset(program->statistics, 0, sizeof(program->statistics));
program->debug.func = options->debug.func;
program->debug.private_data = options->debug.private_data;
/* Instruction selection */
aco::select_ps_epilog(program.get(), key, &config, options, info, args);
aco_postprocess_shader(options, args, program);
/* assembly */
std::vector<uint32_t> code;
unsigned exec_size = aco::emit_program(program.get(), code);
bool get_disasm = options->dump_shader || options->record_ir;
std::string disasm;
if (get_disasm)
disasm = get_disasm_string(program.get(), code, exec_size);
(*build_epilog)(binary,
config.num_sgprs,
config.num_vgprs,
0,
code.data(),
code.size(),
disasm.data(),
disasm.size());
}

View File

@ -84,6 +84,13 @@ void aco_compile_vs_prolog(const struct aco_compiler_options* options,
aco_shader_part_callback *build_prolog,
void **binary);
void aco_compile_ps_epilog(const struct aco_compiler_options* options,
const struct aco_shader_info* info,
const struct aco_ps_epilog_key* key,
const struct radv_shader_args* args,
aco_shader_part_callback* build_epilog,
void** binary);
uint64_t aco_get_codegen_flags();
#ifdef __cplusplus

View File

@ -2206,6 +2206,12 @@ void select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key,
const struct radv_shader_args* args,
unsigned* num_preserved_sgprs);
void select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key,
ac_shader_config* config,
const struct aco_compiler_options* options,
const struct aco_shader_info* info,
const struct radv_shader_args* args);
void lower_phis(Program* program);
void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);