diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index bda531ce8a3..6040ec78572 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -11046,10 +11046,16 @@ struct mrt_color_export {
    unsigned write_mask;
    Operand values[4];
    uint8_t col_format;
+
+   /* Fields below are only used for PS epilogs. */
+   bool is_int8;
+   bool is_int10;
+   bool enable_mrt_output_nan_fixup;
 };
 
 static bool
-export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out)
+export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out,
+                    bool is_ps_epilog)
 {
    Builder bld(ctx->program, ctx->block);
    Operand values[4];
@@ -11060,10 +11066,24 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out)
 
    unsigned target;
    unsigned enabled_channels = 0;
+   aco_opcode compr_op = aco_opcode::num_opcodes;
    bool compr = false;
 
    target = V_008DFC_SQ_EXP_MRT + out->slot;
 
+   /* PS epilogs only: replace NaN by zero (only 32-bit) to fix game bugs if requested. */
+   if (is_ps_epilog && out->enable_mrt_output_nan_fixup &&
+       (out->col_format == V_028714_SPI_SHADER_32_R || out->col_format == V_028714_SPI_SHADER_32_GR ||
+        out->col_format == V_028714_SPI_SHADER_32_AR || out->col_format == V_028714_SPI_SHADER_32_ABGR ||
+        out->col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+      u_foreach_bit(i, out->write_mask) {
+         Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.def(bld.lm), values[i],
+                               bld.copy(bld.def(v1), Operand::c32(3u)));
+         values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
+                              bld.copy(bld.def(v1), Operand::zero()), isnan);
+      }
+   }
+
    switch (out->col_format) {
    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
 
@@ -11081,21 +11101,116 @@ export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export *out)
       break;
 
    case V_028714_SPI_SHADER_FP16_ABGR:
-   case V_028714_SPI_SHADER_UNORM16_ABGR:
-   case V_028714_SPI_SHADER_SNORM16_ABGR:
-   case V_028714_SPI_SHADER_UINT16_ABGR:
-   case V_028714_SPI_SHADER_SINT16_ABGR:
-      enabled_channels = util_widen_mask(out->write_mask, 2);
+      if (is_ps_epilog) {
+         for (int i = 0; i < 2; i++) {
+            bool enabled = (out->write_mask >> (i * 2)) & 0x3;
+            if (enabled) {
+               enabled_channels |= 0x3 << (i * 2);
+               if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
+                  values[i] =
+                     bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
+                              values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
+                              values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
+               } else {
+                  values[i] =
+                     bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
+                              values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
+                              values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
+               }
+            } else {
+               values[i] = Operand(v1);
+            }
+         }
+         values[2] = Operand(v1);
+         values[3] = Operand(v1);
+      } else {
+         enabled_channels = util_widen_mask(out->write_mask, 2);
+      }
       compr = true;
       break;
 
+   case V_028714_SPI_SHADER_UNORM16_ABGR:
+      if (is_ps_epilog) {
+         compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+      } else {
+         enabled_channels = util_widen_mask(out->write_mask, 2);
+         compr = true;
+      }
+      break;
+
+   case V_028714_SPI_SHADER_SNORM16_ABGR:
+      if (is_ps_epilog) {
+         compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+      } else {
+         enabled_channels = util_widen_mask(out->write_mask, 2);
+         compr = true;
+      }
+      break;
+
+   case V_028714_SPI_SHADER_UINT16_ABGR:
+      if (is_ps_epilog) {
+         compr_op = aco_opcode::v_cvt_pk_u16_u32;
+         if (out->is_int8 || out->is_int10) {
+            /* Clamp to 8 bits, or to 10 bits with channel 3 limited to 2 bits. */
+            uint32_t max_rgb = out->is_int8 ? 255 : out->is_int10 ? 1023 : 0;
+
+            u_foreach_bit(i, out->write_mask) {
+               uint32_t max = i == 3 && out->is_int10 ? 3 : max_rgb;
+
+               values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
+            }
+         }
+      } else {
+         enabled_channels = util_widen_mask(out->write_mask, 2);
+         compr = true;
+      }
+      break;
+
+   case V_028714_SPI_SHADER_SINT16_ABGR:
+      if (is_ps_epilog) {
+         compr_op = aco_opcode::v_cvt_pk_i16_i32;
+         if (out->is_int8 || out->is_int10) {
+            /* Signed clamp to 8 bits, or to 10 bits with channel 3 limited to [-2, 1]. */
+            uint32_t max_rgb = out->is_int8 ? 127 : out->is_int10 ? 511 : 0;
+            uint32_t min_rgb = out->is_int8 ? -128 : out->is_int10 ? -512 : 0;
+
+            u_foreach_bit(i, out->write_mask) {
+               uint32_t max = i == 3 && out->is_int10 ? 1 : max_rgb;
+               uint32_t min = i == 3 && out->is_int10 ? -2u : min_rgb;
+
+               values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
+               values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
+            }
+         }
+      } else {
+         enabled_channels = util_widen_mask(out->write_mask, 2);
+         compr = true;
+      }
+      break;
+
    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
 
    case V_028714_SPI_SHADER_ZERO:
    default: return false;
    }
 
-   if (!compr) {
+   if (compr_op != aco_opcode::num_opcodes) {
+      for (int i = 0; i < 2; i++) {
+         /* check if at least one of the values to be compressed is enabled */
+         bool enabled = (out->write_mask >> (i * 2)) & 0x3;
+         if (enabled) {
+            enabled_channels |= 0x3 << (i * 2);
+            values[i] = bld.vop3(
+               compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
+               values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
+         } else {
+            values[i] = Operand(v1);
+         }
+      }
+      values[2] = Operand(v1);
+      values[3] = Operand(v1);
+      compr = true;
+   } else if (!compr) {
       for (int i = 0; i < 4; i++)
          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
    }
@@ -11210,7 +11325,7 @@ create_fs_exports(isel_context* ctx)
          }
       }
 
-      exported |= export_fs_mrt_color(ctx, &out);
+      exported |= export_fs_mrt_color(ctx, &out, false);
    }
 
    if (!exported)
@@ -11683,7 +11798,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
                const struct aco_shader_info* info,
                const struct radv_shader_args* args)
 {
-   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, args, false);
+   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, args, false, false);
    if_context ic_merged_wave_info;
    bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
 
@@ -11812,7 +11927,7 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_
                       const struct aco_shader_info* info,
                       const struct radv_shader_args* args)
 {
-   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, options, info, args, true);
+   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, options, info, args, true, false);
 
    ctx.block->fp_mode = program->next_fp_mode;
 
@@ -12293,4 +12408,67 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
 }
+
+/**
+ * Select instructions for a standalone pixel shader epilog.
+ *
+ * For each of the 8 MRT slots, the 4-bit export format is taken from
+ * key->spi_shader_col_format and slots set to SPI_SHADER_ZERO are skipped.
+ * The four channels of a slot are read from its ps_epilog_inputs[] argument
+ * and exported with a full write mask. create_fs_null_export() is called
+ * when no slot was exported.
+ */
+void
+select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key, ac_shader_config* config,
+                 const struct aco_compiler_options* options,
+                 const struct aco_shader_info* info,
+                 const struct radv_shader_args* args)
+{
+   isel_context ctx = setup_isel_context(program, 0, nullptr, config, options, info, args, false, true);
+
+   ctx.block->fp_mode = program->next_fp_mode;
+
+   add_startpgm(&ctx);
+   append_logical_start(ctx.block);
+
+   Builder bld(ctx.program, ctx.block);
+
+   /* Export all color render targets */
+   bool exported = false;
+
+   for (unsigned i = 0; i < 8; i++) {
+      unsigned col_format = (key->spi_shader_col_format >> (i * 4)) & 0xf;
+
+      if (col_format == V_028714_SPI_SHADER_ZERO)
+         continue;
+
+      struct mrt_color_export out;
+
+      out.slot = i;
+      out.write_mask = 0xf;
+      out.col_format = col_format;
+      out.is_int8 = (key->color_is_int8 >> i) & 1;
+      out.is_int10 = (key->color_is_int10 >> i) & 1;
+      out.enable_mrt_output_nan_fixup = (key->enable_mrt_output_nan_fixup >> i) & 1;
+
+      Temp inputs = get_arg(&ctx, ctx.args->ps_epilog_inputs[i]);
+      for (unsigned c = 0; c < 4; ++c) {
+         out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1));
+      }
+
+      exported |= export_fs_mrt_color(&ctx, &out, true);
+   }
+
+   if (!exported)
+      create_fs_null_export(&ctx);
+
+   program->config->float_mode = program->blocks[0].fp_mode.val;
+
+   append_logical_end(ctx.block);
+   ctx.block->kind |= block_kind_export_end;
+   bld.reset(ctx.block);
+   bld.sopp(aco_opcode::s_endpgm);
+
+   cleanup_cfg(program);
+}
 } // namespace aco
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index d700f61ae69..3b775901aaa 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -128,7 +128,8 @@ isel_context setup_isel_context(Program* program, unsigned shader_count,
                                 struct nir_shader* const* shaders, ac_shader_config* config,
                                 const struct aco_compiler_options* options,
                                 const struct aco_shader_info* info,
-                                const struct radv_shader_args* args, bool is_gs_copy_shader);
+                                const struct radv_shader_args* args,
+                                bool is_gs_copy_shader, bool is_ps_epilog);
 
 } // namespace aco
 
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 719c5b8b978..d61620cb7d0 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -825,7 +825,8 @@ isel_context
 setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
                    ac_shader_config* config, const struct aco_compiler_options* options,
                    const struct aco_shader_info* info,
-                   const struct radv_shader_args* args, bool is_gs_copy_shader)
+                   const struct radv_shader_args* args, bool is_gs_copy_shader,
+                   bool is_ps_epilog)
 {
    SWStage sw_stage = SWStage::None;
    for (unsigned i = 0; i < shader_count; i++) {
@@ -843,6 +844,12 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
       default: unreachable("Shader stage not implemented");
       }
    }
+
+   if (is_ps_epilog) {
+      assert(shader_count == 0 && !shaders);
+      sw_stage = SWStage::FS;
+   }
+
    bool gfx9_plus = options->gfx_level >= GFX9;
    bool ngg = info->is_ngg && options->gfx_level >= GFX10;
    HWStage hw_stage{};
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index a115a16d9bc..f98a2375390 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -314,3 +314,56 @@ aco_compile_vs_prolog(const struct aco_compiler_options* options,
                    disasm.data(),
                    disasm.size());
 }
+
+/**
+ * Compile a pixel shader epilog described by key and pass the machine code
+ * to build_epilog.
+ *
+ * The disassembly is only generated when options->dump_shader or
+ * options->record_ir is set; otherwise the callback receives an empty
+ * string.
+ */
+void
+aco_compile_ps_epilog(const struct aco_compiler_options* options,
+                      const struct aco_shader_info* info,
+                      const struct aco_ps_epilog_key* key,
+                      const struct radv_shader_args* args,
+                      aco_shader_part_callback* build_epilog,
+                      void** binary)
+{
+   aco::init();
+
+   ac_shader_config config = {0};
+   std::unique_ptr<aco::Program> program{new aco::Program};
+
+   program->collect_statistics = options->record_stats;
+   if (program->collect_statistics)
+      memset(program->statistics, 0, sizeof(program->statistics));
+
+   program->debug.func = options->debug.func;
+   program->debug.private_data = options->debug.private_data;
+
+   /* Instruction selection */
+   aco::select_ps_epilog(program.get(), key, &config, options, info, args);
+
+   aco_postprocess_shader(options, args, program);
+
+   /* assembly */
+   std::vector<uint32_t> code;
+   unsigned exec_size = aco::emit_program(program.get(), code);
+
+   bool get_disasm = options->dump_shader || options->record_ir;
+
+   std::string disasm;
+   if (get_disasm)
+      disasm = get_disasm_string(program.get(), code, exec_size);
+
+   (*build_epilog)(binary,
+                   config.num_sgprs,
+                   config.num_vgprs,
+                   0,
+                   code.data(),
+                   code.size(),
+                   disasm.data(),
+                   disasm.size());
+}
diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h
index 94afa080c98..c404038c42d 100644
--- a/src/amd/compiler/aco_interface.h
+++ b/src/amd/compiler/aco_interface.h
@@ -84,6 +84,13 @@ void aco_compile_vs_prolog(const struct aco_compiler_options* options,
                            aco_shader_part_callback *build_prolog,
                            void **binary);
 
+void aco_compile_ps_epilog(const struct aco_compiler_options* options,
+                           const struct aco_shader_info* info,
+                           const struct aco_ps_epilog_key* key,
+                           const struct radv_shader_args* args,
+                           aco_shader_part_callback* build_epilog,
+                           void** binary);
+
 uint64_t aco_get_codegen_flags();
 
 #ifdef __cplusplus
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index f2721aa0d95..968ad1cf274 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2206,6 +2206,12 @@ void select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key,
                       const struct radv_shader_args* args,
                       unsigned* num_preserved_sgprs);
 
+void select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key,
+                      ac_shader_config* config,
+                      const struct aco_compiler_options* options,
+                      const struct aco_shader_info* info,
+                      const struct radv_shader_args* args);
+
 void lower_phis(Program* program);
 void calc_min_waves(Program* program);
 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);