nir: split fuse_ffma into fuse_ffma16/32/64

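AMD wants different ffma fusing behavior for each bit size (16, 32 and 64 bits),
so the single fuse_ffma option is split into one option per bit size.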

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6756>
This commit is contained in:
Marek Olšák 2020-09-16 21:48:18 -04:00 committed by Marge Bot
parent c58b46edf0
commit 21174dedec
10 changed files with 39 additions and 13 deletions

View File

@ -3056,7 +3056,9 @@ typedef enum {
typedef struct nir_shader_compiler_options {
bool lower_fdiv;
bool lower_ffma;
bool fuse_ffma;
bool fuse_ffma16;
bool fuse_ffma32;
bool fuse_ffma64;
bool lower_flrp16;
bool lower_flrp32;
/** Lowers flrp when it does not support doubles */

View File

@ -195,7 +195,9 @@ optimizations.extend([
(('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
(('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
# Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
(('~ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma'),
(('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
(('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
(('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
(('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
@ -2028,7 +2030,9 @@ late_optimizations = [
(('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
(('ineg', a), ('isub', 0, a), 'options->lower_negate'),
(('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
(('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
(('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
(('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
# These are duplicated from the main optimizations table. The late
# patterns that rearrange expressions like x - .5 < 0 to x < .5 can create

View File

@ -47,7 +47,9 @@ static const nir_shader_compiler_options options = {
.lower_usub_borrow = true,
.lower_mul_high = true,
.lower_mul_2x32_64 = true,
.fuse_ffma = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
.vertex_id_zero_based = true,
.lower_extract_byte = true,
.lower_extract_word = true,
@ -97,7 +99,9 @@ static const nir_shader_compiler_options options_a6xx = {
.lower_usub_borrow = true,
.lower_mul_high = true,
.lower_mul_2x32_64 = true,
.fuse_ffma = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
.vertex_id_zero_based = false,
.lower_extract_byte = true,
.lower_extract_word = true,

View File

@ -1004,7 +1004,9 @@ etna_screen_create(struct etna_device *dev, struct etna_gpu *gpu,
.lower_fpow = true,
.lower_sub = true,
.lower_ftrunc = true,
.fuse_ffma = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
.lower_bitops = true,
.lower_all_io_to_temps = true,
.vertex_id_zero_based = true,

View File

@ -35,7 +35,9 @@ static const nir_shader_compiler_options options = {
.lower_fmod = true,
.lower_fdiv = true,
.lower_fceil = true,
.fuse_ffma = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
/* .fdot_replicates = true, it is replicated, but it makes things worse */
.lower_all_io_to_temps = true,
.vertex_id_zero_based = true, /* it's not implemented anyway */

View File

@ -3207,7 +3207,9 @@ nvir_nir_shader_compiler_options(int chipset)
nir_shader_compiler_options op = {};
op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET);
op.lower_ffma = false;
op.fuse_ffma = false; /* nir doesn't track mad vs fma */
op.fuse_ffma16 = false; /* nir doesn't track mad vs fma */
op.fuse_ffma32 = false; /* nir doesn't track mad vs fma */
op.fuse_ffma64 = false; /* nir doesn't track mad vs fma */
op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET);
op.lower_flrp32 = true;
op.lower_flrp64 = true;

View File

@ -923,7 +923,9 @@ int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
}
static const nir_shader_compiler_options nir_options = {
.fuse_ffma = false, /* nir doesn't track mad vs fma */
.fuse_ffma16 = false, /* nir doesn't track mad vs fma */
.fuse_ffma32 = false, /* nir doesn't track mad vs fma */
.fuse_ffma64 = false, /* nir doesn't track mad vs fma */
.lower_flrp32 = true,
.lower_flrp64 = true,
.lower_fpow = false,

View File

@ -1179,7 +1179,9 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
}
const struct nir_shader_compiler_options r600_nir_fs_options = {
.fuse_ffma = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
.lower_scmp = true,
.lower_flrp32 = true,
.lower_flrp64 = true,
@ -1203,7 +1205,9 @@ const struct nir_shader_compiler_options r600_nir_fs_options = {
};
const struct nir_shader_compiler_options r600_nir_options = {
.fuse_ffma = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
.lower_scmp = true,
.lower_flrp32 = true,
.lower_flrp64 = true,

View File

@ -946,7 +946,9 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
* for gfx10.3 on gfx10.
*/
.lower_ffma = sscreen->info.chip_class <= GFX9,
.fuse_ffma = sscreen->info.chip_class >= GFX10,
.fuse_ffma16 = sscreen->info.chip_class >= GFX10,
.fuse_ffma32 = sscreen->info.chip_class >= GFX10,
.fuse_ffma64 = sscreen->info.chip_class >= GFX10,
.lower_fmod = true,
.lower_pack_snorm_4x8 = true,
.lower_pack_unorm_4x8 = true,

View File

@ -69,7 +69,9 @@ static const nir_shader_compiler_options bifrost_nir_options = {
.lower_bitfield_extract_to_shifts = true,
.vectorize_io = true,
.fuse_ffma = true,
.fuse_ffma16 = true,
.fuse_ffma32 = true,
.fuse_ffma64 = true,
.use_interpolated_input_intrinsics = true
};