From 23c5501b77efdb1f071709c23ed21f64f8b9cb00 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Sat, 18 Aug 2018 16:49:48 -0700 Subject: [PATCH] nir/flrp: Lower flrp(#a, #b, c) differently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the magnitudes of #a and #b are such that (b-a) won't lose too much precision, lower as a+c(b-a). No changes on any other Intel platforms. v2: Rebase on 424372e5dd5 ("nir: Use the flrp lowering pass instead of nir_opt_algebraic") Iron Lake and GM45 had similar results. (Iron Lake shown) total instructions in shared programs: 8192503 -> 8192383 (<.01%) instructions in affected programs: 18417 -> 18297 (-0.65%) helped: 68 HURT: 0 helped stats (abs) min: 1 max: 18 x̄: 1.76 x̃: 1 helped stats (rel) min: 0.19% max: 7.89% x̄: 1.10% x̃: 0.43% 95% mean confidence interval for instructions value: -2.48 -1.05 95% mean confidence interval for instructions %-change: -1.56% -0.63% Instructions are helped. total cycles in shared programs: 188662536 -> 188661956 (<.01%) cycles in affected programs: 744476 -> 743896 (-0.08%) helped: 62 HURT: 0 helped stats (abs) min: 4 max: 60 x̄: 9.35 x̃: 6 helped stats (rel) min: 0.02% max: 4.84% x̄: 0.27% x̃: 0.06% 95% mean confidence interval for cycles value: -12.37 -6.34 95% mean confidence interval for cycles %-change: -0.48% -0.06% Cycles are helped. Reviewed-by: Matt Turner --- src/compiler/nir/nir_lower_flrp.c | 68 +++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/src/compiler/nir/nir_lower_flrp.c b/src/compiler/nir/nir_lower_flrp.c index 2d57998b41d..952068ec9cc 100644 --- a/src/compiler/nir/nir_lower_flrp.c +++ b/src/compiler/nir/nir_lower_flrp.c @@ -20,6 +20,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ +#include #include "nir.h" #include "nir_builder.h" #include "util/u_vector.h" @@ -136,6 +137,58 @@ replace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp, append_flrp_to_dead_list(dead_flrp, alu); } +static bool +sources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr) +{ + nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src); + nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src); + + if (val0 == NULL || val1 == NULL) + return false; + + const uint8_t *const swizzle0 = instr->src[0].swizzle; + const uint8_t *const swizzle1 = instr->src[1].swizzle; + const unsigned num_components = nir_dest_num_components(instr->dest.dest); + + if (instr->dest.dest.ssa.bit_size == 32) { + for (unsigned i = 0; i < num_components; i++) { + int exp0; + int exp1; + + frexpf(val0[swizzle0[i]].f32, &exp0); + frexpf(val1[swizzle1[i]].f32, &exp1); + + /* If the difference between exponents is >= 24, then A+B will always + * have the value of whichever of A and B has the larger absolute + * value. So, [0, 23] is the valid range. The smaller the limit + * value, the more precision will be maintained at a potential + * performance cost. Somewhat arbitrarily split the range in half. + */ + if (abs(exp0 - exp1) > (23 / 2)) + return false; + } + } else { + for (unsigned i = 0; i < num_components; i++) { + int exp0; + int exp1; + + frexp(val0[swizzle0[i]].f64, &exp0); + frexp(val1[swizzle1[i]].f64, &exp1); + + /* If the difference between exponents is >= 53, then A+B will always + * have the value of whichever of A and B has the larger absolute + * value. So, [0, 52] is the valid range. The smaller the limit + * value, the more precision will be maintained at a potential + * performance cost. Somewhat arbitrarily split the range in half. 
+ */ + if (abs(exp0 - exp1) > (52 / 2)) + return false; + } + } + + return true; +} + static void convert_flrp_instruction(nir_builder *bld, struct u_vector *dead_flrp, @@ -197,6 +250,21 @@ convert_flrp_instruction(nir_builder *bld, return; } + /* + * - If x and y are both immediates and the relative magnitude of the + * values is similar (such that y-x does not lose too much precision): + * + * x + t(y - x) + * + * We rely on constant folding to eliminate y-x, and we rely on + * nir_opt_algebraic to possibly generate an FMA. The cost is either one + * FMA or two instructions. + */ + if (sources_are_constants_with_similar_magnitudes(alu)) { + replace_with_fast(bld, dead_flrp, alu); + return; + } + if (have_ffma) { if (always_precise) { replace_with_strict_ffma(bld, dead_flrp, alu);