nir/flrp: Lower flrp(#a, #b, c) differently

If the magnitudes of #a and #b are such that (b-a) won't lose too much
precision, lower as a+c(b-a).

No changes on any other Intel platforms.

v2: Rebase on 424372e5dd5 ("nir: Use the flrp lowering pass instead of
nir_opt_algebraic")

Iron Lake and GM45 had similar results. (Iron Lake shown)
total instructions in shared programs: 8192503 -> 8192383 (<.01%)
instructions in affected programs: 18417 -> 18297 (-0.65%)
helped: 68
HURT: 0
helped stats (abs) min: 1 max: 18 x̄: 1.76 x̃: 1
helped stats (rel) min: 0.19% max: 7.89% x̄: 1.10% x̃: 0.43%
95% mean confidence interval for instructions value: -2.48 -1.05
95% mean confidence interval for instructions %-change: -1.56% -0.63%
Instructions are helped.

total cycles in shared programs: 188662536 -> 188661956 (<.01%)
cycles in affected programs: 744476 -> 743896 (-0.08%)
helped: 62
HURT: 0
helped stats (abs) min: 4 max: 60 x̄: 9.35 x̃: 6
helped stats (rel) min: 0.02% max: 4.84% x̄: 0.27% x̃: 0.06%
95% mean confidence interval for cycles value: -12.37 -6.34
95% mean confidence interval for cycles %-change: -0.48% -0.06%
Cycles are helped.

Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
Ian Romanick 2018-08-18 16:49:48 -07:00
parent dd7135d55d
commit 23c5501b77
1 changed files with 68 additions and 0 deletions

View File

@ -20,6 +20,7 @@
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <math.h>
#include "nir.h"
#include "nir_builder.h"
#include "util/u_vector.h"
@ -136,6 +137,58 @@ replace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp,
append_flrp_to_dead_list(dead_flrp, alu);
}
/**
 * Decide whether the first two sources of \p instr are constants whose
 * magnitudes are close enough that combining them loses little precision.
 *
 * For every component of the destination, the binary exponents (as reported
 * by frexp) of the swizzled values of src[0] and src[1] are compared.  The
 * function returns true only when all components are within the limit for
 * the destination's bit size.
 *
 * \return false when nir_src_as_const_value() yields NULL for either source,
 *         when any component pair has exponents that are too far apart, or
 *         when the destination bit size is neither 32 nor 64.
 */
static bool
sources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr)
{
   nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src);
   nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src);

   if (val0 == NULL || val1 == NULL)
      return false;

   const uint8_t *const swizzle0 = instr->src[0].swizzle;
   const uint8_t *const swizzle1 = instr->src[1].swizzle;
   const unsigned num_components = nir_dest_num_components(instr->dest.dest);
   const unsigned bit_size = instr->dest.dest.ssa.bit_size;

   if (bit_size == 32) {
      for (unsigned i = 0; i < num_components; i++) {
         int exp0;
         int exp1;

         frexpf(val0[swizzle0[i]].f32, &exp0);
         frexpf(val1[swizzle1[i]].f32, &exp1);

         /* If the difference between exponents is >= 24, then A+B will always
          * have the value of whichever of A and B has the largest absolute
          * value.  So, [0, 23] is the valid range.  The smaller the limit
          * value, the more precision will be maintained at a potential
          * performance cost.  Somewhat arbitrarily split the range in half.
          */
         if (abs(exp0 - exp1) > (23 / 2))
            return false;
      }
   } else if (bit_size == 64) {
      for (unsigned i = 0; i < num_components; i++) {
         int exp0;
         int exp1;

         frexp(val0[swizzle0[i]].f64, &exp0);
         frexp(val1[swizzle1[i]].f64, &exp1);

         /* If the difference between exponents is >= 53, then A+B will always
          * have the value of whichever of A and B has the largest absolute
          * value.  So, [0, 52] is the valid range.  The smaller the limit
          * value, the more precision will be maintained at a potential
          * performance cost.  Somewhat arbitrarily split the range in half.
          */
         if (abs(exp0 - exp1) > (52 / 2))
            return false;
      }
   } else {
      /* Only the f32 and f64 members of the constant are interpreted above.
       * Reading any other bit size (e.g. a 16-bit value) through .f64 would
       * compare exponents of a misinterpreted bit pattern against the
       * double-precision limit, so conservatively report "not similar" and
       * let the caller pick a different lowering.
       */
      return false;
   }

   return true;
}
static void
convert_flrp_instruction(nir_builder *bld,
struct u_vector *dead_flrp,
@ -197,6 +250,21 @@ convert_flrp_instruction(nir_builder *bld,
return;
}
/*
* - If x and y are both immediates and the relative magnitude of the
* values is similar (such that y-x does not lose too much precision):
*
* x + t(y - x)
*
* We rely on constant folding to eliminate y-x, and we rely on
* nir_opt_algebraic to possibly generate an FMA. The cost is either one
* FMA or two instructions.
*/
if (sources_are_constants_with_similar_magnitudes(alu)) {
replace_with_fast(bld, dead_flrp, alu);
return;
}
if (have_ffma) {
if (always_precise) {
replace_with_strict_ffma(bld, dead_flrp, alu);