nir/flrp: Lower flrp(#a, #b, c) differently
If the magnitudes of #a and #b are such that (b-a) won't lose too much precision, lower as a+c(b-a). No changes on any other Intel platforms. v2: Rebase on 424372e5dd5 ("nir: Use the flrp lowering pass instead of nir_opt_algebraic") Iron Lake and GM45 had similar results. (Iron Lake shown) total instructions in shared programs: 8192503 -> 8192383 (<.01%) instructions in affected programs: 18417 -> 18297 (-0.65%) helped: 68 HURT: 0 helped stats (abs) min: 1 max: 18 x̄: 1.76 x̃: 1 helped stats (rel) min: 0.19% max: 7.89% x̄: 1.10% x̃: 0.43% 95% mean confidence interval for instructions value: -2.48 -1.05 95% mean confidence interval for instructions %-change: -1.56% -0.63% Instructions are helped. total cycles in shared programs: 188662536 -> 188661956 (<.01%) cycles in affected programs: 744476 -> 743896 (-0.08%) helped: 62 HURT: 0 helped stats (abs) min: 4 max: 60 x̄: 9.35 x̃: 6 helped stats (rel) min: 0.02% max: 4.84% x̄: 0.27% x̃: 0.06% 95% mean confidence interval for cycles value: -12.37 -6.34 95% mean confidence interval for cycles %-change: -0.48% -0.06% Cycles are helped. Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
parent
dd7135d55d
commit
23c5501b77
|
@ -20,6 +20,7 @@
|
|||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
#include <math.h>
|
||||
#include "nir.h"
|
||||
#include "nir_builder.h"
|
||||
#include "util/u_vector.h"
|
||||
|
@ -136,6 +137,58 @@ replace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp,
|
|||
append_flrp_to_dead_list(dead_flrp, alu);
|
||||
}
|
||||
|
||||
static bool
|
||||
sources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr)
|
||||
{
|
||||
nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src);
|
||||
nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src);
|
||||
|
||||
if (val0 == NULL || val1 == NULL)
|
||||
return false;
|
||||
|
||||
const uint8_t *const swizzle0 = instr->src[0].swizzle;
|
||||
const uint8_t *const swizzle1 = instr->src[1].swizzle;
|
||||
const unsigned num_components = nir_dest_num_components(instr->dest.dest);
|
||||
|
||||
if (instr->dest.dest.ssa.bit_size == 32) {
|
||||
for (unsigned i = 0; i < num_components; i++) {
|
||||
int exp0;
|
||||
int exp1;
|
||||
|
||||
frexpf(val0[swizzle0[i]].f32, &exp0);
|
||||
frexpf(val1[swizzle1[i]].f32, &exp1);
|
||||
|
||||
/* If the difference between exponents is >= 24, then A+B will always
|
||||
* have the value whichever between A and B has the largest absolute
|
||||
* value. So, [0, 23] is the valid range. The smaller the limit
|
||||
* value, the more precision will be maintained at a potential
|
||||
* performance cost. Somewhat arbitrarilly split the range in half.
|
||||
*/
|
||||
if (abs(exp0 - exp1) > (23 / 2))
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
for (unsigned i = 0; i < num_components; i++) {
|
||||
int exp0;
|
||||
int exp1;
|
||||
|
||||
frexp(val0[swizzle0[i]].f64, &exp0);
|
||||
frexp(val1[swizzle1[i]].f64, &exp1);
|
||||
|
||||
/* If the difference between exponents is >= 53, then A+B will always
|
||||
* have the value whichever between A and B has the largest absolute
|
||||
* value. So, [0, 52] is the valid range. The smaller the limit
|
||||
* value, the more precision will be maintained at a potential
|
||||
* performance cost. Somewhat arbitrarilly split the range in half.
|
||||
*/
|
||||
if (abs(exp0 - exp1) > (52 / 2))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
convert_flrp_instruction(nir_builder *bld,
|
||||
struct u_vector *dead_flrp,
|
||||
|
@ -197,6 +250,21 @@ convert_flrp_instruction(nir_builder *bld,
|
|||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* - If x and y are both immediates and the relative magnitude of the
|
||||
* values is similar (such that x-y does not lose too much precision):
|
||||
*
|
||||
* x + t(y - x)
|
||||
*
|
||||
* We rely on constant folding to eliminate x-y, and we rely on
|
||||
* nir_opt_algebraic to possibly generate an FMA. The cost is either one
|
||||
* FMA or two instructions.
|
||||
*/
|
||||
if (sources_are_constants_with_similar_magnitudes(alu)) {
|
||||
replace_with_fast(bld, dead_flrp, alu);
|
||||
return;
|
||||
}
|
||||
|
||||
if (have_ffma) {
|
||||
if (always_precise) {
|
||||
replace_with_strict_ffma(bld, dead_flrp, alu);
|
||||
|
|
Loading…
Reference in New Issue