pan/bi: Optimize abs(derivative)

We implement fine derivatives as: `broadcast(x, (lane & ~1) + 1) - broadcast(x, lane & ~1)`. Most of the complexity is to get the right sign. If we can ignore the sign, we can generate the simpler code: `broadcast(x, lane ^ 1) - x`. This is a particular win on v7+ where the broadcast instruction (CLPER) can do `lane ^ value` for free. However, even on v6 where we lower to an explicit XOR instruction, it's still a win. The limiting case is fwidth. The fragment shader gl_FragColor = fwidth(vec4_varying); has the following results on v6, v7, and v9: G72 (-26% instructions, -43% cycles): 38 inst, 30 tuples, 5 clauses, 1.166667 cycles, 1.166667 arith, 28 quadwords 28 inst, 19 tuples, 4 clauses, 0.666667 cycles, 0.666667 arith, 19 quadwords G76 (-37% instructions, -54% cycles): 38 inst, 30 tuples, 5 clauses, 1.166667 cycles, 1.166667 arith, 28 quadwords 24 inst, 16 tuples, 4 clauses, 0.541667 cycles, 0.541667 arith, 18 quadwords G78 (-40% instructions, -56% cycles): 40 inst, 1.125000 cycles, 0.250000 fma, 0.109375 cvt, 1.125000 sfu, 20 quadwords 24 inst, 0.500000 cycles, 0.250000 fma, 0.015625 cvt, 0.500000 sfu, 12 quadwords shader-db tells a similar story -- most shaders are unaffected, but a shader that uses fwidth has a 20% reduction in cycle count: instructions helped: shaders/tesseract/488.shader_test MESA_SHADER_FRAGMENT: 264 -> 262 (-0.76%) instructions helped: shaders/chromeos/109-1.shader_test MESA_SHADER_FRAGMENT: 36 -> 28 (-22.22%) tuples helped: shaders/chromeos/109-1.shader_test MESA_SHADER_FRAGMENT: 27 -> 22 (-18.52%) tuples HURT: shaders/tesseract/488.shader_test MESA_SHADER_FRAGMENT: 211 -> 212 (0.47%) clauses HURT: shaders/tesseract/488.shader_test MESA_SHADER_FRAGMENT: 32 -> 33 (3.12%) cycles helped: shaders/chromeos/109-1.shader_test MESA_SHADER_FRAGMENT: 1 -> 0.79 (-20.83%) arith helped: shaders/chromeos/109-1.shader_test MESA_SHADER_FRAGMENT: 1 -> 0.79 (-20.83%) quadwords helped: shaders/chromeos/109-1.shader_test MESA_SHADER_FRAGMENT: 31 -> 28 (-9.68%) 
quadwords HURT: shaders/tesseract/488.shader_test MESA_SHADER_FRAGMENT: 176 -> 178 (1.14%) total instructions in shared programs: 148370 -> 148360 (<.01%) instructions in affected programs: 300 -> 290 (-3.33%) helped: 2 HURT: 0 total tuples in shared programs: 124188 -> 124184 (<.01%) tuples in affected programs: 238 -> 234 (-1.68%) helped: 1 HURT: 1 helped stats (abs) min: 5.0 max: 5.0 x̄: 5.00 x̃: 5 helped stats (rel) min: 18.52% max: 18.52% x̄: 18.52% x̃: 18.52% HURT stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 HURT stats (rel) min: 0.47% max: 0.47% x̄: 0.47% x̃: 0.47% total clauses in shared programs: 25692 -> 25693 (<.01%) clauses in affected programs: 32 -> 33 (3.12%) helped: 0 HURT: 1 total cycles in shared programs: 12132.04 -> 12131.83 (<.01%) cycles in affected programs: 1 -> 0.79 (-20.83%) helped: 1 HURT: 0 total arith in shared programs: 4623.75 -> 4623.54 (<.01%) arith in affected programs: 1 -> 0.79 (-20.83%) helped: 1 HURT: 0 total quadwords in shared programs: 110386 -> 110385 (<.01%) quadwords in affected programs: 207 -> 206 (-0.48%) helped: 1 HURT: 1 helped stats (abs) min: 3.0 max: 3.0 x̄: 3.00 x̃: 3 helped stats (rel) min: 9.68% max: 9.68% x̄: 9.68% x̃: 9.68% HURT stats (abs) min: 2.0 max: 2.0 x̄: 2.00 x̃: 2 HURT stats (rel) min: 1.14% max: 1.14% x̄: 1.14% x̃: 1.14% Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12332>
2021-10-02 09:02:36 -04:00 · 2021-10-02 09:02:36 -04:00 · c00e7b729f
parent 3e8f540753
commit c00e7b729f
2 changed files with 28 additions and 0 deletions
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@ -1696,6 +1696,23 @@ bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
 }

+/* The XOR lane op is useful for derivative calculation, but was added in v7.
+ * Add a safe helper that will do the appropriate lowering on v6.
+ *
+ * Emits a CLPER that reads s0 from the lane selected by XORing the current
+ * lane ID with s1, and returns the index produced by that instruction.
+ *
+ *   b:  builder the instructions are emitted with; b->shader->arch selects
+ *       which of the two code paths is taken.
+ *   s0: value to fetch from the other lane.
+ *   s1: value XORed with the lane ID to pick the source lane (the abs
+ *       derivative lowering passes an immediate 1 or 2).
+ *
+ * On arch >= 7 this is a single CLPER using BI_LANE_OP_XOR. Otherwise the XOR
+ * is emitted explicitly on the lane ID read from FAU, and the resulting lane
+ * is handed to the v6 form of CLPER. */
+
+static bi_index
+bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
+{
+        if (b->shader->arch >= 7) { /* XOR lane op usable directly */
+                return bi_clper_i32(b, s0, s1,
+                                BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
+                                BI_SUBGROUP_SUBGROUP4);
+        }
+
+        bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false); /* older arch: compute the lane by hand */
+        bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0)); /* shift amount 0, so just an XOR */
+        return bi_clper_v6_i32(b, s0, lane);
+}
+
 static bi_instr *
 bi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op,
      bi_index dst, bi_index s0, bi_index s1, bi_index s2)
@ -2011,6 +2028,14 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                                s0, s1, s0, s1, BI_CMPF_GT);
                break;

+        case nir_op_fddx_must_abs_mali:
+        case nir_op_fddy_must_abs_mali: {
+                bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
+                bi_index adjacent = bi_clper_xor(b, s0, bit);
+                bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE);
+                break;
+        }
+
        case nir_op_fddx:
        case nir_op_fddy: {
                bi_index lane1 = bi_lshift_and_i32(b,
--- a/src/panfrost/bifrost/bifrost_nir_algebraic.py
+++ b/src/panfrost/bifrost/bifrost_nir_algebraic.py
@ -36,6 +36,9 @@ algebraic_late = [
    (('fmin', ('fmax', a, -1.0), 1.0), ('fsat_signed_mali', a)),
    (('fmax', ('fmin', a, 1.0), -1.0), ('fsat_signed_mali', a)),
    (('fmax', a, 0.0), ('fclamp_pos_mali', a)),
+
+    (('fabs', ('fddx', a)), ('fabs', ('fddx_must_abs_mali', a))),
+    (('fabs', ('fddy', b)), ('fabs', ('fddy_must_abs_mali', b))),
 ]