pan/bi: Lower stores with component != 0

If the shader packs multiple varyings into the same location with different
location_frac values, we need to lower to a single varying store that collects
all of the channels together. This is not trivial during codegen, but it is
trivial to do in NIR right before codegen, relying on
nir_lower_io_to_temporaries. Since we are guaranteed all varyings will be
written exactly once, in the exit block, we can scan the shader linearly and
collect the stores together in a single pass.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11123>
This commit is contained in:
parent
de42707101
commit
95458c4033
|
@ -14,19 +14,7 @@ dEQP-GLES31.functional.draw_indirect.draw_elements_indirect.line_strip.instanced
dEQP-GLES31.functional.draw_indirect.random.31,Fail
dEQP-GLES31.functional.layout_binding.image.image2d.vertex_binding_max_array,Fail
dEQP-GLES31.functional.layout_binding.image.image3d.vertex_binding_max_array,Fail
dEQP-GLES31.functional.separate_shader.random.22,Fail
dEQP-GLES31.functional.separate_shader.random.23,Fail
dEQP-GLES31.functional.separate_shader.random.35,Fail
dEQP-GLES31.functional.separate_shader.random.68,Fail
dEQP-GLES31.functional.separate_shader.random.79,Fail
dEQP-GLES31.functional.separate_shader.random.80,Fail
dEQP-GLES31.functional.separate_shader.random.89,Fail
dEQP-GLES31.functional.draw_base_vertex.draw_elements_base_vertex.builtin_variable.vertex_id,Fail
dEQP-GLES31.functional.draw_base_vertex.draw_elements_instanced_base_vertex.builtin_variable.vertex_id,Fail
dEQP-GLES31.functional.draw_base_vertex.draw_range_elements_base_vertex.builtin_variable.vertex_id,Fail
dEQP-GLES31.functional.separate_shader.interface.same_location_vertex_flat_fragment_flat,Fail
dEQP-GLES31.functional.separate_shader.interface.same_location_vertex_smooth_fragment_centroid,Fail
dEQP-GLES31.functional.separate_shader.interface.same_name_vertex_flat_fragment_flat,Fail
dEQP-GLES31.functional.separate_shader.pipeline.different_constant_separate_programs_add_fragment,Fail
dEQP-GLES31.functional.separate_shader.pipeline.same_constant_separate_programs_add_both,Fail
dEQP-GLES31.functional.separate_shader.program_uniform.separate_programs_add_fragment,Fail
|
|
@ -3199,6 +3199,64 @@ bi_opt_post_ra(bi_context *ctx)
|
|||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
bifrost_nir_lower_store_component(struct nir_builder *b,
|
||||
nir_instr *instr, void *data)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
if (intr->intrinsic != nir_intrinsic_store_output)
|
||||
return false;
|
||||
|
||||
struct hash_table_u64 *slots = data;
|
||||
unsigned component = nir_intrinsic_component(intr);
|
||||
nir_src *slot_src = nir_get_io_offset_src(intr);
|
||||
uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
|
||||
|
||||
nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
|
||||
unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
|
||||
|
||||
nir_ssa_def *value = intr->src[0].ssa;
|
||||
b->cursor = nir_before_instr(&intr->instr);
|
||||
|
||||
nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size);
|
||||
nir_ssa_def *channels[4] = { undef, undef, undef, undef };
|
||||
|
||||
/* Copy old */
|
||||
u_foreach_bit(i, mask) {
|
||||
assert(prev != NULL);
|
||||
nir_ssa_def *prev_ssa = prev->src[0].ssa;
|
||||
channels[i] = nir_channel(b, prev_ssa, i);
|
||||
}
|
||||
|
||||
/* Copy new */
|
||||
unsigned new_mask = nir_intrinsic_write_mask(intr);
|
||||
mask |= (new_mask << component);
|
||||
|
||||
u_foreach_bit(i, new_mask) {
|
||||
assert(component + i < 4);
|
||||
channels[component + i] = nir_channel(b, value, i);
|
||||
}
|
||||
|
||||
intr->num_components = util_last_bit(mask);
|
||||
nir_instr_rewrite_src_ssa(instr, &intr->src[0],
|
||||
nir_vec(b, channels, intr->num_components));
|
||||
|
||||
nir_intrinsic_set_component(intr, 0);
|
||||
nir_intrinsic_set_write_mask(intr, mask);
|
||||
|
||||
if (prev) {
|
||||
_mesa_hash_table_u64_remove(slots, slot);
|
||||
nir_instr_remove(&prev->instr);
|
||||
}
|
||||
|
||||
_mesa_hash_table_u64_insert(slots, slot, intr);
|
||||
return false;
|
||||
}
/* Dead code elimination for branches at the end of a block - only one branch
 * per block is legal semantically, but unreachable jumps can be generated.
 * Likewise we can generate jumps to the terminal block which need to be
@ -3273,6 +3331,12 @@ bifrost_compile_shader_nir(nir_shader *nir,
|
|||
if (ctx->stage == MESA_SHADER_FRAGMENT) {
|
||||
NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
|
||||
~0, false);
|
||||
} else {
|
||||
struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx);
|
||||
NIR_PASS_V(nir, nir_shader_instructions_pass,
|
||||
bifrost_nir_lower_store_component,
|
||||
nir_metadata_block_index |
|
||||
nir_metadata_dominance, stores);
|
||||
}
|
||||
|
||||
NIR_PASS_V(nir, nir_lower_ssbo);
|
||||
|
|
Loading…
Reference in New Issue