lavapipe: implement extreme uniform inlining

this adds a mangled variation on nir_inline_uniforms that enables inlining
from any uniform buffer in order to try inlining every possible load

if the shader is too small or the ssa_alloc delta from inlining is too small,
then inlining is disabled for that shader to avoid pointlessly churning
the same shaders for no gain
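
in code terms, the "worth it" check added in update_inline_shader_state below boils
down to this; a minimal standalone sketch, where inlining_worthwhile is an
illustrative name rather than a function in the patch (shaders with ssa_alloc <= 100
never run the analysis at all, see lvp_shader_compile_to_ir):

static bool
inlining_worthwhile(unsigned ssa_before, unsigned ssa_after, bool must_inline)
{
   /* keep the specialized shader only if optimizing with the inlined values
    * removed at least half of the SSA defs, or if the pipeline was flagged
    * as must_inline when the uniforms were found at pipeline creation */
   return must_inline || ssa_before - ssa_after >= ssa_before / 2;
}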

with certain types of shaders, the speedup is astronomical

before:
dEQP-VK.graphicsfuzz.cov-int-initialize-from-multiple-large-arrays (4750.76s)

after:
dEQP-VK.graphicsfuzz.cov-int-initialize-from-multiple-large-arrays (0.505s)

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17722>
Mike Blumenkrantz 2022-06-27 14:02:46 -04:00 committed by Marge Bot
parent 1a244e1394
commit d23a9380dd
5 changed files with 608 additions and 16 deletions

src/gallium/frontends/lavapipe/lvp_execute.c

@@ -83,6 +83,7 @@ struct rendering_state {
bool constbuf_dirty[PIPE_SHADER_TYPES];
bool pcbuf_dirty[PIPE_SHADER_TYPES];
bool has_pcbuf[PIPE_SHADER_TYPES];
bool inlines_dirty[PIPE_SHADER_TYPES];
bool vp_dirty;
bool scissor_dirty;
bool ib_dirty;
@@ -177,6 +178,8 @@ struct rendering_state {
uint32_t num_so_targets;
struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_BUFFERS];
uint32_t so_offsets[PIPE_MAX_SO_BUFFERS];
struct lvp_pipeline *pipeline[2];
};
ALWAYS_INLINE static void
@@ -256,6 +259,95 @@ update_pcbuf(struct rendering_state *state, enum pipe_shader_type pstage)
state->pcbuf_dirty[pstage] = false;
}
static void
update_inline_shader_state(struct rendering_state *state, enum pipe_shader_type sh, bool pcbuf_dirty, bool constbuf_dirty)
{
bool is_compute = sh == PIPE_SHADER_COMPUTE;
uint32_t inline_uniforms[MAX_INLINABLE_UNIFORMS];
unsigned stage = tgsi_processor_to_shader_stage(sh);
state->inlines_dirty[sh] = false;
if (!state->pipeline[is_compute]->inlines[stage].can_inline)
return;
struct lvp_pipeline *pipeline = state->pipeline[is_compute];
/* these buffers have already been flushed in llvmpipe, so they're safe to read */
nir_shader *nir = nir_shader_clone(pipeline->pipeline_nir[stage], pipeline->pipeline_nir[stage]);
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
unsigned ssa_alloc = impl->ssa_alloc;
unsigned count = pipeline->inlines[stage].count[0];
if (count && pcbuf_dirty) {
unsigned push_size = get_pcbuf_size(state, sh);
for (unsigned i = 0; i < count; i++) {
unsigned offset = pipeline->inlines[stage].uniform_offsets[0][i];
if (offset < push_size) {
memcpy(&inline_uniforms[i], &state->push_constants[offset], sizeof(uint32_t));
} else {
for (unsigned i = 0; i < state->uniform_blocks[sh].count; i++) {
if (offset < push_size + state->uniform_blocks[sh].size[i]) {
unsigned ubo_offset = offset - push_size;
uint8_t *block = state->uniform_blocks[sh].block[i];
memcpy(&inline_uniforms[i], &block[ubo_offset], sizeof(uint32_t));
break;
}
push_size += state->uniform_blocks[sh].size[i];
}
}
}
NIR_PASS_V(nir, lvp_inline_uniforms, pipeline, inline_uniforms, 0);
}
if (constbuf_dirty) {
struct pipe_box box = {0};
u_foreach_bit(slot, pipeline->inlines[stage].can_inline) {
unsigned count = pipeline->inlines[stage].count[slot];
struct pipe_constant_buffer *cbuf = &state->const_buffer[sh][slot - 1];
struct pipe_resource *pres = cbuf->buffer;
box.x = cbuf->buffer_offset;
box.width = cbuf->buffer_size - cbuf->buffer_offset;
struct pipe_transfer *xfer;
uint8_t *map = state->pctx->buffer_map(state->pctx, pres, 0, PIPE_MAP_READ, &box, &xfer);
for (unsigned i = 0; i < count; i++) {
unsigned offset = pipeline->inlines[stage].uniform_offsets[slot][i];
memcpy(&inline_uniforms[i], map + offset, sizeof(uint32_t));
}
state->pctx->buffer_unmap(state->pctx, xfer);
NIR_PASS_V(nir, lvp_inline_uniforms, pipeline, inline_uniforms, slot);
}
}
lvp_shader_optimize(nir);
impl = nir_shader_get_entrypoint(nir);
void *shader_state;
if (ssa_alloc - impl->ssa_alloc < ssa_alloc / 2 &&
!pipeline->inlines[stage].must_inline) {
/* not enough change; don't inline further */
pipeline->inlines[stage].can_inline = 0;
ralloc_free(nir);
pipeline->shader_cso[sh] = lvp_pipeline_compile(pipeline, nir_shader_clone(NULL, pipeline->pipeline_nir[stage]));
shader_state = pipeline->shader_cso[sh];
} else {
shader_state = lvp_pipeline_compile(pipeline, nir);
}
switch (sh) {
case PIPE_SHADER_VERTEX:
state->pctx->bind_vs_state(state->pctx, shader_state);
break;
case PIPE_SHADER_TESS_CTRL:
state->pctx->bind_tcs_state(state->pctx, shader_state);
break;
case PIPE_SHADER_TESS_EVAL:
state->pctx->bind_tes_state(state->pctx, shader_state);
break;
case PIPE_SHADER_GEOMETRY:
state->pctx->bind_gs_state(state->pctx, shader_state);
break;
case PIPE_SHADER_FRAGMENT:
state->pctx->bind_fs_state(state->pctx, shader_state);
break;
case PIPE_SHADER_COMPUTE:
state->pctx->bind_compute_state(state->pctx, shader_state);
break;
default: break;
}
}
static void emit_compute_state(struct rendering_state *state)
{
if (state->iv_dirty[PIPE_SHADER_COMPUTE]) {
@@ -265,9 +357,11 @@ static void emit_compute_state(struct rendering_state *state)
state->iv_dirty[PIPE_SHADER_COMPUTE] = false;
}
bool pcbuf_dirty = state->pcbuf_dirty[PIPE_SHADER_COMPUTE];
if (state->pcbuf_dirty[PIPE_SHADER_COMPUTE])
update_pcbuf(state, PIPE_SHADER_COMPUTE);
bool constbuf_dirty = state->constbuf_dirty[PIPE_SHADER_COMPUTE];
if (state->constbuf_dirty[PIPE_SHADER_COMPUTE]) {
for (unsigned i = 0; i < state->num_const_bufs[PIPE_SHADER_COMPUTE]; i++)
state->pctx->set_constant_buffer(state->pctx, PIPE_SHADER_COMPUTE,
@@ -275,6 +369,9 @@ static void emit_compute_state(struct rendering_state *state)
state->constbuf_dirty[PIPE_SHADER_COMPUTE] = false;
}
if (state->inlines_dirty[PIPE_SHADER_COMPUTE])
update_inline_shader_state(state, PIPE_SHADER_COMPUTE, pcbuf_dirty, constbuf_dirty);
if (state->sb_dirty[PIPE_SHADER_COMPUTE]) {
state->pctx->set_shader_buffers(state->pctx, PIPE_SHADER_COMPUTE,
0, state->num_shader_buffers[PIPE_SHADER_COMPUTE],
@@ -379,9 +476,11 @@ static void emit_state(struct rendering_state *state)
cso_set_vertex_elements(state->cso, &state->velem);
state->ve_dirty = false;
}
bool constbuf_dirty[PIPE_SHADER_TYPES] = {false};
bool pcbuf_dirty[PIPE_SHADER_TYPES] = {false};
for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
constbuf_dirty[sh] = state->constbuf_dirty[sh];
if (state->constbuf_dirty[sh]) {
for (unsigned idx = 0; idx < state->num_const_bufs[sh]; idx++)
state->pctx->set_constant_buffer(state->pctx, sh,
@@ -391,10 +490,16 @@ static void emit_state(struct rendering_state *state)
}
for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
pcbuf_dirty[sh] = state->pcbuf_dirty[sh];
if (state->pcbuf_dirty[sh])
update_pcbuf(state, sh);
}
for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
if (state->inlines_dirty[sh])
update_inline_shader_state(state, sh, pcbuf_dirty[sh], constbuf_dirty[sh]);
}
for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
if (state->sb_dirty[sh]) {
state->pctx->set_shader_buffers(state->pctx, sh,
@@ -462,7 +567,9 @@ static void handle_compute_pipeline(struct vk_cmd_queue_entry *cmd,
state->dispatch_info.block[0] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0];
state->dispatch_info.block[1] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[1];
state->dispatch_info.block[2] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[2];
state->pctx->bind_compute_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_COMPUTE]);
state->inlines_dirty[PIPE_SHADER_COMPUTE] = pipeline->inlines[MESA_SHADER_COMPUTE].can_inline;
if (!pipeline->inlines[MESA_SHADER_COMPUTE].can_inline)
state->pctx->bind_compute_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_COMPUTE]);
}
static void
@@ -620,24 +727,34 @@ static void handle_graphics_pipeline(struct vk_cmd_queue_entry *cmd,
const VkPipelineShaderStageCreateInfo *sh = &pipeline->graphics_create_info.pStages[i];
switch (sh->stage) {
case VK_SHADER_STAGE_FRAGMENT_BIT:
state->pctx->bind_fs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_FRAGMENT]);
state->inlines_dirty[PIPE_SHADER_FRAGMENT] = pipeline->inlines[MESA_SHADER_FRAGMENT].can_inline;
if (!pipeline->inlines[MESA_SHADER_FRAGMENT].can_inline)
state->pctx->bind_fs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_FRAGMENT]);
has_stage[PIPE_SHADER_FRAGMENT] = true;
break;
case VK_SHADER_STAGE_VERTEX_BIT:
state->pctx->bind_vs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_VERTEX]);
state->inlines_dirty[PIPE_SHADER_VERTEX] = pipeline->inlines[MESA_SHADER_VERTEX].can_inline;
if (!pipeline->inlines[MESA_SHADER_VERTEX].can_inline)
state->pctx->bind_vs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_VERTEX]);
has_stage[PIPE_SHADER_VERTEX] = true;
break;
case VK_SHADER_STAGE_GEOMETRY_BIT:
state->pctx->bind_gs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_GEOMETRY]);
state->inlines_dirty[PIPE_SHADER_GEOMETRY] = pipeline->inlines[MESA_SHADER_GEOMETRY].can_inline;
if (!pipeline->inlines[MESA_SHADER_GEOMETRY].can_inline)
state->pctx->bind_gs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_GEOMETRY]);
state->gs_output_lines = pipeline->gs_output_lines ? GS_OUTPUT_LINES : GS_OUTPUT_NOT_LINES;
has_stage[PIPE_SHADER_GEOMETRY] = true;
break;
case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
state->pctx->bind_tcs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]);
state->inlines_dirty[PIPE_SHADER_TESS_CTRL] = pipeline->inlines[MESA_SHADER_TESS_CTRL].can_inline;
if (!pipeline->inlines[MESA_SHADER_TESS_CTRL].can_inline)
state->pctx->bind_tcs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]);
has_stage[PIPE_SHADER_TESS_CTRL] = true;
break;
case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
state->pctx->bind_tes_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]);
state->inlines_dirty[PIPE_SHADER_TESS_EVAL] = pipeline->inlines[MESA_SHADER_TESS_EVAL].can_inline;
if (!pipeline->inlines[MESA_SHADER_TESS_EVAL].can_inline)
state->pctx->bind_tes_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]);
has_stage[PIPE_SHADER_TESS_EVAL] = true;
break;
default:
@@ -995,6 +1112,7 @@ static void handle_pipeline(struct vk_cmd_queue_entry *cmd,
else
handle_graphics_pipeline(cmd, state);
state->push_size[pipeline->is_compute_pipeline] = pipeline->layout->push_constant_size;
state->pipeline[pipeline->is_compute_pipeline] = pipeline;
}
static void handle_vertex_buffers2(struct vk_cmd_queue_entry *cmd,
@@ -1306,6 +1424,7 @@ static void handle_descriptor(struct rendering_state *state,
assert(descriptor->uniform);
state->uniform_blocks[p_stage].block[idx] = descriptor->uniform;
state->pcbuf_dirty[p_stage] = true;
state->inlines_dirty[p_stage] = true;
break;
}
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
@@ -1339,6 +1458,7 @@ static void handle_descriptor(struct rendering_state *state,
if (state->num_const_bufs[p_stage] <= idx)
state->num_const_bufs[p_stage] = idx + 1;
state->constbuf_dirty[p_stage] = true;
state->inlines_dirty[p_stage] = true;
break;
}
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
@@ -2699,6 +2819,12 @@ static void handle_push_constants(struct vk_cmd_queue_entry *cmd,
state->pcbuf_dirty[PIPE_SHADER_TESS_CTRL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) > 0;
state->pcbuf_dirty[PIPE_SHADER_TESS_EVAL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) > 0;
state->pcbuf_dirty[PIPE_SHADER_COMPUTE] |= (stage_flags & VK_SHADER_STAGE_COMPUTE_BIT) > 0;
state->inlines_dirty[PIPE_SHADER_VERTEX] |= (stage_flags & VK_SHADER_STAGE_VERTEX_BIT) > 0;
state->inlines_dirty[PIPE_SHADER_FRAGMENT] |= (stage_flags & VK_SHADER_STAGE_FRAGMENT_BIT) > 0;
state->inlines_dirty[PIPE_SHADER_GEOMETRY] |= (stage_flags & VK_SHADER_STAGE_GEOMETRY_BIT) > 0;
state->inlines_dirty[PIPE_SHADER_TESS_CTRL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) > 0;
state->inlines_dirty[PIPE_SHADER_TESS_EVAL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) > 0;
state->inlines_dirty[PIPE_SHADER_COMPUTE] |= (stage_flags & VK_SHADER_STAGE_COMPUTE_BIT) > 0;
}
static void lvp_execute_cmd_buffer(struct lvp_cmd_buffer *cmd_buffer,
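
The pcbuf branch of update_inline_shader_state above resolves each recorded offset
against the push constants first and then against the stage's uniform blocks, which
are treated as if laid out back to back after the push-constant range. Stripped of
the driver types, that lookup is roughly the following; a standalone illustration,
where ub_blocks and resolve_inline_value are made-up names, not lavapipe API:

#include <stdint.h>
#include <string.h>

struct ub_blocks {
   unsigned count;           /* number of uniform blocks bound to the stage */
   unsigned size[16];        /* size of each block, in bytes */
   const uint8_t *data[16];  /* CPU-visible contents of each block */
};

/* Offsets below push_size read straight from the push-constant buffer;
 * anything past that walks the blocks with a running base offset,
 * mirroring the nested loop in the hunk above. */
static uint32_t
resolve_inline_value(unsigned offset, unsigned push_size,
                     const uint8_t *push_constants,
                     const struct ub_blocks *ub)
{
   uint32_t value = 0;
   if (offset < push_size) {
      memcpy(&value, push_constants + offset, sizeof(value));
      return value;
   }
   for (unsigned i = 0; i < ub->count; i++) {
      if (offset < push_size + ub->size[i]) {
         memcpy(&value, ub->data[i] + (offset - push_size), sizeof(value));
         break;
      }
      push_size += ub->size[i];
   }
   return value;
}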

src/gallium/frontends/lavapipe/lvp_inline_uniforms.c (new file)

@@ -0,0 +1,449 @@
/*
* Copyright © 2020 Advanced Micro Devices, Inc.
* Copyright © 2022 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/* enhanced version of nir_inline_uniforms that can inline from any uniform buffer
* see nir_inline_uniforms.c for more details
*/
#include "nir_builder.h"
#include "nir_loop_analyze.h"
#include "lvp_private.h"
static bool
src_only_uses_uniforms(const nir_src *src, int component,
uint32_t *uni_offsets, uint8_t *num_offsets)
{
if (!src->is_ssa)
return false;
assert(component < src->ssa->num_components);
nir_instr *instr = src->ssa->parent_instr;
switch (instr->type) {
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
/* Vector ops only need to check the corresponding component. */
if (nir_op_is_vec(alu->op)) {
nir_alu_src *alu_src = alu->src + component;
return src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[0],
uni_offsets, num_offsets);
}
/* Return true if all sources return true. */
for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
nir_alu_src *alu_src = alu->src + i;
int input_sizes = nir_op_infos[alu->op].input_sizes[i];
if (input_sizes == 0) {
/* For ops which has no input size, each component of dest is
* only determined by the same component of srcs.
*/
if (!src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[component],
uni_offsets, num_offsets))
return false;
} else {
/* For ops which has input size, all components of dest are
* determined by all components of srcs (except vec ops).
*/
for (unsigned j = 0; j < input_sizes; j++) {
if (!src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[j],
uni_offsets, num_offsets))
return false;
}
}
}
return true;
}
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
/* Return true if the intrinsic loads with a constant offset. */
if (intr->intrinsic == nir_intrinsic_load_ubo &&
nir_src_is_const(intr->src[0]) &&
nir_src_is_const(intr->src[1]) &&
/* TODO: Can't handle other bit sizes for now. */
intr->dest.ssa.bit_size == 32) {
uint32_t offset = nir_src_as_uint(intr->src[1]) + component * 4;
/* Already recorded by other one */
uint32_t ubo = nir_src_as_uint(intr->src[0]);
for (int i = 0; uni_offsets && i < num_offsets[ubo]; i++) {
if (uni_offsets[ubo * PIPE_MAX_CONSTANT_BUFFERS + i] == offset)
return true;
}
/* Exceed uniform number limit */
if (num_offsets && num_offsets[ubo] == MAX_INLINABLE_UNIFORMS)
return false;
/* Record the uniform offset. */
if (uni_offsets)
uni_offsets[ubo * MAX_INLINABLE_UNIFORMS + num_offsets[ubo]++] = offset;
return true;
}
return false;
}
case nir_instr_type_load_const:
/* Always return true for constants. */
return true;
default:
return false;
}
}
static bool
is_induction_variable(const nir_src *src, int component, nir_loop_info *info,
uint32_t *uni_offsets, uint8_t *num_offsets)
{
if (!src->is_ssa)
return false;
assert(component < src->ssa->num_components);
/* Return true for induction variable (ie. i in for loop) */
for (int i = 0; i < info->num_induction_vars; i++) {
nir_loop_induction_variable *var = info->induction_vars + i;
if (var->def == src->ssa) {
/* Induction variable should have constant initial value (ie. i = 0),
* constant update value (ie. i++) and constant end condition
* (ie. i < 10), so that we know the exact loop count for unrolling
* the loop.
*
* Add uniforms need to be inlined for this induction variable's
* initial and update value to be constant, for example:
*
* for (i = init; i < count; i += step)
*
* We collect uniform "init" and "step" here.
*/
if (var->init_src) {
if (!src_only_uses_uniforms(var->init_src, component,
uni_offsets, num_offsets))
return false;
}
if (var->update_src) {
nir_alu_src *alu_src = var->update_src;
if (!src_only_uses_uniforms(&alu_src->src,
alu_src->swizzle[component],
uni_offsets, num_offsets))
return false;
}
return true;
}
}
return false;
}
static void
add_inlinable_uniforms(const nir_src *cond, nir_loop_info *info,
uint32_t *uni_offsets, uint8_t *num_offsets)
{
uint8_t new_num[PIPE_MAX_CONSTANT_BUFFERS];
memcpy(new_num, num_offsets, sizeof(new_num));
/* If condition SSA is always scalar, so component is 0. */
unsigned component = 0;
/* Allow induction variable which means a loop terminator. */
if (info) {
nir_ssa_scalar cond_scalar = {cond->ssa, 0};
/* Limit terminator condition to loop unroll support case which is a simple
* comparison (ie. "i < count" is supported, but "i + 1 < count" is not).
*/
if (nir_is_supported_terminator_condition(cond_scalar)) {
nir_alu_instr *alu = nir_instr_as_alu(cond->ssa->parent_instr);
/* One side of comparison is induction variable, the other side is
* only uniform.
*/
for (int i = 0; i < 2; i++) {
if (is_induction_variable(&alu->src[i].src, alu->src[i].swizzle[0],
info, uni_offsets, new_num)) {
cond = &alu->src[1 - i].src;
component = alu->src[1 - i].swizzle[0];
break;
}
}
}
}
/* Only update uniform number when all uniforms in the expression
* can be inlined. Partially inlines uniforms can't lower if/loop.
*
* For example, uniform can be inlined for a shader is limited to 4,
* and we have already added 3 uniforms, then want to deal with
*
* if (uniform0 + uniform1 == 10)
*
* only uniform0 can be inlined due to we exceed the 4 limit. But
* unless both uniform0 and uniform1 are inlined, can we eliminate
* the if statement.
*
* This is even possible when we deal with loop if the induction
* variable init and update also contains uniform like
*
* for (i = uniform0; i < uniform1; i+= uniform2)
*
* unless uniform0, uniform1 and uniform2 can be inlined at once,
* can the loop be unrolled.
*/
if (src_only_uses_uniforms(cond, component, uni_offsets, new_num))
memcpy(num_offsets, new_num, sizeof(new_num));
}
static bool
is_src_uniform_load(nir_src src)
{
if (nir_src_bit_size(src) != 32 || nir_src_num_components(src) != 1 || nir_src_is_const(src))
return false;
return src_only_uses_uniforms(&src, 0, NULL, NULL);
}
static void
process_node(nir_cf_node *node, nir_loop_info *info,
uint32_t *uni_offsets, uint8_t *num_offsets,
struct set *stores)
{
switch (node->type) {
case nir_cf_node_if: {
nir_if *if_node = nir_cf_node_as_if(node);
const nir_src *cond = &if_node->condition;
add_inlinable_uniforms(cond, info, uni_offsets, num_offsets);
/* Do not pass loop info down so only alow induction variable
* in loop terminator "if":
*
* for (i = 0; true; i++)
* if (i == count)
* if (i == num)
* <no break>
* break
*
* so "num" won't be inlined due to the "if" is not a
* terminator.
*/
info = NULL;
foreach_list_typed(nir_cf_node, nested_node, node, &if_node->then_list)
process_node(nested_node, info, uni_offsets, num_offsets, stores);
foreach_list_typed(nir_cf_node, nested_node, node, &if_node->else_list)
process_node(nested_node, info, uni_offsets, num_offsets, stores);
break;
}
case nir_cf_node_loop: {
nir_loop *loop = nir_cf_node_as_loop(node);
/* Replace loop info, no nested loop info currently:
*
* for (i = 0; i < count0; i++)
* for (j = 0; j < count1; j++)
* if (i == num)
*
* so "num" won't be inlined due to "i" is an induction
* variable of upper loop.
*/
info = loop->info;
foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) {
bool is_terminator = false;
list_for_each_entry(nir_loop_terminator, terminator,
&info->loop_terminator_list,
loop_terminator_link) {
if (nested_node == &terminator->nif->cf_node) {
is_terminator = true;
break;
}
}
/* Allow induction variables for terminator "if" only:
*
* for (i = 0; i < count; i++)
* if (i == num)
* <no break>
*
* so "num" won't be inlined due to the "if" is not a
* terminator.
*/
nir_loop_info *use_info = is_terminator ? info : NULL;
process_node(nested_node, use_info, uni_offsets, num_offsets, stores);
}
break;
}
case nir_cf_node_block: {
nir_block *block = nir_cf_node_as_block(node);
nir_foreach_instr(instr, block) {
if (instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_store_deref && is_src_uniform_load(intr->src[1]))
_mesa_set_add(stores, &intr->src[1]);
}
}
break;
}
default:
break;
}
}
bool
lvp_find_inlinable_uniforms(struct lvp_pipeline *pipeline, nir_shader *shader)
{
bool ret = false;
struct set *stores = _mesa_set_create(shader, _mesa_hash_pointer, _mesa_key_pointer_equal);
nir_foreach_function(function, shader) {
if (function->impl) {
nir_metadata_require(function->impl, nir_metadata_loop_analysis, nir_var_all);
foreach_list_typed(nir_cf_node, node, node, &function->impl->body)
process_node(node, NULL, (uint32_t*)pipeline->inlines[shader->info.stage].uniform_offsets, pipeline->inlines[shader->info.stage].count, stores);
}
}
const unsigned threshold = 5;
set_foreach(stores, entry) {
const nir_src *src = entry->key;
unsigned counter = 0;
list_for_each_entry(nir_src, rsrc, &src->ssa->uses, use_link) {
counter++;
if (counter >= threshold)
break;
}
if (counter >= threshold) {
uint8_t new_num[PIPE_MAX_CONSTANT_BUFFERS];
memcpy(new_num, pipeline->inlines[shader->info.stage].count, sizeof(new_num));
if (src_only_uses_uniforms(src, 0, (uint32_t*)pipeline->inlines[shader->info.stage].uniform_offsets, new_num)) {
ret = true;
memcpy(pipeline->inlines[shader->info.stage].count, new_num, sizeof(new_num));
}
}
}
for (unsigned i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
if (pipeline->inlines[shader->info.stage].count[i]) {
pipeline->inlines[shader->info.stage].can_inline |= BITFIELD_BIT(i);
break;
}
}
return ret;
}
void
lvp_inline_uniforms(nir_shader *shader, const struct lvp_pipeline *pipeline, const uint32_t *uniform_values, uint32_t ubo)
{
if (!pipeline->inlines[shader->info.stage].can_inline)
return;
nir_foreach_function(function, shader) {
if (function->impl) {
nir_builder b;
nir_builder_init(&b, function->impl);
nir_foreach_block(block, function->impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
/* Only replace loads with constant offsets. */
if (intr->intrinsic == nir_intrinsic_load_ubo &&
nir_src_is_const(intr->src[0]) &&
nir_src_as_uint(intr->src[0]) == ubo &&
nir_src_is_const(intr->src[1]) &&
/* TODO: Can't handle other bit sizes for now. */
intr->dest.ssa.bit_size == 32) {
int num_components = intr->dest.ssa.num_components;
uint32_t offset = nir_src_as_uint(intr->src[1]);
const unsigned num_uniforms = pipeline->inlines[shader->info.stage].count[ubo];
const unsigned *uniform_dw_offsets = pipeline->inlines[shader->info.stage].uniform_offsets[ubo];
if (num_components == 1) {
/* Just replace the uniform load to constant load. */
for (unsigned i = 0; i < num_uniforms; i++) {
if (offset == uniform_dw_offsets[i]) {
b.cursor = nir_before_instr(&intr->instr);
nir_ssa_def *def = nir_imm_int(&b, uniform_values[i]);
nir_ssa_def_rewrite_uses(&intr->dest.ssa, def);
nir_instr_remove(&intr->instr);
break;
}
}
} else {
/* Lower vector uniform load to scalar and replace each
* found component load with constant load.
*/
uint32_t max_offset = offset + num_components;
nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS] = {0};
bool found = false;
b.cursor = nir_before_instr(&intr->instr);
/* Find component to replace. */
for (unsigned i = 0; i < num_uniforms; i++) {
uint32_t uni_offset = uniform_dw_offsets[i];
if (uni_offset >= offset && uni_offset < max_offset) {
int index = uni_offset - offset;
components[index] = nir_imm_int(&b, uniform_values[i]);
found = true;
}
}
if (!found)
continue;
/* Create per-component uniform load. */
for (unsigned i = 0; i < num_components; i++) {
if (!components[i]) {
uint32_t scalar_offset = (offset + i) * 4;
components[i] = nir_load_ubo(&b, 1, intr->dest.ssa.bit_size,
intr->src[0].ssa,
nir_imm_int(&b, scalar_offset));
nir_intrinsic_instr *load =
nir_instr_as_intrinsic(components[i]->parent_instr);
nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX, scalar_offset);
nir_intrinsic_set_range_base(load, scalar_offset);
nir_intrinsic_set_range(load, 4);
}
}
/* Replace the original uniform load. */
nir_ssa_def_rewrite_uses(&intr->dest.ssa,
nir_vec(&b, components, num_components));
nir_instr_remove(&intr->instr);
}
}
}
}
nir_metadata_preserve(function->impl, nir_metadata_block_index |
nir_metadata_dominance);
}
}
}
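
Read together with the lvp_pipeline.c and lvp_execute.c hunks in this commit, the
intended lifecycle of the two entry points above is roughly the following; this is a
comment-style summary of code that already exists elsewhere in the patch, not new API:

/*
 * At pipeline creation (lvp_shader_compile_to_ir):
 *    pipeline->inlines[stage].must_inline =
 *       lvp_find_inlinable_uniforms(pipeline, nir);
 *    records up to MAX_INLINABLE_UNIFORMS offsets per UBO slot in
 *    inlines[stage].uniform_offsets/count and sets bits in can_inline.
 *
 * At draw/dispatch time (update_inline_shader_state):
 *    nir_shader *clone = nir_shader_clone(..., pipeline->pipeline_nir[stage]);
 *    values for slot 0 are read from push constants / uniform blocks, values
 *    for higher slots from the mapped constant buffers, and
 *    lvp_inline_uniforms(clone, pipeline, values, slot) is run per slot;
 *    then lvp_shader_optimize(clone), recompile and bind, or clear can_inline
 *    and fall back to the original shader if the SSA count barely shrank.
 */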

src/gallium/frontends/lavapipe/lvp_pipeline.c

@@ -1060,6 +1060,9 @@ lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,
nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
nir->info.stage);
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
if (impl->ssa_alloc > 100) //skip for small shaders
pipeline->inlines[stage].must_inline = lvp_find_inlinable_uniforms(pipeline, nir);
pipeline->pipeline_nir[stage] = nir;
return VK_SUCCESS;
@@ -1180,15 +1183,12 @@ lvp_pipeline_compile_stage(struct lvp_pipeline *pipeline, nir_shader *nir)
return NULL;
}
static VkResult
lvp_pipeline_compile(struct lvp_pipeline *pipeline,
gl_shader_stage stage)
void *
lvp_pipeline_compile(struct lvp_pipeline *pipeline, nir_shader *nir)
{
struct lvp_device *device = pipeline->device;
device->physical_device->pscreen->finalize_nir(device->physical_device->pscreen, pipeline->pipeline_nir[stage]);
nir_shader *nir = nir_shader_clone(NULL, pipeline->pipeline_nir[stage]);
pipeline->shader_cso[pipe_shader_type_from_mesa(stage)] = lvp_pipeline_compile_stage(pipeline, nir);
return VK_SUCCESS;
device->physical_device->pscreen->finalize_nir(device->physical_device->pscreen, nir);
return lvp_pipeline_compile_stage(pipeline, nir);
}
#ifndef NDEBUG
@@ -1450,7 +1450,10 @@ lvp_graphics_pipeline_init(struct lvp_pipeline *pipeline,
const VkPipelineShaderStageCreateInfo *sinfo =
&pipeline->graphics_create_info.pStages[i];
gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
lvp_pipeline_compile(pipeline, stage);
enum pipe_shader_type pstage = pipe_shader_type_from_mesa(stage);
if (!pipeline->inlines[stage].can_inline)
pipeline->shader_cso[pstage] = lvp_pipeline_compile(pipeline,
nir_shader_clone(NULL, pipeline->pipeline_nir[stage]));
if (stage == MESA_SHADER_FRAGMENT)
has_fragment_shader = true;
}
@@ -1571,7 +1574,8 @@ lvp_compute_pipeline_init(struct lvp_pipeline *pipeline,
if (result != VK_SUCCESS)
return result;
lvp_pipeline_compile(pipeline, MESA_SHADER_COMPUTE);
if (!pipeline->inlines[MESA_SHADER_COMPUTE].can_inline)
pipeline->shader_cso[PIPE_SHADER_COMPUTE] = lvp_pipeline_compile(pipeline, nir_shader_clone(NULL, pipeline->pipeline_nir[MESA_SHADER_COMPUTE]));
return VK_SUCCESS;
}
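
The lvp_pipeline_compile rework above is a contract change: the function no longer
looks up and clones the stage NIR itself, it finalizes and compiles whatever shader
it is handed and returns the CSO, which is what lets the draw-time path feed it a
freshly specialized clone. Condensed from the call sites in this commit
(specialized_nir stands for the optimized clone):

/* pipeline creation, stages with nothing to inline */
if (!pipeline->inlines[stage].can_inline)
   pipeline->shader_cso[pstage] =
      lvp_pipeline_compile(pipeline,
                           nir_shader_clone(NULL, pipeline->pipeline_nir[stage]));

/* draw time, after lvp_inline_uniforms and lvp_shader_optimize have run */
void *shader_state = lvp_pipeline_compile(pipeline, specialized_nir);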

src/gallium/frontends/lavapipe/lvp_private.h

@@ -434,6 +434,12 @@ struct lvp_pipeline {
bool force_min_sample;
nir_shader *pipeline_nir[MESA_SHADER_STAGES];
void *shader_cso[PIPE_SHADER_TYPES];
struct {
uint32_t uniform_offsets[PIPE_MAX_CONSTANT_BUFFERS][MAX_INLINABLE_UNIFORMS];
uint8_t count[PIPE_MAX_CONSTANT_BUFFERS];
bool must_inline;
uint32_t can_inline; //bitmask
} inlines[MESA_SHADER_STAGES];
gl_shader_stage last_vertex;
struct pipe_stream_output_info stream_output;
VkGraphicsPipelineCreateInfo graphics_create_info;
@@ -625,6 +631,12 @@ void
lvp_shader_optimize(nir_shader *nir);
void *
lvp_pipeline_compile_stage(struct lvp_pipeline *pipeline, nir_shader *nir);
bool
lvp_find_inlinable_uniforms(struct lvp_pipeline *pipeline, nir_shader *shader);
void
lvp_inline_uniforms(nir_shader *shader, const struct lvp_pipeline *pipeline, const uint32_t *uniform_values, uint32_t ubo);
void *
lvp_pipeline_compile(struct lvp_pipeline *pipeline, nir_shader *base_nir);
#ifdef __cplusplus
}
#endif
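
For reference, the rest of the commit consumes the new per-stage inlines[] data as
follows: can_inline is a bitmask of UBO slots that have inlinable loads, count[slot]
is how many offsets were recorded for a slot, and uniform_offsets[slot][i] are the
offsets themselves. A minimal consumer looks like the loop below; use_offset is a
placeholder for whatever reads the value at that offset, not driver API:

u_foreach_bit(slot, pipeline->inlines[stage].can_inline) {
   for (unsigned i = 0; i < pipeline->inlines[stage].count[slot]; i++)
      use_offset(slot, pipeline->inlines[stage].uniform_offsets[slot][i]);
}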

src/gallium/frontends/lavapipe/meson.build

@@ -18,6 +18,7 @@ liblvp_files = files(
'lvp_util.c',
'lvp_image.c',
'lvp_formats.c',
'lvp_inline_uniforms.c',
'lvp_lower_vulkan_resource.c',
'lvp_lower_vulkan_resource.h',
'lvp_lower_input_attachments.c',