tu: Split out some state into a separate struct

These bits of state will have to be treated specially when
suspending/resuming a render pass, because they will need to be tracked
across command buffers.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17378>
This commit is contained in:
Connor Abbott 2022-06-29 11:03:25 +02:00 committed by Marge Bot
parent 9689433eee
commit 3aa20a4409
4 changed files with 86 additions and 83 deletions

View File

@ -480,7 +480,7 @@ fallback_use_bypass(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd_buffer)
{
if (cmd_buffer->state.drawcall_count > 5)
if (cmd_buffer->state.rp.drawcall_count > 5)
return false;
for (unsigned i = 0; i < pass->subpass_count; i++) {
@ -504,12 +504,12 @@ estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
{
const struct tu_cmd_state *state = &cmd->state;
if (!state->drawcall_count)
if (!state->rp.drawcall_count)
return 0;
/* sample count times drawcall_bandwidth_per_sample */
return (uint64_t)avg_renderpass_sample_count *
state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}
bool
@ -583,12 +583,12 @@ tu_autotune_use_bypass(struct tu_autotune *at,
if (TU_AUTOTUNE_DEBUG_LOG) {
const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
const float drawcall_bandwidth_per_sample =
(float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
cmd_buffer->state.drawcall_count;
(float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
cmd_buffer->state.rp.drawcall_count;
mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
renderpass_key,
cmd_buffer->state.drawcall_count,
cmd_buffer->state.rp.drawcall_count,
select_sysmem ? "sysmem" : "gmem");
mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
avg_samples,

View File

@ -569,7 +569,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
* use_sysmem_rendering() should have made sure we only ended up here if no
* XFB was used.
*/
if (cmd->state.xfb_used) {
if (cmd->state.rp.xfb_used) {
assert(fb->binning_possible);
return true;
}
@ -579,7 +579,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
* be multiplied by tile count.
* See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131
*/
if (cmd->state.has_prim_generated_query_in_rp ||
if (cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) {
assert(fb->binning_possible);
return true;
@ -607,20 +607,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
cmd->state.render_area.extent.height == 0)
return true;
if (cmd->state.has_tess)
if (cmd->state.rp.has_tess)
return true;
if (cmd->state.disable_gmem)
if (cmd->state.rp.disable_gmem)
return true;
/* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
if (cmd->state.xfb_used && !cmd->state.framebuffer->binning_possible)
if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible)
return true;
/* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
* GMEM rendering, see use_hw_binning.
*/
if ((cmd->state.has_prim_generated_query_in_rp ||
if ((cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) &&
!cmd->state.framebuffer->binning_possible)
return true;
@ -1411,7 +1411,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
}
/* Predicate is changed in draw_cs so we have to re-emit it */
if (cmd->state.draw_cs_writes_to_cond_pred)
if (cmd->state.rp.draw_cs_writes_to_cond_pred)
tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
tu_cs_emit_call(cs, &cmd->tile_store_cs);
@ -2270,7 +2270,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
tu_cond_exec_end(cs);
cmd->state.xfb_used = true;
cmd->state.rp.xfb_used = true;
}
VKAPI_ATTR void VKAPI_CALL
@ -2396,7 +2396,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
}
if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
cmd->state.has_tess = true;
cmd->state.rp.has_tess = true;
/* maximum number of patches that can fit in tess factor/param buffers */
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
@ -3331,6 +3331,21 @@ tu_flush_for_stage(struct tu_cache_state *cache,
}
}
/* Fold the per-render-pass state accumulated in src into dst.
 * Used when a secondary command buffer's render-pass state has to be
 * combined into the primary's (see tu_CmdExecuteCommands): boolean
 * flags are sticky (OR-ed in), draw statistics accumulate (added).
 */
static void
tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                           const struct tu_render_pass_state *src)
{
   /* Draw statistics accumulate across the merged command buffers. */
   dst->drawcall_count += src->drawcall_count;
   dst->drawcall_bandwidth_per_sample_sum +=
      src->drawcall_bandwidth_per_sample_sum;

   /* Flags are sticky: once set in either buffer, they stay set. */
   dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
   dst->disable_gmem |= src->disable_gmem;
   dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
   dst->has_tess |= src->has_tess;
   dst->xfb_used |= src->xfb_used;
}
VKAPI_ATTR void VKAPI_CALL
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
uint32_t commandBufferCount,
@ -3370,28 +3385,13 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
break;
}
if (secondary->state.has_tess) {
cmd->state.has_tess = true;
}
if (secondary->state.disable_gmem)
cmd->state.disable_gmem = true;
if (secondary->state.xfb_used)
cmd->state.xfb_used = true;
if (secondary->state.has_prim_generated_query_in_rp)
cmd->state.has_prim_generated_query_in_rp = true;
cmd->state.drawcall_count += secondary->state.drawcall_count;
cmd->state.drawcall_bandwidth_per_sample_sum +=
secondary->state.drawcall_bandwidth_per_sample_sum;
cmd->state.draw_cs_writes_to_cond_pred |=
secondary->state.draw_cs_writes_to_cond_pred;
/* If LRZ was made invalid in secondary - we should disable
* LRZ retroactively for the whole renderpass.
*/
if (!secondary->state.lrz.valid)
cmd->state.lrz.valid = false;
tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
} else {
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@ -3588,8 +3588,6 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
return;
}
cmd->state.draw_cs_writes_to_cond_pred = false;
for (unsigned i = 0; i < pass->attachment_count; i++) {
cmd->state.attachments[i] = pAttachmentInfo ?
tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
@ -3635,8 +3633,6 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
cmd->state.attachments = cmd->dynamic_attachments;
cmd->state.draw_cs_writes_to_cond_pred = false;
for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment;
if (!pRenderingInfo->pColorAttachments[i].imageView)
@ -4020,23 +4016,23 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
const struct tu_pipeline *pipeline = cmd->state.pipeline;
/* Fill draw stats for autotuner */
cmd->state.drawcall_count++;
cmd->state.rp.drawcall_count++;
cmd->state.drawcall_bandwidth_per_sample_sum +=
cmd->state.rp.drawcall_bandwidth_per_sample_sum +=
cmd->state.pipeline->color_bandwidth_per_sample;
/* add depth memory bandwidth cost */
const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
/* add stencil memory bandwidth cost */
const uint32_t stencil_bandwidth =
cmd->state.pipeline->stencil_cpp_per_sample;
if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
cmd->state.rp.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
tu_emit_cache_flush_renderpass(cmd, cs);
@ -4813,7 +4809,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
if (cmd_buffer->state.has_tess)
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
@ -4839,12 +4835,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->state.subpass = NULL;
cmd_buffer->state.framebuffer = NULL;
cmd_buffer->state.attachments = NULL;
cmd_buffer->state.has_tess = false;
cmd_buffer->state.xfb_used = false;
cmd_buffer->state.disable_gmem = false;
cmd_buffer->state.drawcall_count = 0;
cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
cmd_buffer->state.has_prim_generated_query_in_rp = false;
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;
@ -4969,7 +4960,7 @@ tu_barrier(struct tu_cmd_buffer *cmd,
*/
if ((srcStage & ~framebuffer_space_stages) ||
(dstStage & ~framebuffer_space_stages)) {
cmd->state.disable_gmem = true;
cmd->state.rp.disable_gmem = true;
}
}

View File

@ -1285,6 +1285,48 @@ struct tu_vs_params {
uint32_t first_instance;
};
/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   /* Transform feedback was used inside the render pass (set by
    * tu_CmdEndTransformFeedbackEXT). XFB is incompatible with
    * non-hw-binning GMEM rendering.
    */
   bool xfb_used;

   /* A pipeline with a tessellation stage was bound inside the pass;
    * forces sysmem rendering and tess-factor address emission at
    * render-pass end.
    */
   bool has_tess;

   /* A VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query was begun inside the
    * render pass; like XFB, incompatible with non-hw-binning GMEM.
    */
   bool has_prim_generated_query_in_rp;

   /* A barrier inside the pass used non-framebuffer-space stages, so GMEM
    * (tiled) rendering must be avoided for this pass.
    */
   bool disable_gmem;

   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
   bool draw_cs_writes_to_cond_pred;

   /* Number of draw calls recorded in this render pass (autotuner input). */
   uint32_t drawcall_count;

   /* A calculated "draw cost" value for renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};
struct tu_cmd_state
{
uint32_t dirty;
@ -1292,6 +1334,8 @@ struct tu_cmd_state
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
struct tu_render_pass_state rp;
/* Vertex buffers, viewports, and scissors
* the states for these can be updated partially, so we need to save these
* to be able to emit a complete draw state
@ -1358,43 +1402,12 @@ struct tu_cmd_state
VkRect2D render_area;
const struct tu_image_view **attachments;
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
bool xfb_used;
bool has_tess;
bool tessfactor_addr_set;
bool predication_active;
bool disable_gmem;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (stencil_write_enabled)
* sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
@ -1402,7 +1415,6 @@ struct tu_cmd_state
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
bool has_prim_generated_query_in_rp;
struct tu_lrz_state lrz;

View File

@ -932,7 +932,7 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
uint32_t last_pass = ~0;
if (cmdbuf->state.pass) {
cmdbuf->state.draw_cs_writes_to_cond_pred = true;
cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
}
/* Querying perf counters happens in these steps:
@ -1023,7 +1023,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
if (cmdbuf->state.pass) {
cmdbuf->state.has_prim_generated_query_in_rp = true;
cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
} else {
cmdbuf->state.prim_generated_query_running_before_rp = true;
}