From 90a39cac87f415375a70e1cb2f7ba2c486f941e4 Mon Sep 17 00:00:00 2001
From: Jordan Justen <jordan.l.justen@intel.com>
Date: Fri, 26 Oct 2018 12:52:44 -0700
Subject: [PATCH] intel/blorp: Emit compute program based on
 BLORP_BATCH_USE_COMPUTE

Reworks:
 * Don't pack params, just memcpy param struct (s-b Jason)
 * Old subject: "intel/blorp: Emit compute program if
   params.cs_prog_data is set"
 * Various cleanups of push-const size/alignment (s-b Jason)
 * Fix subslice count by moving to devinfo (s-b Ken)
 * Simplify cw.InterfaceDescriptor code (s-b Ken)
 * Drop some comments from i965 (s-b Ken)

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11564>
---
 src/intel/blorp/blorp.h           |   5 +
 src/intel/blorp/blorp_genX_exec.h | 269 ++++++++++++++++++++++++++++--
 2 files changed, 264 insertions(+), 10 deletions(-)

diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h
index 4b5d5950b59..1e40b4191ae 100644
--- a/src/intel/blorp/blorp.h
+++ b/src/intel/blorp/blorp.h
@@ -78,6 +78,11 @@ enum blorp_batch_flags {
     * color buffer.
     */
    BLORP_BATCH_NO_UPDATE_CLEAR_COLOR = (1 << 2),
+
+   /* This flag indicates that blorp should use a compute program for the
+    * operation.
+    */
+   BLORP_BATCH_USE_COMPUTE = (1 << 3),
 };
 
 struct blorp_batch {
diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h
index 6bb0b63e474..9cb89224466 100644
--- a/src/intel/blorp/blorp_genX_exec.h
+++ b/src/intel/blorp/blorp_genX_exec.h
@@ -1970,17 +1970,8 @@ blorp_update_clear_color(UNUSED struct blorp_batch *batch,
    }
 }
 
-/**
- * \brief Execute a blit or render pass operation.
- *
- * To execute the operation, this function manually constructs and emits a
- * batch to draw a rectangle primitive. The batchbuffer is flushed before
- * constructing and after emitting the batch.
- *
- * This function alters no GL state.
- */
 static void
-blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
+blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
 {
    if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
       blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
@@ -2017,4 +2008,262 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
    }
 }
 
+#if GFX_VER >= 7
+
+static void
+blorp_get_compute_push_const(struct blorp_batch *batch,
+                             const struct blorp_params *params,
+                             uint32_t threads,
+                             uint32_t *state_offset,
+                             unsigned *state_size)
+{
+   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
+   const unsigned push_const_size =
+      ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
+   assert(cs_prog_data->push.cross_thread.size +
+          cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));
+
+   if (push_const_size == 0) {
+      *state_offset = 0;
+      *state_size = 0;
+      return;
+   }
+
+   uint32_t push_const_offset;
+   uint32_t *push_const =
+      GFX_VERx10 >= 125 ?
+      blorp_alloc_general_state(batch, push_const_size, 64,
+                                &push_const_offset) :
+      blorp_alloc_dynamic_state(batch, push_const_size, 64,
+                                &push_const_offset);
+   memset(push_const, 0x0, push_const_size);
+
+   void *dst = push_const;
+   const void *src = (char *)&params->wm_inputs;
+
+   if (cs_prog_data->push.cross_thread.size > 0) {
+      memcpy(dst, src, cs_prog_data->push.cross_thread.size);
+      dst += cs_prog_data->push.cross_thread.size;
+      src += cs_prog_data->push.cross_thread.size;
+   }
+
+   assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0);
+#if GFX_VERx10 < 125
+   if (cs_prog_data->push.per_thread.size > 0) {
+      for (unsigned t = 0; t < threads; t++) {
+         memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);
+
+         uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
+         *subgroup_id = t;
+
+         dst += cs_prog_data->push.per_thread.size;
+      }
+   }
+#endif
+
+   *state_offset = push_const_offset;
+   *state_size = push_const_size;
+}
+
+#endif /* GFX_VER >= 7 */
+
+static void
+blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
+{
+   assert(!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR));
+   assert(params->hiz_op == ISL_AUX_OP_NONE);
+
+#if GFX_VER >= 7
+
+   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
+   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+   const struct brw_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(batch->blorp->compiler->devinfo, cs_prog_data,
+                               NULL);
+   const struct intel_device_info *devinfo = batch->blorp->compiler->devinfo;
+
+   uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
+   uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
+   uint32_t group_z0 = params->dst.z_offset;
+   uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
+   uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
+   assert(params->num_layers >= 1);
+   uint32_t group_z1 = params->dst.z_offset + params->num_layers;
+   assert(cs_prog_data->local_size[2] == 1);
+
+#endif /* GFX_VER >= 7 */
+
+#if GFX_VERx10 >= 125
+
+   blorp_emit(batch, GENX(CFE_STATE), cfe) {
+      cfe.MaximumNumberofThreads =
+         devinfo->max_cs_threads * devinfo->subslice_total - 1;
+   }
+
+   assert(cs_prog_data->push.per_thread.regs == 0);
+   blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
+      cw.SIMDSize                       = dispatch.simd_size / 16;
+      cw.LocalXMaximum                  = cs_prog_data->local_size[0] - 1;
+      cw.LocalYMaximum                  = cs_prog_data->local_size[1] - 1;
+      cw.LocalZMaximum                  = cs_prog_data->local_size[2] - 1;
+      cw.ThreadGroupIDStartingX         = group_x0;
+      cw.ThreadGroupIDStartingY         = group_y0;
+      cw.ThreadGroupIDStartingZ         = group_z0;
+      cw.ThreadGroupIDXDimension        = group_x1;
+      cw.ThreadGroupIDYDimension        = group_y1;
+      cw.ThreadGroupIDZDimension        = group_z1;
+      cw.ExecutionMask                  = 0xffffffff;
+
+      uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);
+
+      uint32_t samplers_offset =
+         params->src.enabled ? blorp_emit_sampler_state(batch) : 0;
+
+      uint32_t push_const_offset;
+      unsigned push_const_size;
+      blorp_get_compute_push_const(batch, params, dispatch.threads,
+                                   &push_const_offset, &push_const_size);
+      cw.IndirectDataStartAddress       = push_const_offset;
+      cw.IndirectDataLength             = push_const_size;
+
+      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+         .KernelStartPointer = params->cs_prog_kernel,
+         .SamplerStatePointer = samplers_offset,
+         .SamplerCount = params->src.enabled ? 1 : 0,
+         .BindingTableEntryCount = params->src.enabled ? 2 : 1,
+         .BindingTablePointer = surfaces_offset,
+         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+         .SharedLocalMemorySize =
+            encode_slm_size(GFX_VER, prog_data->total_shared),
+         .NumberOfBarriers = cs_prog_data->uses_barrier,
+      };
+   }
+
+#elif GFX_VER >= 7
+
+   /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
+    *
+    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+    *  the only bits that are changed are scoreboard related: Scoreboard
+    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
+    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
+    *
+    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
+    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
+    */
+   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
+      pc.CommandStreamerStallEnable = true;
+      pc.StallAtPixelScoreboard = true;
+   }
+
+   blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
+      assert(prog_data->total_scratch == 0);
+      vfe.MaximumNumberofThreads =
+         devinfo->max_cs_threads * devinfo->subslice_total - 1;
+      vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0;
+#if GFX_VER < 11
+      vfe.ResetGatewayTimer =
+         Resettingrelativetimerandlatchingtheglobaltimestamp;
+#endif
+#if GFX_VER < 9
+      vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
+#endif
+#if GFX_VER == 7
+      vfe.GPGPUMode = 1;
+#endif
+      vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0;
+
+      const uint32_t vfe_curbe_allocation =
+         ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
+               cs_prog_data->push.cross_thread.regs, 2);
+      vfe.CURBEAllocationSize = vfe_curbe_allocation;
+   }
+
+   uint32_t push_const_offset;
+   unsigned push_const_size;
+   blorp_get_compute_push_const(batch, params, dispatch.threads,
+                                &push_const_offset, &push_const_size);
+
+   blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+      curbe.CURBETotalDataLength = push_const_size;
+      curbe.CURBEDataStartAddress = push_const_offset;
+   }
+
+   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);
+
+   uint32_t samplers_offset =
+      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;
+
+   struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
+      .KernelStartPointer = params->cs_prog_kernel,
+      .SamplerStatePointer = samplers_offset,
+      .SamplerCount = params->src.enabled ? 1 : 0,
+      .BindingTableEntryCount = params->src.enabled ? 2 : 1,
+      .BindingTablePointer = surfaces_offset,
+      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
+      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+      .SharedLocalMemorySize = encode_slm_size(GFX_VER,
+                                               prog_data->total_shared),
+      .BarrierEnable = cs_prog_data->uses_barrier,
+#if GFX_VER >= 8 || GEN_IS_HASWELL
+      .CrossThreadConstantDataReadLength =
+         cs_prog_data->push.cross_thread.regs,
+#endif
+   };
+
+   uint32_t idd_offset;
+   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+   void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
+   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);
+
+   blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
+      mid.InterfaceDescriptorTotalLength        = size;
+      mid.InterfaceDescriptorDataStartAddress   = idd_offset;
+   }
+
+   blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.SIMDSize                     = dispatch.simd_size / 16;
+      ggw.ThreadDepthCounterMaximum    = 0;
+      ggw.ThreadHeightCounterMaximum   = 0;
+      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
+      ggw.ThreadGroupIDStartingX       = group_x0;
+      ggw.ThreadGroupIDStartingY       = group_y0;
+#if GFX_VER >= 8
+      ggw.ThreadGroupIDStartingResumeZ = group_z0;
+#else
+      ggw.ThreadGroupIDStartingZ       = group_z0;
+#endif
+      ggw.ThreadGroupIDXDimension      = group_x1;
+      ggw.ThreadGroupIDYDimension      = group_y1;
+      ggw.ThreadGroupIDZDimension      = group_z1;
+      ggw.RightExecutionMask           = dispatch.right_mask;
+      ggw.BottomExecutionMask          = 0xffffffff;
+   }
+
+#else /* GFX_VER >= 7 */
+
+   unreachable("Compute blorp is not supported on SNB and earlier");
+
+#endif /* GFX_VER >= 7 */
+}
+
+
+/**
+ * \brief Execute a blit or render pass operation.
+ *
+ * To execute the operation, this function manually constructs and emits a
+ * batch to draw a rectangle primitive. The batchbuffer is flushed before
+ * constructing and after emitting the batch.
+ *
+ * This function alters no GL state.
+ */
+static void
+blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
+{
+   if (batch->flags & BLORP_BATCH_USE_COMPUTE)
+      blorp_exec_compute(batch, params);
+   else
+      blorp_exec_3d(batch, params);
+}
+
 #endif /* BLORP_GENX_EXEC_H */