i965: Allow 48-bit addressing on Gen8+.

This allows most GPU objects to use the full 48-bit address space offered by Gen8+ platforms, rather than being stuck with 32-bit. This expands the available GPU memory from 4G to 256TB or so. A few objects - instruction, scratch, and vertex buffers - need to remain pinned in the low 4GB of the address space for various reasons. We default everything to 48-bit but disable it in those cases. Thanks to Jason Ekstrand for blazing this trail in anv first and finding the nasty undocumented hardware issues. This patch simply rips off all of his findings. Reviewed-by: Jordan Justen <jordan.l.justen@intel.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2018-02-12 07:18:29 -08:00 · 2018-02-12 07:18:29 -08:00 · cee9f38903
parent 6712611735
commit cee9f38903
7 changed files with 127 additions and 18 deletions
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@ -119,6 +119,7 @@ struct brw_bufmgr {
   bool has_llc:1;
   bool has_mmap_wc:1;
   bool bo_reuse:1;
+   bool supports_48b_addresses:1;
 };

 static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
@ -409,6 +410,8 @@ retry:
   bo->reusable = true;
   bo->cache_coherent = bufmgr->has_llc;
   bo->index = -1;
+   if (bufmgr->supports_48b_addresses)
+      bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   mtx_unlock(&bufmgr->lock);

@ -1385,6 +1388,24 @@ gem_param(int fd, int name)
   return v;
 }

+static bool
+gem_supports_48b_addresses(int fd)
+{
+   struct drm_i915_gem_exec_object2 obj = {
+      .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+   };
+
+   struct drm_i915_gem_execbuffer2 execbuf = {
+      .buffers_ptr = (uintptr_t)&obj,
+      .buffer_count = 1,
+      .rsvd1 = 0xffffffu,
+   };
+
+   int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
+
+   return ret == -1 && errno == ENOENT;
+}
+
 /**
 * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
 * and manage map buffer objections.
@ -1418,6 +1439,8 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)

   bufmgr->has_llc = devinfo->has_llc;
   bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
+   bufmgr->supports_48b_addresses =
+      devinfo->gen >= 8 && gem_supports_48b_addresses(fd);

   init_cache_buckets(bufmgr);

--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@ -634,6 +634,12 @@ brw_upload_state_base_address(struct brw_context *brw)
   }

   if (devinfo->gen >= 8) {
+      /* STATE_BASE_ADDRESS has issues with 48-bit address spaces.  If the
+       * address + size as seen by STATE_BASE_ADDRESS overflows 48 bits,
+       * the GPU appears to treat all accesses to the buffer as being out
+       * of bounds and returns zero.  To work around this, we pin all SBAs
+       * to the bottom 4GB.
+       */
      uint32_t mocs_wb = devinfo->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
      int pkt_len = devinfo->gen >= 9 ? 19 : 16;

@ -644,15 +650,14 @@ brw_upload_state_base_address(struct brw_context *brw)
      OUT_BATCH(0);
      OUT_BATCH(mocs_wb << 16);
      /* Surface state base address: */
-      OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
+      OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
      /* Dynamic state base address: */
-      OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
+      OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
      /* Indirect object base address: MEDIA_OBJECT data */
      OUT_BATCH(mocs_wb << 4 | 1);
      OUT_BATCH(0);
      /* Instruction base address: shader kernels (incl. SIP) */
-      OUT_RELOC64(brw->cache.bo, 0, mocs_wb << 4 | 1);
-
+      OUT_RELOC64(brw->cache.bo, RELOC_32BIT, mocs_wb << 4 | 1);
      /* General state buffer size */
      OUT_BATCH(0xfffff001);
      /* Dynamic state buffer size */
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@ -205,12 +205,23 @@ brw_emit_surface_state(struct brw_context *brw,
       * FIXME: move to the point of assignment.
       */
      assert((aux_offset & 0xfff) == 0);
-      uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
-      *aux_addr = brw_state_reloc(&brw->batch,
-                                  *surf_offset +
-                                  brw->isl_dev.ss.aux_addr_offset,
-                                  aux_bo, *aux_addr,
-                                  reloc_flags);
+
+      if (devinfo->gen >= 8) {
+         uint64_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
+         *aux_addr = brw_state_reloc(&brw->batch,
+                                     *surf_offset +
+                                     brw->isl_dev.ss.aux_addr_offset,
+                                     aux_bo, *aux_addr,
+                                     reloc_flags);
+      } else {
+         uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
+         *aux_addr = brw_state_reloc(&brw->batch,
+                                     *surf_offset +
+                                     brw->isl_dev.ss.aux_addr_offset,
+                                     aux_bo, *aux_addr,
+                                     reloc_flags);
+
+      }
   }
 }

--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@ -166,6 +166,15 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
      .buffer = brw->batch.state.bo,
      .offset = offset,

+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .reloc_flags = RELOC_32BIT,
+
 #if GEN_GEN == 10
      .mocs = CNL_MOCS_WB,
 #elif GEN_GEN == 9
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@ -101,7 +101,7 @@ __gen_combine_address(struct brw_context *brw, void *location,
   }
 }

-static struct brw_address
+UNUSED static struct brw_address
 rw_bo(struct brw_bo *bo, uint32_t offset)
 {
   return (struct brw_address) {
@ -120,6 +120,26 @@ ro_bo(struct brw_bo *bo, uint32_t offset)
   };
 }

+static struct brw_address
+rw_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_WRITE | RELOC_32BIT,
+   };
+}
+
+static struct brw_address
+ro_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_32BIT,
+   };
+}
+
 UNUSED static struct brw_address
 ggtt_bo(struct brw_bo *bo, uint32_t offset)
 {
@ -317,7 +337,15 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
   struct GENX(VERTEX_BUFFER_STATE) buf_state = {
      .VertexBufferIndex = buffer_nr,
      .BufferPitch = stride,
-      .BufferStartingAddress = ro_bo(bo, start_offset),
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .BufferStartingAddress = ro_32_bo(bo, start_offset),
 #if GEN_GEN >= 8
      .BufferSize = end_offset - start_offset,
 #endif
@ -858,7 +886,15 @@ genX(emit_index_buffer)(struct brw_context *brw)
      ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 #endif
      ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
-      ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two index buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict index buffers to the low 32 bits of
+       * the address space.
+       */
+      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
 #if GEN_GEN >= 8
      ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
      ib.BufferSize = brw->ib.size;
@ -1895,7 +1931,7 @@ genX(upload_wm)(struct brw_context *brw)
 #endif

      if (wm_prog_data->base.total_scratch) {
-         wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
         wm.PerThreadScratchSpace =
            ffs(stage_state->per_thread_scratch) - 11;
      }
@ -2014,6 +2050,14 @@ static const struct brw_tracked_state genX(wm_state) = {

 /* ---------------------------------------------------------------------- */

+/* We restrict scratch buffers to the bottom 32 bits of the address space
+ * by using rw_32_bo().
+ *
+ * General State Base Address is a bit broken.  If the address + size as
+ * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
+ * all accesses to the buffer as being out of bounds and returns zero.
+ */
+
 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
   pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
   pkt.SamplerCount       =                                               \
@ -2023,7 +2067,7 @@ static const struct brw_tracked_state genX(wm_state) = {
   pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                          \
   if (stage_prog_data->total_scratch) {                                  \
-      pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);    \
+      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
      pkt.PerThreadScratchSpace =                                         \
         ffs(stage_state->per_thread_scratch) - 11;                       \
   }                                                                      \
@ -3894,8 +3938,8 @@ genX(upload_ps)(struct brw_context *brw)

      if (prog_data->base.total_scratch) {
         ps.ScratchSpaceBasePointer =
-            rw_bo(stage_state->scratch_bo,
-                  ffs(stage_state->per_thread_scratch) - 11);
+            rw_32_bo(stage_state->scratch_bo,
+                     ffs(stage_state->per_thread_scratch) - 11);
      }
   }
 }
@ -4216,7 +4260,7 @@ genX(upload_cs_state)(struct brw_context *brw)
             */
            per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
         }
-         vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
         vfe.PerThreadScratchSpace = per_thread_scratch_value;
      }

--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@ -1095,6 +1095,21 @@ emit_reloc(struct intel_batchbuffer *batch,
   unsigned int index = add_exec_bo(batch, target);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];

+   if (reloc_flags & RELOC_32BIT) {
+      /* Restrict this buffer to the low 32 bits of the address space.
+       *
+       * Altering the validation list flags restricts it for this batch,
+       * but we also alter the BO's kflags to restrict it permanently
+       * (until the BO is destroyed and put back in the cache).  Buffers
+       * may stay bound across batches, and we want keep it constrained.
+       */
+      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
+      reloc_flags &= ~RELOC_32BIT;
+   }
+
   if (reloc_flags)
      entry->flags |= reloc_flags & batch->valid_reloc_flags;

--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@ -53,6 +53,8 @@ bool brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo);

 #define RELOC_WRITE EXEC_OBJECT_WRITE
 #define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
+/* Inverted meaning, but using the same bit...emit_reloc will flip it. */
+#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
 uint64_t brw_batch_reloc(struct intel_batchbuffer *batch,
                         uint32_t batch_offset,
                         struct brw_bo *target,