From ce2e2296ab61558f02ea2d05ae0cf4a922df84a7 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Sat, 7 Aug 2021 23:00:44 -0700 Subject: [PATCH] iris: Suballocate BO using the Gallium pb_slab mechanism With all the preparation in place to handle suballocated BOs at submission and export, we can now wire up the actual suballocator. We use Gallium's pb_slab infrastructure for this, which is already used for this purpose in the amdgpu winsys and now zink as well. Unlike those drivers, we don't use pb_buffer (it doesn't do much) nor pb_cache (we already have a buffer cache). Just pb_slab for now. We can now suballocate BOs at power-of-two (or 3/4 power-of-two) granularity, between 256B and 2MB. Beyond that, we use actual GEM objects as before. This should save us some memory on current GPUs where we previously had a minimum allocation granularity of 4K (page size), but should save us a /ton/ of memory on future GPUs where the minimum page size is 64K. Fewer actual GEM objects should also mean shorter exec_object2 lists passed to the kernel, which could reduce CPU overhead a bit. Using large allocations where the underlying GEM objects correspond with the PTE fragment size may also allow the kernel to use a more efficient page table layout, improving memory access times. This cuts nearly half of the memory usage in a Unity3D demo on a GPU that uses 64K pages. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4722 Acked-by: Paulo Zanoni Part-of: --- src/gallium/drivers/iris/iris_batch.c | 2 + src/gallium/drivers/iris/iris_bufmgr.c | 344 ++++++++++++++++++++++++- src/gallium/drivers/iris/iris_bufmgr.h | 2 + 3 files changed, 342 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c index 020db7eebc5..fac2faf4f41 100644 --- a/src/gallium/drivers/iris/iris_batch.c +++ b/src/gallium/drivers/iris/iris_batch.c @@ -836,6 +836,8 @@ submit_batch(struct iris_batch *batch) bo->idle = false; bo->index = -1; + iris_get_backing_bo(bo)->idle = false; + iris_bo_unreference(bo); } diff --git a/src/gallium/drivers/iris/iris_bufmgr.c b/src/gallium/drivers/iris/iris_bufmgr.c index 449339715e6..7a9d1a4820d 100644 --- a/src/gallium/drivers/iris/iris_bufmgr.c +++ b/src/gallium/drivers/iris/iris_bufmgr.c @@ -170,6 +170,26 @@ struct iris_memregion { uint64_t size; }; +#define NUM_SLAB_ALLOCATORS 3 + +enum iris_heap { + IRIS_HEAP_SYSTEM_MEMORY, + IRIS_HEAP_DEVICE_LOCAL, + IRIS_HEAP_MAX, +}; + +struct iris_slab { + struct pb_slab base; + + unsigned entry_size; + + /** The BO representing the entire slab */ + struct iris_bo *bo; + + /** Array of iris_bo structs representing BOs allocated out of this slab */ + struct iris_bo *entries; +}; + struct iris_bufmgr { /** * List into the list of bufmgr. @@ -217,6 +237,8 @@ struct iris_bufmgr { bool bo_reuse:1; struct intel_aux_map_context *aux_map_ctx; + + struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS]; }; static simple_mtx_t global_bufmgr_list_mutex = _SIMPLE_MTX_INITIALIZER_NP; @@ -520,6 +542,277 @@ bo_unmap(struct iris_bo *bo) bo->real.map = NULL; } +static struct pb_slabs * +get_slabs(struct iris_bufmgr *bufmgr, uint64_t size) +{ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + struct pb_slabs *slabs = &bufmgr->bo_slabs[i]; + + if (size <= 1ull << (slabs->min_order + slabs->num_orders - 1)) + return slabs; + } + + unreachable("should have found a valid slab for this size"); +} + +/* Return the power of two size of a slab entry matching the input size. 
+ */
+static unsigned
+get_slab_pot_entry_size(struct iris_bufmgr *bufmgr, unsigned size)
+{
+   unsigned entry_size = util_next_power_of_two(size);
+   unsigned min_entry_size = 1 << bufmgr->bo_slabs[0].min_order;
+
+   return MAX2(entry_size, min_entry_size);
+}
+
+/* Return the slab entry alignment. */
+static unsigned
+get_slab_entry_alignment(struct iris_bufmgr *bufmgr, unsigned size)
+{
+   unsigned entry_size = get_slab_pot_entry_size(bufmgr, size);
+
+   if (size <= entry_size * 3 / 4)
+      return entry_size / 4;
+
+   return entry_size;
+}
+
+static bool
+iris_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
+{
+   struct iris_bo *bo = container_of(entry, struct iris_bo, slab.entry);
+
+   return !iris_bo_busy(bo);
+}
+
+static void
+iris_slab_free(void *priv, struct pb_slab *pslab)
+{
+   struct iris_bufmgr *bufmgr = priv;
+   struct iris_slab *slab = (void *) pslab;
+   struct intel_aux_map_context *aux_map_ctx = bufmgr->aux_map_ctx;
+
+   assert(!slab->bo->aux_map_address);
+
+   if (aux_map_ctx) {
+      /* Since we're freeing the whole slab, all buffers allocated out of it
+       * must be reclaimable. We require buffers to be idle to be reclaimed
+       * (see iris_can_reclaim_slab()), so we know all entries must be idle.
+       * Therefore, we can safely unmap their aux table entries.
+       */
+      for (unsigned i = 0; i < pslab->num_entries; i++) {
+         struct iris_bo *bo = &slab->entries[i];
+         if (bo->aux_map_address) {
+            intel_aux_map_unmap_range(aux_map_ctx, bo->address, bo->size);
+            bo->aux_map_address = 0;
+         }
+      }
+   }
+
+   iris_bo_unreference(slab->bo);
+
+   free(slab->entries);
+   free(slab);
+}
+
+static struct pb_slab *
+iris_slab_alloc(void *priv,
+                unsigned heap,
+                unsigned entry_size,
+                unsigned group_index)
+{
+   struct iris_bufmgr *bufmgr = priv;
+   struct iris_slab *slab = calloc(1, sizeof(struct iris_slab));
+   unsigned flags = heap == IRIS_HEAP_SYSTEM_MEMORY ? BO_ALLOC_SMEM : 0;
+   unsigned slab_size = 0;
+   /* We only support slab allocation for IRIS_MEMZONE_OTHER */
+   enum iris_memory_zone memzone = IRIS_MEMZONE_OTHER;
+
+   if (!slab)
+      return NULL;
+
+   struct pb_slabs *slabs = bufmgr->bo_slabs;
+
+   /* Determine the slab buffer size. */
+   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
+      unsigned max_entry_size =
+         1 << (slabs[i].min_order + slabs[i].num_orders - 1);
+
+      if (entry_size <= max_entry_size) {
+         /* The slab size is twice the size of the largest possible entry. */
+         slab_size = max_entry_size * 2;
+
+         if (!util_is_power_of_two_nonzero(entry_size)) {
+            assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+
+            /* If the entry size is 3/4 of a power of two, we would waste
+             * space and not gain anything if we allocated only twice the
+             * power of two for the backing buffer:
+             *
+             *    2 * 3/4 = 1.5 usable with buffer size 2
+             *
+             * Allocating 5 times the entry size leads us to the next power
+             * of two and results in a much better memory utilization:
+             *
+             *    5 * 3/4 = 3.75 usable with buffer size 4
+             */
+            if (entry_size * 5 > slab_size)
+               slab_size = util_next_power_of_two(entry_size * 5);
+         }
+
+         /* The largest slab should have the same size as the PTE fragment
+          * size to get faster address translation.
+          *
+          * TODO: move this to intel_device_info?
+ */ + const unsigned pte_size = 2 * 1024 * 1024; + + if (i == NUM_SLAB_ALLOCATORS - 1 && slab_size < pte_size) + slab_size = pte_size; + + break; + } + } + assert(slab_size != 0); + + slab->bo = + iris_bo_alloc(bufmgr, "slab", slab_size, slab_size, memzone, flags); + if (!slab->bo) + goto fail; + + slab_size = slab->bo->size; + + slab->base.num_entries = slab_size / entry_size; + slab->base.num_free = slab->base.num_entries; + slab->entry_size = entry_size; + slab->entries = calloc(slab->base.num_entries, sizeof(*slab->entries)); + if (!slab->entries) + goto fail_bo; + + list_inithead(&slab->base.free); + + for (unsigned i = 0; i < slab->base.num_entries; i++) { + struct iris_bo *bo = &slab->entries[i]; + + bo->size = entry_size; + bo->bufmgr = bufmgr; + bo->hash = _mesa_hash_pointer(bo); + bo->gem_handle = 0; + bo->address = slab->bo->address + i * entry_size; + bo->aux_map_address = 0; + bo->index = -1; + bo->refcount = 0; + bo->idle = true; + + bo->slab.entry.slab = &slab->base; + bo->slab.entry.group_index = group_index; + bo->slab.entry.entry_size = entry_size; + + bo->slab.real = iris_get_backing_bo(slab->bo); + + list_addtail(&bo->slab.entry.head, &slab->base.free); + } + + return &slab->base; + +fail_bo: + iris_bo_unreference(slab->bo); +fail: + free(slab); + return NULL; +} + +static struct iris_bo * +alloc_bo_from_slabs(struct iris_bufmgr *bufmgr, + const char *name, + uint64_t size, + uint32_t alignment, + unsigned flags, + bool local) +{ + if (flags & BO_ALLOC_NO_SUBALLOC) + return NULL; + + struct pb_slabs *last_slab = &bufmgr->bo_slabs[NUM_SLAB_ALLOCATORS - 1]; + unsigned max_slab_entry_size = + 1 << (last_slab->min_order + last_slab->num_orders - 1); + + if (size > max_slab_entry_size) + return NULL; + + struct pb_slab_entry *entry; + + enum iris_heap heap = + local ? IRIS_HEAP_DEVICE_LOCAL : IRIS_HEAP_SYSTEM_MEMORY; + + unsigned alloc_size = size; + + /* Always use slabs for sizes less than 4 KB because the kernel aligns + * everything to 4 KB. + */ + if (size < alignment && alignment <= 4 * 1024) + alloc_size = alignment; + + if (alignment > get_slab_entry_alignment(bufmgr, alloc_size)) { + /* 3/4 allocations can return too small alignment. + * Try again with a power of two allocation size. + */ + unsigned pot_size = get_slab_pot_entry_size(bufmgr, alloc_size); + + if (alignment <= pot_size) { + /* This size works but wastes some memory to fulfill the alignment. */ + alloc_size = pot_size; + } else { + /* can't fulfill alignment requirements */ + return NULL; + } + } + + struct pb_slabs *slabs = get_slabs(bufmgr, alloc_size); + entry = pb_slab_alloc(slabs, alloc_size, heap); + if (!entry) { + /* Clean up and try again... */ + pb_slabs_reclaim(slabs); + + entry = pb_slab_alloc(slabs, alloc_size, heap); + } + if (!entry) + return NULL; + + struct iris_bo *bo = container_of(entry, struct iris_bo, slab.entry); + + if (bo->aux_map_address && bo->bufmgr->aux_map_ctx) { + /* This buffer was associated with an aux-buffer range. We only allow + * slab allocated buffers to be reclaimed when idle (not in use by an + * executing batch). (See iris_can_reclaim_slab().) So we know that + * our previous aux mapping is no longer in use, and we can safely + * remove it. + */ + intel_aux_map_unmap_range(bo->bufmgr->aux_map_ctx, bo->address, + bo->size); + bo->aux_map_address = 0; + } + + p_atomic_set(&bo->refcount, 1); + bo->name = name; + bo->size = size; + + /* Zero the contents if necessary. If this fails, fall back to + * allocating a fresh BO, which will always be zeroed by the kernel. 
+ */ + if (flags & BO_ALLOC_ZEROED) { + void *map = iris_bo_map(NULL, bo, MAP_WRITE | MAP_RAW); + if (map) { + memset(map, 0, bo->size); + } else { + pb_slab_free(slabs, &bo->slab.entry); + return NULL; + } + } + + return bo; +} + static struct iris_bo * alloc_bo_from_cache(struct iris_bufmgr *bufmgr, struct bo_cache_bucket *bucket, @@ -701,6 +994,14 @@ iris_bo_alloc(struct iris_bufmgr *bufmgr, !(flags & BO_ALLOC_COHERENT || flags & BO_ALLOC_SMEM); struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size, local); + if (memzone != IRIS_MEMZONE_OTHER || (flags & BO_ALLOC_COHERENT)) + flags |= BO_ALLOC_NO_SUBALLOC; + + bo = alloc_bo_from_slabs(bufmgr, name, size, alignment, flags, local); + + if (bo) + return bo; + /* Round the size up to the bucket size, or if we don't have caching * at this size, a multiple of the page size. */ @@ -1077,14 +1378,18 @@ iris_bo_unreference(struct iris_bo *bo) clock_gettime(CLOCK_MONOTONIC, &time); - simple_mtx_lock(&bufmgr->lock); + if (bo->gem_handle == 0) { + pb_slab_free(get_slabs(bufmgr, bo->size), &bo->slab.entry); + } else { + simple_mtx_lock(&bufmgr->lock); - if (p_atomic_dec_zero(&bo->refcount)) { - bo_unreference_final(bo, time.tv_sec); - cleanup_bo_cache(bufmgr, time.tv_sec); + if (p_atomic_dec_zero(&bo->refcount)) { + bo_unreference_final(bo, time.tv_sec); + cleanup_bo_cache(bufmgr, time.tv_sec); + } + + simple_mtx_unlock(&bufmgr->lock); } - - simple_mtx_unlock(&bufmgr->lock); } } @@ -1340,6 +1645,11 @@ iris_bufmgr_destroy(struct iris_bufmgr *bufmgr) /* bufmgr will no longer try to free VMA entries in the aux-map */ bufmgr->aux_map_ctx = NULL; + for (int i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + if (bufmgr->bo_slabs[i].groups) + pb_slabs_deinit(&bufmgr->bo_slabs[i]); + } + simple_mtx_destroy(&bufmgr->lock); simple_mtx_destroy(&bufmgr->bo_deps_lock); @@ -1987,6 +2297,28 @@ iris_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse) init_cache_buckets(bufmgr, false); init_cache_buckets(bufmgr, true); + unsigned min_slab_order = 8; /* 256 bytes */ + unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */ + unsigned num_slab_orders_per_allocator = + (max_slab_order - min_slab_order) / NUM_SLAB_ALLOCATORS; + + /* Divide the size order range among slab managers. */ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + unsigned min_order = min_slab_order; + unsigned max_order = + MIN2(min_order + num_slab_orders_per_allocator, max_slab_order); + + if (!pb_slabs_init(&bufmgr->bo_slabs[i], min_order, max_order, + IRIS_HEAP_MAX, true, bufmgr, + iris_can_reclaim_slab, + iris_slab_alloc, + (void *) iris_slab_free)) { + free(bufmgr); + return NULL; + } + min_slab_order = max_order + 1; + } + bufmgr->name_table = _mesa_hash_table_create(NULL, _mesa_hash_uint, _mesa_key_uint_equal); bufmgr->handle_table = diff --git a/src/gallium/drivers/iris/iris_bufmgr.h b/src/gallium/drivers/iris/iris_bufmgr.h index f2f20407687..010ce424313 100644 --- a/src/gallium/drivers/iris/iris_bufmgr.h +++ b/src/gallium/drivers/iris/iris_bufmgr.h @@ -35,6 +35,7 @@ #include "util/list.h" #include "util/simple_mtx.h" #include "pipe/p_defines.h" +#include "pipebuffer/pb_slab.h" struct intel_device_info; struct pipe_debug_callback; @@ -259,6 +260,7 @@ struct iris_bo { bool local; } real; struct { + struct pb_slab_entry entry; struct iris_bo *real; } slab; };
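
For reference, below is a minimal standalone sketch (illustrative only, not part of the patch) that reproduces the order-range arithmetic from iris_bufmgr_create() with the constants used above: three slab allocators dividing entry-size orders 8 through 20, with each backing slab sized at twice its largest entry, which is how the top allocator lands on the 2 MB PTE-fragment-sized slab. It redefines NUM_SLAB_ALLOCATORS and MIN2 locally so it compiles on its own, and it ignores the 3/4-of-power-of-two entry sizes and the pte_size floor that iris_slab_alloc() additionally handles.

/* Standalone sketch: how the slab allocators split the size orders. */
#include <stdio.h>

#define NUM_SLAB_ALLOCATORS 3
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   unsigned min_slab_order = 8;   /* 256 B: smallest slab entry */
   unsigned max_slab_order = 20;  /* 1 MB: largest slab entry */
   unsigned num_orders_per_allocator =
      (max_slab_order - min_slab_order) / NUM_SLAB_ALLOCATORS;

   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
      unsigned min_order = min_slab_order;
      unsigned max_order =
         MIN2(min_order + num_orders_per_allocator, max_slab_order);

      /* For power-of-two entries, iris_slab_alloc() sizes the backing BO
       * at twice the largest entry this allocator can hand out.
       */
      unsigned max_entry = 1u << max_order;

      printf("allocator %u: entries %7u B .. %7u B, slab buffer %7u B\n",
             i, 1u << min_order, max_entry, max_entry * 2);

      min_slab_order = max_order + 1;
   }

   return 0;
}

Run, this prints entry ranges of 256 B..4 KB, 8 KB..128 KB, and 256 KB..1 MB, with slab buffers of 8 KB, 256 KB, and 2 MB; requests larger than the biggest entry skip the slabs entirely and take the regular GEM allocation path, as described in the commit message.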