winsys/amdgpu,pb_slab: add slabs with 3/4 of power of two sizes to save memory

Instead of aligning slab allocations to powers of two (e.g. 129K -> 256K),
implement slab allocations with 3/4 of power of two sizes to reduce
overallocation (e.g. 129K -> 192K).

The limitation is that the alignment of a 3/4 allocation can be at most 1/3 of
the allocation size.

DeusExMD allocates 2.1 GB of VRAM. Without this, slabs waste 194 MB due
to alignment, i.e. 9.2%. This commit reduces the waste to 102 MB, i.e. 4.9%.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8683>
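
To make the size-class selection concrete, here is a minimal standalone sketch (an editorial illustration, not part of the commit; next_pow2 and pick_entry_size are made-up stand-ins for util_next_power_of_two and the bucket choice in pb_slab_alloc):

#include <stdbool.h>
#include <stdio.h>

/* Round a size up to the next power of two (illustrative helper). */
static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* Pick the slab entry size for a request: a power of two, or 3/4 of it
 * when the request still fits in the smaller bucket. */
static unsigned pick_entry_size(unsigned size, bool allow_three_fourths)
{
   unsigned pot = next_pow2(size);

   if (allow_three_fourths && size <= pot * 3 / 4)
      return pot * 3 / 4;
   return pot;
}

int main(void)
{
   unsigned size = 129 * 1024;

   printf("power of two bucket: %u KB\n", pick_entry_size(size, false) / 1024); /* 256 */
   printf("3/4 bucket:          %u KB\n", pick_entry_size(size, true) / 1024);  /* 192 */
   return 0;
}

For the 129K example from the message, the 3/4 bucket drops the entry from 256K to 192K, i.e. a 25% smaller entry for any size that fits the 3/4 bucket.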
Author: Marek Olšák, 2021-01-23 21:53:30 -05:00 (committed by Marge Bot)
Parent: 35005881bf
Commit: e97af11ba9
6 changed files with 84 additions and 13 deletions

src/gallium/auxiliary/pipebuffer/pb_slab.c

@@ -102,11 +102,22 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
    struct pb_slab_group *group;
    struct pb_slab *slab;
    struct pb_slab_entry *entry;
+   unsigned entry_size = 1 << order;
+   bool three_fourths = false;
+
+   /* If the size is <= 3/4 of the entry size, use a slab with entries using
+    * 3/4 sizes to reduce overallocation.
+    */
+   if (slabs->allow_three_fourths_allocations && size <= entry_size * 3 / 4) {
+      entry_size = entry_size * 3 / 4;
+      three_fourths = true;
+   }
 
    assert(order < slabs->min_order + slabs->num_orders);
    assert(heap < slabs->num_heaps);
 
-   group_index = heap * slabs->num_orders + (order - slabs->min_order);
+   group_index = (heap * slabs->num_orders + (order - slabs->min_order)) *
+                 (1 + slabs->allow_three_fourths_allocations) + three_fourths;
    group = &slabs->groups[group_index];
 
    mtx_lock(&slabs->mutex);
@@ -136,7 +147,7 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap)
        * slabs for the same group, but that doesn't hurt correctness.
        */
       mtx_unlock(&slabs->mutex);
-      slab = slabs->slab_alloc(slabs->priv, heap, 1 << order, group_index);
+      slab = slabs->slab_alloc(slabs->priv, heap, entry_size, group_index);
       if (!slab)
          return NULL;
       mtx_lock(&slabs->mutex);
@@ -191,7 +202,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs)
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,
@@ -206,6 +217,7 @@ pb_slabs_init(struct pb_slabs *slabs,
    slabs->min_order = min_order;
    slabs->num_orders = max_order - min_order + 1;
    slabs->num_heaps = num_heaps;
+   slabs->allow_three_fourths_allocations = allow_three_fourth_allocations;
 
    slabs->priv = priv;
    slabs->can_reclaim = can_reclaim;
@@ -214,7 +226,8 @@ pb_slabs_init(struct pb_slabs *slabs,
 
    list_inithead(&slabs->reclaim);
 
-   num_groups = slabs->num_orders * slabs->num_heaps;
+   num_groups = slabs->num_orders * slabs->num_heaps *
+                (1 + allow_three_fourth_allocations);
    slabs->groups = CALLOC(num_groups, sizeof(*slabs->groups));
    if (!slabs->groups)
       return false;
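
As a reading aid for the indexing above: with 3/4 allocations enabled, every (heap, order) pair owns two consecutive groups (power of two and 3/4), which is why num_groups doubles. A small standalone sketch of the same arithmetic (illustrative, not Mesa code):

#include <stdbool.h>
#include <stdio.h>

/* Recompute the group index: one group per (heap, order, three_fourths)
 * triple when 3/4 allocations are enabled, one per (heap, order) otherwise. */
static unsigned group_index(unsigned heap, unsigned order, unsigned min_order,
                            unsigned num_orders, bool allow_three_fourths,
                            bool three_fourths)
{
   return (heap * num_orders + (order - min_order)) *
          (1 + allow_three_fourths) + three_fourths;
}

int main(void)
{
   /* Assumed example parameters: 2 heaps, 2 orders starting at 8. */
   unsigned min_order = 8, num_orders = 2;

   for (unsigned heap = 0; heap < 2; heap++)
      for (unsigned order = min_order; order < min_order + num_orders; order++)
         for (unsigned tf = 0; tf < 2; tf++)
            printf("heap %u, order %u, 3/4 %u -> group %u\n", heap, order, tf,
                   group_index(heap, order, min_order, num_orders, true, tf));
   return 0;
}

With these parameters the indices run 0..7 with no gaps, matching num_groups = num_orders * num_heaps * 2.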

src/gallium/auxiliary/pipebuffer/pb_slab.h

@@ -116,8 +116,9 @@ struct pb_slabs
    unsigned min_order;
    unsigned num_orders;
    unsigned num_heaps;
+   bool allow_three_fourths_allocations;
 
-   /* One group per (heap, order) pair. */
+   /* One group per (heap, order, three_fourth_allocations). */
    struct pb_slab_group *groups;
 
    /* List of entries waiting to be reclaimed, i.e. they have been passed to
@@ -144,7 +145,7 @@ pb_slabs_reclaim(struct pb_slabs *slabs);
 bool
 pb_slabs_init(struct pb_slabs *slabs,
               unsigned min_order, unsigned max_order,
-              unsigned num_heaps,
+              unsigned num_heaps, bool allow_three_fourth_allocations,
               void *priv,
               slab_can_reclaim_fn *can_reclaim,
               slab_alloc_fn *slab_alloc,

src/gallium/winsys/amdgpu/drm/amdgpu_bo.c

@@ -682,7 +682,8 @@ static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
    /* other functions are never called */
 };
 
-static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+/* Return the power of two size of a slab entry matching the input size. */
+static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
 {
    unsigned entry_size = util_next_power_of_two(size);
    unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
@@ -690,6 +691,17 @@ static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size
    return MAX2(entry_size, min_entry_size);
 }
 
+/* Return the slab entry alignment. */
+static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
+{
+   unsigned entry_size = get_slab_pot_entry_size(ws, size);
+
+   if (size <= entry_size * 3 / 4)
+      return entry_size / 4;
+
+   return entry_size;
+}
+
 static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
                                             unsigned entry_size,
                                             unsigned group_index,
@@ -719,6 +731,21 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    /* The slab size is twice the size of the largest possible entry. */
    slab_size = max_entry_size * 2;
 
+   if (!util_is_power_of_two_nonzero(entry_size)) {
+      assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+
+      /* If the entry size is 3/4 of a power of two, we would waste space and not gain
+       * anything if we allocated only twice the power of two for the backing buffer:
+       *    2 * 3/4 = 1.5 usable with buffer size 2
+       *
+       * Allocating 5 times the entry size leads us to the next power of two and results
+       * in a much better memory utilization:
+       *    5 * 3/4 = 3.75 usable with buffer size 4
+       */
+      if (entry_size * 5 > slab_size)
+         slab_size = util_next_power_of_two(entry_size * 5);
+   }
+
    /* The largest slab should have the same size as the PTE fragment
     * size to get faster address translation.
     */
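
The ratios in the comment above work out as follows for a hypothetical 192 KB entry (3/4 of 256 KB); an editorial calculation, not part of the commit:

#include <stdio.h>

static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int main(void)
{
   unsigned entry = 192 * 1024; /* hypothetical 3/4-of-256K entry size */

   /* Two entries: the backing buffer rounds up to 512 KB, 384 KB usable (75%). */
   unsigned two = next_pow2(entry * 2);
   printf("2 entries: %u of %u KB used\n", entry * 2 / 1024, two / 1024);

   /* Five entries: the buffer rounds up to 1 MB, 960 KB usable (93.75%). */
   unsigned five = next_pow2(entry * 5);
   printf("5 entries: %u of %u KB used\n", (five / entry) * entry / 1024, five / 1024);
   return 0;
}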
@@ -736,8 +763,11 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
    if (!slab->buffer)
       goto fail;
 
-   slab->base.num_entries = slab->buffer->base.size / entry_size;
+   slab_size = slab->buffer->base.size;
+
+   slab->base.num_entries = slab_size / entry_size;
    slab->base.num_free = slab->base.num_entries;
+   slab->entry_size = entry_size;
    slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
    if (!slab->entries)
       goto fail_buffer;
@@ -773,6 +803,13 @@ static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
       list_addtail(&bo->u.slab.entry.head, &slab->base.free);
    }
 
+   /* Wasted alignment due to slabs with 3/4 allocations being aligned to a power of two. */
+   assert(slab->base.num_entries * entry_size <= slab_size);
+   if (domains & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram += slab_size - slab->base.num_entries * entry_size;
+   else
+      ws->slab_wasted_gtt += slab_size - slab->base.num_entries * entry_size;
+
    return &slab->base;
 
 fail_buffer:
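
Continuing the hypothetical 192 KB example from above: a 1 MB backing buffer holds 5 entries, so slab_size - num_entries * entry_size = 1024 KB - 960 KB = 64 KB is added to ws->slab_wasted_vram (or slab_wasted_gtt). amdgpu_bo_slab_free below subtracts the same amount when the slab is destroyed, so the counters only track live slabs.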
@@ -799,6 +836,14 @@ struct pb_slab *amdgpu_bo_slab_alloc_normal(void *priv, unsigned heap,
 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
 {
    struct amdgpu_slab *slab = amdgpu_slab(pslab);
+   struct amdgpu_winsys *ws = slab->buffer->ws;
+   unsigned slab_size = slab->buffer->base.size;
+
+   assert(slab->base.num_entries * slab->entry_size <= slab_size);
+   if (slab->buffer->base.placement & RADEON_DOMAIN_VRAM)
+      ws->slab_wasted_vram -= slab_size - slab->base.num_entries * slab->entry_size;
+   else
+      ws->slab_wasted_gtt -= slab_size - slab->base.num_entries * slab->entry_size;
 
    for (unsigned i = 0; i < slab->base.num_entries; ++i) {
       amdgpu_bo_remove_fences(&slab->entries[i]);
@@ -1347,8 +1392,19 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       if (size < alignment && alignment <= 4 * 1024)
          alloc_size = alignment;
 
-      if (alignment > get_slab_entry_alignment(ws, size))
-         goto no_slab; /* can't fulfil alignment requirements */
+      if (alignment > get_slab_entry_alignment(ws, alloc_size)) {
+         /* 3/4 allocations can return too small alignment. Try again with a power of two
+          * allocation size.
+          */
+         unsigned pot_size = get_slab_pot_entry_size(ws, alloc_size);
+
+         if (alignment <= pot_size) {
+            /* This size works but wastes some memory to fulfil the alignment. */
+            alloc_size = pot_size;
+         } else {
+            goto no_slab; /* can't fulfil alignment requirements */
+         }
+      }
 
       struct pb_slabs *slabs = get_slabs(ws, alloc_size, flags);
       entry = pb_slab_alloc(slabs, alloc_size, heap);
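
Taken together with get_slab_entry_alignment above: a 3/4 entry can only guarantee an alignment of 1/4 of the power of two (i.e. 1/3 of its own size), so larger alignments push the request back to the power-of-two bucket, and anything beyond that skips slabs entirely. A simplified standalone sketch of that decision (illustrative, not the winsys code; it folds the bucket choice and the fallback into one function and omits the no-slab case):

#include <stdio.h>

static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* Prefer the 3/4 bucket, but fall back to the power-of-two bucket when the
 * requested alignment exceeds 1/4 of the power of two. */
static unsigned choose_alloc_size(unsigned size, unsigned alignment)
{
   unsigned pot = next_pow2(size);
   unsigned three_fourths = pot * 3 / 4;

   if (size <= three_fourths && alignment <= pot / 4)
      return three_fourths;
   return pot;
}

int main(void)
{
   /* A 129 KB request: 64 KB alignment fits the 192 KB bucket,
    * 128 KB alignment forces the 256 KB bucket. */
   printf("%u KB\n", choose_alloc_size(129 * 1024, 64 * 1024) / 1024);  /* 192 */
   printf("%u KB\n", choose_alloc_size(129 * 1024, 128 * 1024) / 1024); /* 256 */
   return 0;
}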

src/gallium/winsys/amdgpu/drm/amdgpu_bo.h

@@ -115,6 +115,7 @@ struct amdgpu_winsys_bo {
 
 struct amdgpu_slab {
    struct pb_slab base;
+   unsigned entry_size;
    struct amdgpu_winsys_bo *buffer;
    struct amdgpu_winsys_bo *entries;
 };

src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c

@@ -458,7 +458,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
 
       if (!pb_slabs_init(&aws->bo_slabs[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_normal,
@@ -471,7 +471,7 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
       if (aws->info.has_tmz_support &&
           !pb_slabs_init(&aws->bo_slabs_encrypted[i],
                          min_order, max_order,
-                         RADEON_MAX_SLAB_HEAPS,
+                         RADEON_MAX_SLAB_HEAPS, true,
                          aws,
                          amdgpu_bo_can_reclaim_slab,
                          amdgpu_bo_slab_alloc_encrypted,

src/gallium/winsys/radeon/drm/radeon_drm_winsys.c

@@ -861,7 +861,7 @@ radeon_drm_winsys_create(int fd, const struct pipe_screen_config *config,
     */
    if (!pb_slabs_init(&ws->bo_slabs,
                       RADEON_SLAB_MIN_SIZE_LOG2, RADEON_SLAB_MAX_SIZE_LOG2,
-                      RADEON_MAX_SLAB_HEAPS,
+                      RADEON_MAX_SLAB_HEAPS, false,
                       ws,
                       radeon_bo_can_reclaim_slab,
                       radeon_bo_slab_alloc,