turnip: Sub-allocate pipelines out of a device-global BO pool.

Allocating a BO for each pipeline meant that for apps with many pipelines
(such as Asphalt9 under ANGLE), we would end up spending too much time in
the kernel tracking the BO references.

Looking at CS:Source on zink: before, we had 85 BOs for the pipelines totaling
1036 kB, and now we have 7 BOs totaling 896 kB.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15038>
Emma Anholt 2022-02-14 14:45:01 -08:00 committed by Marge Bot
parent e0fbdd3eda
commit dc3203b087
8 changed files with 291 additions and 31 deletions
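
In outline, the change moves per-pipeline command-stream storage from a dedicated BO per pipeline to a slice of a device-global, read-only suballocator. A condensed sketch of the new allocation path, assembled from the tu_pipeline.c hunks below (sizing and error paths abbreviated):

/* In tu_pipeline_allocate_cs(): carve the pipeline's CS out of the
 * device-global suballocator instead of letting the CS grow its own BOs.
 * The suballocator is externally synchronized, hence pipeline_mutex. */
pthread_mutex_lock(&dev->pipeline_mutex);
VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
                                       size * 4, 128);
pthread_mutex_unlock(&dev->pipeline_mutex);
if (result != VK_SUCCESS)
   return result;

/* Wrap the sub-allocation in a TU_CS_MODE_SUB_STREAM command stream;
 * tu_cs_init_suballoc() takes a reference on the underlying BO. */
tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
return VK_SUCCESS;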

View File: meson.build

@ -48,6 +48,7 @@ libtu_files = files(
'tu_private.h',
'tu_query.c',
'tu_shader.c',
'tu_suballoc.c',
'tu_util.c',
'tu_util.h',
'tu_perfetto.h',

View File: tu_autotune.c

@ -711,4 +711,4 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
tu_cs_emit(cs, ZPASS_DONE);
}
}

View File: tu_cs.c

@ -56,6 +56,25 @@ tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
cs->end = end;
}
/**
* Initialize a sub-command stream as a wrapper to an externally sub-allocated
* buffer.
*/
void
tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
struct tu_suballoc_bo *suballoc_bo)
{
uint32_t *start = tu_suballoc_bo_map(suballoc_bo);
uint32_t *end = start + (suballoc_bo->size >> 2);
memset(cs, 0, sizeof(*cs));
cs->device = device;
cs->mode = TU_CS_MODE_SUB_STREAM;
cs->start = cs->reserved_end = cs->cur = start;
cs->end = end;
cs->refcount_bo = tu_bo_get_ref(suballoc_bo->bo);
}
/**
* Finish and release all resources owned by a command stream.
*/
@ -66,10 +85,24 @@ tu_cs_finish(struct tu_cs *cs)
tu_bo_finish(cs->device, cs->bos[i]);
}
if (cs->refcount_bo)
tu_bo_finish(cs->device, cs->refcount_bo);
free(cs->entries);
free(cs->bos);
}
static struct tu_bo *
tu_cs_current_bo(const struct tu_cs *cs)
{
if (cs->refcount_bo) {
return cs->refcount_bo;
} else {
assert(cs->bo_count);
return cs->bos[cs->bo_count - 1];
}
}
/**
* Get the offset of the command packets emitted since the last call to
* tu_cs_add_entry.
@ -77,8 +110,7 @@ tu_cs_finish(struct tu_cs *cs)
static uint32_t
tu_cs_get_offset(const struct tu_cs *cs)
{
assert(cs->bo_count);
return cs->start - (uint32_t *) cs->bos[cs->bo_count - 1]->map;
return cs->start - (uint32_t *) tu_cs_current_bo(cs)->map;
}
/*
@ -90,6 +122,8 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size)
{
/* no BO for TU_CS_MODE_EXTERNAL */
assert(cs->mode != TU_CS_MODE_EXTERNAL);
/* No adding more BOs if suballocating from a suballoc_bo. */
assert(!cs->refcount_bo);
/* no dangling command packet */
assert(tu_cs_is_empty(cs));
@ -176,7 +210,7 @@ tu_cs_add_entry(struct tu_cs *cs)
/* add an entry for [cs->start, cs->cur] */
cs->entries[cs->entry_count++] = (struct tu_cs_entry) {
.bo = cs->bos[cs->bo_count - 1],
.bo = tu_cs_current_bo(cs),
.size = tu_cs_get_size(cs) * sizeof(uint32_t),
.offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
};
@ -281,7 +315,7 @@ tu_cs_alloc(struct tu_cs *cs,
if (result != VK_SUCCESS)
return result;
struct tu_bo *bo = cs->bos[cs->bo_count - 1];
struct tu_bo *bo = tu_cs_current_bo(cs);
size_t offset = align(tu_cs_get_offset(cs), size);
memory->map = bo->map + offset * sizeof(uint32_t);
@ -303,7 +337,6 @@ struct tu_cs_entry
tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
{
assert(cs->mode == TU_CS_MODE_SUB_STREAM);
assert(cs->bo_count);
assert(sub_cs->start == cs->cur && sub_cs->end == cs->reserved_end);
tu_cs_sanity_check(sub_cs);
@ -312,7 +345,7 @@ tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
cs->cur = sub_cs->cur;
struct tu_cs_entry entry = {
.bo = cs->bos[cs->bo_count - 1],
.bo = tu_cs_current_bo(cs),
.size = tu_cs_get_size(cs) * sizeof(uint32_t),
.offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
};
@ -397,7 +430,7 @@ void
tu_cs_reset(struct tu_cs *cs)
{
if (cs->mode == TU_CS_MODE_EXTERNAL) {
assert(!cs->bo_count && !cs->entry_count);
assert(!cs->bo_count && !cs->refcount_bo && !cs->entry_count);
cs->reserved_end = cs->cur = cs->start;
return;
}

View File: tu_cs.h

@ -39,6 +39,10 @@ void
tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
uint32_t *start, uint32_t *end);
void
tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
struct tu_suballoc_bo *bo);
void
tu_cs_finish(struct tu_cs *cs);

View File: tu_device.c

@ -1728,6 +1728,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->vk.check_status = tu_device_check_status;
mtx_init(&device->bo_mutex, mtx_plain);
mtx_init(&device->pipeline_mutex, mtx_plain);
u_rwlock_init(&device->dma_bo_lock);
pthread_mutex_init(&device->submit_mutex, NULL);
@ -1786,6 +1787,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
if (custom_border_colors)
global_size += TU_BORDER_COLOR_COUNT * sizeof(struct bcolor_entry);
tu_bo_suballocator_init(&device->pipeline_suballoc, device,
128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
result = tu_bo_init_new(device, &device->global_bo, global_size,
TU_BO_ALLOC_ALLOW_DUMP);
if (result != VK_SUCCESS) {
@ -1987,6 +1991,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
tu_autotune_fini(&device->autotune, device);
tu_bo_suballocator_finish(&device->pipeline_suballoc);
util_sparse_array_finish(&device->bo_map);
u_rwlock_destroy(&device->dma_bo_lock);

View File: tu_pipeline.c

@ -2257,6 +2257,7 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
struct tu_pipeline *pipeline,
struct tu_pipeline_layout *layout,
struct tu_pipeline_builder *builder,
struct tu_pipeline_cache *cache,
struct ir3_shader_variant *compute)
{
uint32_t size = 2048 + tu6_load_state_size(pipeline, layout, compute);
@ -2292,13 +2293,27 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
size += tu_xs_get_additional_cs_size_dwords(compute);
}
tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
/* Allocate the space for the pipeline out of the device's RO suballocator.
*
* Sub-allocating BOs saves memory and also kernel overhead in refcounting of
* BOs at exec time.
*
* The pipeline cache would seem like a natural place to stick the
* suballocator, except that it is not guaranteed to outlive the pipelines
* created from it, so you can't store any long-lived state there, and you
* can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
* pipeline destroy isn't synchronized by the cache.
*/
pthread_mutex_lock(&dev->pipeline_mutex);
VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
size * 4, 128);
pthread_mutex_unlock(&dev->pipeline_mutex);
if (result != VK_SUCCESS)
return result;
/* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
* that LOAD_STATE can potentially take up a large amount of space so we
* calculate its size explicitly.
*/
return tu_cs_reserve_space(&pipeline->cs, size);
tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
return VK_SUCCESS;
}
static void
@ -3259,6 +3274,9 @@ tu_pipeline_finish(struct tu_pipeline *pipeline,
const VkAllocationCallbacks *alloc)
{
tu_cs_finish(&pipeline->cs);
pthread_mutex_lock(&dev->pipeline_mutex);
tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
pthread_mutex_unlock(&dev->pipeline_mutex);
if (pipeline->pvtmem_bo)
tu_bo_finish(dev, pipeline->pvtmem_bo);
@ -3288,7 +3306,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
}
result = tu_pipeline_allocate_cs(builder->device, *pipeline,
builder->layout, builder, NULL);
builder->layout, builder, builder->cache, NULL);
if (result != VK_SUCCESS) {
vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
return result;
@ -3340,11 +3358,6 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
tu6_emit_load_state(*pipeline, builder->layout, false);
/* we should have reserved enough space upfront such that the CS never
* grows
*/
assert((*pipeline)->cs.bo_count == 1);
return VK_SUCCESS;
}
@ -3492,12 +3505,13 @@ tu_CreateGraphicsPipelines(VkDevice device,
static VkResult
tu_compute_pipeline_create(VkDevice device,
VkPipelineCache _cache,
VkPipelineCache pipelineCache,
const VkComputePipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipeline)
{
TU_FROM_HANDLE(tu_device, dev, device);
TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
VkResult result;
@ -3545,7 +3559,7 @@ tu_compute_pipeline_create(VkDevice device,
tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
shader, v);
result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v);
result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, cache, v);
if (result != VK_SUCCESS)
goto fail;

View File: tu_private.h

@ -297,6 +297,13 @@ bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
const char *name);
enum tu_bo_alloc_flags
{
TU_BO_ALLOC_NO_FLAGS = 0,
TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
};
struct cache_entry;
struct tu_pipeline_cache
@ -379,6 +386,47 @@ struct tu_bo
bool implicit_sync : 1;
};
/* externally-synchronized BO suballocator. */
struct tu_suballocator
{
struct tu_device *dev;
uint32_t default_size;
enum tu_bo_alloc_flags flags;
/** Current BO we're suballocating out of. */
struct tu_bo *bo;
uint32_t next_offset;
/** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */
struct tu_bo *cached_bo;
};
struct tu_suballoc_bo
{
struct tu_bo *bo;
uint64_t iova;
uint32_t size; /* bytes */
};
void
tu_bo_suballocator_init(struct tu_suballocator *suballoc,
struct tu_device *dev,
uint32_t default_size,
uint32_t flags);
void
tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
VkResult
tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
uint32_t size, uint32_t align);
void *
tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
void
tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
enum global_shader {
GLOBAL_SH_VS_BLIT,
GLOBAL_SH_VS_CLEAR,
@ -460,6 +508,12 @@ struct tu_device
uint32_t implicit_sync_bo_count;
/* Device-global BO suballocator for reducing BO management overhead for
* (read-only) pipeline state. Synchronized by pipeline_mutex.
*/
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
@ -495,8 +549,9 @@ struct tu_device
* freed. Otherwise, if we are not self-importing, we get two different BO
* handles, and we want to free each one individually.
*
* The BOs in this map all have a refcnt with the reference counter and
* only self-imported BOs will ever have a refcnt > 1.
* The refcount is also useful for being able to maintain BOs across
* VK object lifetimes, such as pipelines suballocating out of BOs
* allocated on the device.
*/
struct util_sparse_array bo_map;
@ -545,13 +600,6 @@ tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
VkResult
tu_device_check_status(struct vk_device *vk_device);
enum tu_bo_alloc_flags
{
TU_BO_ALLOC_NO_FLAGS = 0,
TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
};
VkResult
tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
enum tu_bo_alloc_flags flags);
@ -573,6 +621,13 @@ tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}
static inline struct tu_bo *
tu_bo_get_ref(struct tu_bo *bo)
{
p_atomic_inc(&bo->refcnt);
return bo;
}
/* Get a scratch bo for use inside a command buffer. This will always return
* the same bo given the same size or similar sizes, so only one scratch bo
* can be used at the same time. It's meant for short-lived things where we
@ -695,6 +750,9 @@ struct tu_cs
uint32_t bo_count;
uint32_t bo_capacity;
/* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
struct tu_bo *refcount_bo;
/* state for cond_exec_start/cond_exec_end */
uint32_t cond_flags;
uint32_t *cond_dwords;
@ -1312,6 +1370,7 @@ struct tu_pipeline
struct vk_object_base base;
struct tu_cs cs;
struct tu_suballoc_bo bo;
/* Separate BO for private memory since it should be GPU writable */
struct tu_bo *pvtmem_bo;

View File: tu_suballoc.c (new file)

@ -0,0 +1,143 @@
/*
* Copyright © 2022 Google LLC
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* Suballocator for space within BOs.
*
* BOs are allocated at PAGE_SIZE (typically 4k) granularity, so small
* allocations are a waste to have in their own BO. Moreover, on DRM we track a
* list of all BOs currently allocated and submit the whole list for validation
* (busy tracking and implicit sync) on every submit, and that validation is a
* non-trivial cost. So, being able to pack multiple allocations into a BO can
* be a significant performance win.
*
* The allocator tracks a current BO it is linearly allocating from, and up to
* one extra BO returned to the pool when all of its previous suballocations
* have been freed. This means that fragmentation can be an issue for
* default_size > PAGE_SIZE and small allocations. Also, excessive BO
* reallocation may happen for workloads where default size < working set size.
*/
#include "tu_private.h"
/* Initializes a BO sub-allocator using refcounts on BOs.
*/
void
tu_bo_suballocator_init(struct tu_suballocator *suballoc,
struct tu_device *dev,
uint32_t default_size, uint32_t flags)
{
suballoc->dev = dev;
suballoc->default_size = default_size;
suballoc->flags = flags;
suballoc->bo = NULL;
suballoc->cached_bo = NULL;
}
void
tu_bo_suballocator_finish(struct tu_suballocator *suballoc)
{
if (suballoc->bo)
tu_bo_finish(suballoc->dev, suballoc->bo);
if (suballoc->cached_bo)
tu_bo_finish(suballoc->dev, suballoc->cached_bo);
}
VkResult
tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo,
struct tu_suballocator *suballoc,
uint32_t size, uint32_t align)
{
struct tu_bo *bo = suballoc->bo;
if (bo) {
uint32_t offset = ALIGN(suballoc->next_offset, align);
if (offset + size <= bo->size) {
suballoc_bo->bo = tu_bo_get_ref(bo);
suballoc_bo->iova = bo->iova + offset;
suballoc_bo->size = size;
suballoc->next_offset = offset + size;
return VK_SUCCESS;
} else {
tu_bo_finish(suballoc->dev, bo);
suballoc->bo = NULL;
}
}
uint32_t alloc_size = MAX2(size, suballoc->default_size);
/* Reuse a recycled suballoc BO if we have one and it's big enough, otherwise free it. */
if (suballoc->cached_bo) {
if (alloc_size <= suballoc->cached_bo->size)
suballoc->bo = suballoc->cached_bo;
else
tu_bo_finish(suballoc->dev, suballoc->cached_bo);
suballoc->cached_bo = NULL;
}
/* Allocate the new BO if we didn't have one cached. */
if (!suballoc->bo) {
VkResult result = tu_bo_init_new(suballoc->dev, &suballoc->bo,
alloc_size,
suballoc->flags);
if (result != VK_SUCCESS)
return result;
}
VkResult result = tu_bo_map(suballoc->dev, suballoc->bo);
if (result != VK_SUCCESS) {
tu_bo_finish(suballoc->dev, suballoc->bo);
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
suballoc_bo->bo = tu_bo_get_ref(suballoc->bo);
suballoc_bo->iova = suballoc_bo->bo->iova;
suballoc_bo->size = size;
suballoc->next_offset = size;
return VK_SUCCESS;
}
void
tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo)
{
if (!bo->bo)
return;
/* If we held the last reference to this BO, just move it to the
* suballocator for the next time we need to allocate.
*/
if (p_atomic_read(&bo->bo->refcnt) == 1 && !suballoc->cached_bo) {
suballoc->cached_bo = bo->bo;
return;
}
/* Otherwise, drop the refcount on it normally. */
tu_bo_finish(suballoc->dev, bo->bo);
}
void *
tu_suballoc_bo_map(struct tu_suballoc_bo *bo)
{
return bo->bo->map + (bo->iova - bo->bo->iova);
}