tu: Add a "scratch bo" allocation mechanism

This is simpler than a full-blown memory reuse mechanism, but is good
enough to make sure that repeatedly doing a copy that requires the
linear staging buffer workaround won't use excessive memory or be slowed
down due to repeated allocations.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5007>
Connor Abbott 2020-05-11 18:46:04 +02:00 committed by Marge Bot
parent 7ce527a4fe
commit ed79f805fa
2 changed files with 74 additions and 0 deletions

src/freedreno/vulkan/tu_device.c

@@ -39,6 +39,7 @@
#include "compiler/glsl_types.h"
#include "util/debug.h"
#include "util/disk_cache.h"
#include "util/u_atomic.h"
#include "vk_format.h"
#include "vk_util.h"
@@ -1256,6 +1257,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   device->mem_cache = tu_pipeline_cache_from_handle(pc);

   for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
      mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);

   *pDevice = tu_device_to_handle(device);
   return VK_SUCCESS;
@@ -1302,6 +1306,11 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
      vk_free(&device->alloc, device->queues[i]);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) {
      if (device->scratch_bos[i].initialized)
         tu_bo_finish(device, &device->scratch_bos[i].bo);
   }

   /* the compiler does not use pAllocator */
   ralloc_free(device->compiler);
@@ -1311,6 +1320,51 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
   vk_free(&device->alloc, device);
}

VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo)
{
   unsigned size_log2 = MAX2(util_logbase2_ceil64(size), MIN_SCRATCH_BO_SIZE_LOG2);
   unsigned index = size_log2 - MIN_SCRATCH_BO_SIZE_LOG2;
   assert(index < ARRAY_SIZE(dev->scratch_bos));

   for (unsigned i = index; i < ARRAY_SIZE(dev->scratch_bos); i++) {
      if (p_atomic_read(&dev->scratch_bos[i].initialized)) {
         /* Fast path: just return the already-allocated BO. */
         *bo = &dev->scratch_bos[i].bo;
         return VK_SUCCESS;
      }
   }
   /* Slow path: actually allocate the BO. We take a lock so that two
    * threads racing here don't both allocate the same BO, and we use a
    * sleeping mutex rather than spinning because allocation is slow and
    * waiting threads shouldn't burn CPU until it finishes.
    */
   mtx_lock(&dev->scratch_bos[index].construct_mtx);

   /* Another thread may have allocated it already while we were waiting on
    * the lock. We need to check this in order to avoid double-allocating.
    */
   if (dev->scratch_bos[index].initialized) {
      mtx_unlock(&dev->scratch_bos[index].construct_mtx);
      *bo = &dev->scratch_bos[index].bo;
      return VK_SUCCESS;
   }

   unsigned bo_size = 1ull << size_log2;
   VkResult result = tu_bo_init_new(dev, &dev->scratch_bos[index].bo, bo_size);
   if (result != VK_SUCCESS) {
      mtx_unlock(&dev->scratch_bos[index].construct_mtx);
      return result;
   }

   p_atomic_set(&dev->scratch_bos[index].initialized, true);
   mtx_unlock(&dev->scratch_bos[index].construct_mtx);

   *bo = &dev->scratch_bos[index].bo;
   return VK_SUCCESS;
}

VkResult
tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount,
                                    VkLayerProperties *pProperties)
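
The function above is an instance of double-checked locking: a lock-free fast path reads an atomic flag, and the slow path re-checks the flag under a per-bucket mutex so each BO is constructed exactly once. Below is a minimal self-contained sketch of the same pattern, using C11 atomics and threads in place of mesa's p_atomic_*/mtx wrappers; all names in it are illustrative and not part of this commit.

#include <stdatomic.h>
#include <stdbool.h>
#include <threads.h>

/* Stand-in for a lazily constructed resource such as struct tu_bo. */
struct lazy_slot {
   mtx_t construct_mtx;
   atomic_bool initialized;
   int resource;
};

static int *
lazy_get(struct lazy_slot *slot)
{
   /* Fast path: lock-free check. Safe because "initialized" is only set
    * to true after the resource is fully constructed. */
   if (atomic_load(&slot->initialized))
      return &slot->resource;

   mtx_lock(&slot->construct_mtx);

   /* Re-check under the lock: another thread may have constructed the
    * resource while we were waiting, and it must not be built twice. */
   if (!atomic_load(&slot->initialized)) {
      slot->resource = 42; /* construct the resource here */
      atomic_store(&slot->initialized, true);
   }

   mtx_unlock(&slot->construct_mtx);
   return &slot->resource;
}

Publishing the flag only after construction completes is what makes the unlocked fast path safe.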

src/freedreno/vulkan/tu_private.h

@@ -509,6 +509,17 @@ struct tu_device
   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_bo border_color;

   struct list_head shader_slabs;
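
The array above holds one slot per power-of-two size class, from one page (2^12 bytes) up to the 48-bit address-space limit. For intuition, here is a self-contained sketch of the bucket math that tu_get_scratch_bo performs, with logbase2_ceil64 standing in for mesa's util_logbase2_ceil64:

#include <stdint.h>
#include <stdio.h>

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

/* Illustrative stand-in for mesa's util_logbase2_ceil64(). */
static unsigned
logbase2_ceil64(uint64_t n)
{
   unsigned log2 = 0;
   while ((1ull << log2) < n)
      log2++;
   return log2;
}

int
main(void)
{
   uint64_t sizes[] = { 1, 4096, 5000, 1ull << 20 };
   for (unsigned i = 0; i < 4; i++) {
      unsigned size_log2 = logbase2_ceil64(sizes[i]);
      if (size_log2 < MIN_SCRATCH_BO_SIZE_LOG2)
         size_log2 = MIN_SCRATCH_BO_SIZE_LOG2;
      /* e.g. a 5000-byte request rounds up to the 8 KiB bucket (index 1) */
      printf("size %8llu -> bucket %2u (%llu bytes)\n",
             (unsigned long long)sizes[i],
             size_log2 - MIN_SCRATCH_BO_SIZE_LOG2,
             1ull << size_log2);
   }
   return 0;
}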
@@ -531,6 +542,15 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);

struct tu_cs_entry
{
   /* No ownership */
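
To make the contract in the comment above concrete, a hypothetical caller might look like the sketch below. fill_scratch_with_zeros is invented for illustration and assumes tu_bo exposes map and iova fields as declared in this header; tu_get_scratch_bo and tu_bo_map are the entry points this commit adds.

#include <string.h>

/* Hypothetical sketch of a caller inside a command-buffer path. The
 * scratch BO is shared by every caller, so its contents must be written,
 * consumed, and discarded before anything else asks for scratch memory.
 */
static VkResult
fill_scratch_with_zeros(struct tu_device *dev, uint64_t size)
{
   struct tu_bo *scratch;
   VkResult result = tu_get_scratch_bo(dev, size, &scratch);
   if (result != VK_SUCCESS)
      return result;

   /* Map the BO so the CPU can write the short-lived data. */
   result = tu_bo_map(dev, scratch);
   if (result != VK_SUCCESS)
      return result;

   memset(scratch->map, 0, size);

   /* ...emit GPU work that reads scratch->iova here, then treat the
    * contents as discarded... */
   return VK_SUCCESS;
}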