turnip: Move autotune buffers to suballoc.

Now the ANGLE trex_200 trace replay does a single BO allocation at startup
for autotune results instead of one per frame (~350 for the whole replay).

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15038>
Emma Anholt 2022-03-18 10:31:12 -07:00 committed by Marge Bot
parent 7c636acd53
commit 835704e669
5 changed files with 82 additions and 151 deletions
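
In brief, the pattern this commit adopts: instead of calling tu_bo_init_new() for a fresh 4096-byte BO every time the previous one fills up (the old get_bo_for_renderpass() path removed below), each small tu_renderpass_samples result is now carved out of a device-global suballocator (dev->autotune_suballoc, created once in tu_CreateDevice() with a 128 KiB BO size and guarded by autotune_mutex). The following is a minimal, self-contained sketch of that bump-suballocation idea; the names (struct slab, suballoc_alloc) are hypothetical stand-ins, not turnip's actual tu_suballocator implementation.

/* Sketch: carve small, aligned results out of one large slab instead of
 * allocating a new buffer object per result. Hypothetical names. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct slab {                  /* stands in for one large GPU BO */
   uint8_t *mem;               /* stands in for the BO's CPU mapping */
   uint32_t size;
   uint32_t next_offset;       /* bump pointer */
};

struct suballoc {
   struct slab *current;       /* slab currently being carved up */
   uint32_t slab_size;
};

static uint32_t
align_u32(uint32_t v, uint32_t a)
{
   return (v + a - 1) & ~(a - 1);
}

/* Return a pointer to `size` bytes, allocating a new slab only when the
 * current one is exhausted -- this is what turns "one BO per frame" into
 * "one BO at startup" when the individual results are tiny. */
static void *
suballoc_alloc(struct suballoc *sa, uint32_t size, uint32_t alignment)
{
   assert(size <= sa->slab_size);

   if (sa->current) {
      uint32_t offset = align_u32(sa->current->next_offset, alignment);
      if (offset + size <= sa->current->size) {
         sa->current->next_offset = offset + size;
         return sa->current->mem + offset;
      }
   }

   /* No slab yet, or the current one is full: start a fresh one. In the
    * driver this is the rare tu_bo_init_new() + tu_bo_map() call; note the
    * sketch simply leaks retired slabs, whereas the real suballocator
    * refcounts its BOs and releases them when the last user is done. */
   struct slab *s = malloc(sizeof(*s));
   s->mem = malloc(sa->slab_size);
   s->size = sa->slab_size;
   s->next_offset = size;
   sa->current = s;
   return s->mem;
}

int
main(void)
{
   struct suballoc sa = { .current = NULL, .slab_size = 128 * 1024 };

   /* ~350 renderpass results easily fit in one 128 KiB slab, so only a
    * single "BO" is ever created. */
   for (int i = 0; i < 350; i++)
      (void) suballoc_alloc(&sa, 32, 32);

   printf("offset after 350 results: %u\n", (unsigned) sa.current->next_offset);
   return 0;
}

In the diff below, the corresponding driver pieces are tu_bo_suballocator_init() at device creation, tu_suballoc_bo_alloc() and tu_suballoc_bo_map() in tu_autotune_begin_renderpass(), and tu_suballoc_bo_free() under autotune_mutex when a result is retired.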

View File

@@ -57,6 +57,9 @@
* time, so in most cases there will be no locking.
*/
void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);
#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries on autotuner finish,
* could be used to gather data from traces.
@@ -68,7 +71,6 @@
/* For how many submissions we store renderpass stats. */
#define MAX_HISTORY_LIFETIME 128
#define TU_AUTOTUNE_RP_BO_SIZE 4096
/**
* Tracks results for a given renderpass key
@@ -88,62 +90,12 @@ struct tu_renderpass_history {
uint32_t avg_samples;
};
struct tu_autotune_results_buffer
{
int32_t ref_cnt;
struct tu_device *device;
/* TODO: It would be better to suballocate the space from
* a memory pool which would create less BOs and waste less space.
*/
struct tu_bo **bos;
uint32_t num_bos;
uint32_t results_written;
};
static struct tu_autotune_results_buffer*
tu_autotune_results_buffer_create(struct tu_device *dev)
{
struct tu_autotune_results_buffer* buffer =
malloc(sizeof(struct tu_autotune_results_buffer));
buffer->ref_cnt = 1;
buffer->device = dev;
buffer->results_written = 0;
buffer->num_bos = 0;
buffer->bos = NULL;
return buffer;
}
void
tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer)
{
assert(buffer && buffer->ref_cnt >= 1);
p_atomic_inc(&buffer->ref_cnt);
}
void
tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer)
{
assert(buffer && buffer->ref_cnt >= 1);
if (p_atomic_dec_zero(&buffer->ref_cnt)) {
for (int i = 0; i < buffer->num_bos; i++)
tu_bo_finish(buffer->device, buffer->bos[i]);
ralloc_free(buffer->bos);
free(buffer);
}
}
/* Holds per-submission cs which writes the fence. */
struct tu_submission_data {
struct list_head node;
uint32_t fence;
struct tu_cs fence_cs;
struct tu_autotune_results_buffer **buffers;
uint32_t buffers_count;
};
@@ -175,11 +127,7 @@ free_submission_data(struct tu_submission_data *data)
{
list_del(&data->node);
tu_cs_finish(&data->fence_cs);
for (uint32_t i = 0; i < data->buffers_count; i++) {
tu_autotune_results_buffer_unref(data->buffers[i]);
}
free(data->buffers);
free(data);
}
@@ -220,16 +168,17 @@ hash_renderpass_instance(const struct tu_render_pass *pass,
}
static void
free_result(struct tu_renderpass_result *result)
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
list_del(&result->node);
free(result);
}
static void
free_history(struct tu_renderpass_history *history)
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
tu_autotune_free_results(&history->results);
tu_autotune_free_results_locked(dev, &history->results);
free(history);
}
@@ -266,7 +215,7 @@ create_history_result(struct tu_autotune *at, uint64_t rp_key)
}
static void
history_add_result(struct tu_renderpass_history *history,
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
struct tu_renderpass_result *result)
{
list_delinit(&result->node);
@@ -280,7 +229,9 @@ history_add_result(struct tu_renderpass_history *history,
*/
struct tu_renderpass_result *old_result =
list_last_entry(&history->results, struct tu_renderpass_result, node);
free_result(old_result);
mtx_lock(&dev->autotune_mutex);
free_result(dev, old_result);
mtx_unlock(&dev->autotune_mutex);
}
/* Do calculations here to avoid locking history in tu_autotune_use_bypass */
@@ -297,7 +248,8 @@ history_add_result(struct tu_renderpass_history *history,
static void
process_results(struct tu_autotune *at)
{
struct tu6_global *global = at->device->global_bo->map;
struct tu_device *dev = at->device;
struct tu6_global *global = dev->global_bo->map;
uint32_t current_fence = global->autotune_fence;
list_for_each_entry_safe(struct tu_renderpass_result, result,
@@ -309,7 +261,7 @@ process_results(struct tu_autotune *at)
result->samples_passed =
result->samples->samples_end - result->samples->samples_start;
history_add_result(history, result);
history_add_result(dev, history, result);
}
list_for_each_entry_safe(struct tu_submission_data, submission_data,
@@ -338,6 +290,7 @@ queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
/* TODO: copying each result isn't nice */
struct tu_renderpass_result *copy = malloc(sizeof(*result));
*copy = *result;
tu_bo_get_ref(copy->bo.bo);
list_addtail(&copy->node, &at->pending_results);
}
}
@@ -393,19 +346,13 @@ tu_autotune_on_submit(struct tu_device *dev,
struct tu_submission_data *submission_data =
create_submission_data(dev, at);
submission_data->buffers_count = result_buffers;
submission_data->buffers =
malloc(sizeof(struct tu_autotune_results_buffer *) * result_buffers);
uint32_t buffer_idx = 0;
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
if (list_is_empty(&cmdbuf->renderpass_autotune_results))
continue;
queue_pending_results(at, cmdbuf);
submission_data->buffers[buffer_idx++] = cmdbuf->autotune_buffer;
tu_autotune_results_buffer_ref(cmdbuf->autotune_buffer);
}
#if TU_AUTOTUNE_DEBUG_LOG != 0
@@ -430,7 +377,9 @@ tu_autotune_on_submit(struct tu_device *dev,
_mesa_hash_table_remove_key(at->ht, &history->key);
u_rwlock_wrunlock(&at->ht_lock);
free_history(history);
mtx_lock(&dev->autotune_mutex);
free_history(dev, history);
mtx_unlock(&dev->autotune_mutex);
}
return &submission_data->fence_cs;
@@ -480,12 +429,14 @@ tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
}
#endif
tu_autotune_free_results(&at->pending_results);
tu_autotune_free_results(dev, &at->pending_results);
mtx_lock(&dev->autotune_mutex);
hash_table_foreach(at->ht, entry) {
struct tu_renderpass_history *history = entry->data;
free_history(history);
free_history(dev, history);
}
mtx_unlock(&dev->autotune_mutex);
list_for_each_entry_safe(struct tu_submission_data, submission_data,
&at->pending_submission_data, node) {
@@ -510,14 +461,22 @@ tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
}
void
tu_autotune_free_results(struct list_head *results)
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
list_for_each_entry_safe(struct tu_renderpass_result, result,
results, node) {
free_result(result);
free_result(dev, result);
}
}
void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
mtx_lock(&dev->autotune_mutex);
tu_autotune_free_results_locked(dev, results);
mtx_unlock(&dev->autotune_mutex);
}
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
@@ -624,32 +583,6 @@ tu_autotune_use_bypass(struct tu_autotune *at,
return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}
static uint32_t
get_offset_for_renderpass(struct tu_autotune_results_buffer *buffer)
{
uint32_t results_per_bo =
TU_AUTOTUNE_RP_BO_SIZE / sizeof(struct tu_renderpass_samples);
return (buffer->results_written % results_per_bo) *
sizeof(struct tu_renderpass_samples);
}
static struct tu_bo *
get_bo_for_renderpass(struct tu_autotune_results_buffer *buffer)
{
if (get_offset_for_renderpass(buffer) == 0) {
buffer->num_bos++;
buffer->bos =
reralloc(NULL, buffer->bos, struct tu_bo *, buffer->num_bos);
struct tu_bo **new_bo = &buffer->bos[buffer->num_bos - 1];
tu_bo_init_new(buffer->device, new_bo, TU_AUTOTUNE_RP_BO_SIZE,
TU_BO_ALLOC_NO_FLAGS);
tu_bo_map(buffer->device, *new_bo);
}
return buffer->bos[buffer->num_bos - 1];
}
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
@@ -658,21 +591,21 @@ tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
if (!autotune_result)
return;
/* Lazily allocate memory for renderpass results.
* Secondary command buffers do not support renderpasses.
*/
assert(cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
if (!cmd->autotune_buffer) {
cmd->autotune_buffer = tu_autotune_results_buffer_create(cmd->device);
}
struct tu_device *dev = cmd->device;
static const uint32_t size = sizeof(struct tu_renderpass_samples);
mtx_lock(&dev->autotune_mutex);
VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
mtx_unlock(&dev->autotune_mutex);
if (ret != VK_SUCCESS) {
autotune_result->bo.iova = 0;
return;
}
uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
struct tu_bo *bo = get_bo_for_renderpass(cmd->autotune_buffer);
uint64_t result_iova = bo->iova + bo_offset;
uint64_t result_iova = autotune_result->bo.iova;
autotune_result->samples =
(struct tu_renderpass_samples *) (bo->map + bo_offset);
autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
@@ -689,11 +622,10 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
if (!autotune_result)
return;
uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
struct tu_bo *bo = cmd->autotune_buffer->bos[cmd->autotune_buffer->num_bos - 1];
cmd->autotune_buffer->results_written += 1;
if (!autotune_result->bo.iova)
return;
uint64_t result_iova = bo->iova + bo_offset +
uint64_t result_iova = autotune_result->bo.iova +
offsetof(struct tu_renderpass_samples, samples_end);
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

View File

@@ -32,6 +32,7 @@ struct tu_device;
struct tu_cmd_buffer;
struct tu_renderpass_history;
struct tu_renderpass_result;
/**
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
@@ -111,32 +112,13 @@ struct tu_renderpass_samples {
uint64_t __pad1;
};
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/* Points into GPU memory */
struct tu_renderpass_samples* samples;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
uint32_t fence;
uint64_t samples_passed;
};
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
bool tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct list_head *results);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
@@ -152,9 +134,6 @@ struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
struct tu_autotune_results_buffer;
void tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer);
void tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer);
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_renderpass_result *autotune_result);

View File

@@ -1514,9 +1514,7 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
u_trace_fini(&cmd_buffer->trace);
if (cmd_buffer->autotune_buffer)
tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
if (cmd_buffer->descriptors[i].push_set.layout)
@@ -1542,16 +1540,7 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
tu_cs_reset(&cmd_buffer->sub_cs);
/* We can't just reset the autotune_buffer's contents, because it is also
* referenced by the submission_data if the command buffer was submitted
* and we may be accessing it after cmdbuf reset/free.
*/
if (cmd_buffer->autotune_buffer) {
tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
cmd_buffer->autotune_buffer = NULL;
}
tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));

View File

@@ -1729,6 +1729,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
mtx_init(&device->bo_mutex, mtx_plain);
mtx_init(&device->pipeline_mutex, mtx_plain);
mtx_init(&device->autotune_mutex, mtx_plain);
u_rwlock_init(&device->dma_bo_lock);
pthread_mutex_init(&device->submit_mutex, NULL);
@@ -1789,6 +1790,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
tu_bo_suballocator_init(&device->pipeline_suballoc, device,
128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
tu_bo_suballocator_init(&device->autotune_suballoc, device,
128 * 1024, 0);
result = tu_bo_init_new(device, &device->global_bo, global_size,
TU_BO_ALLOC_ALLOW_DUMP);
@@ -1992,6 +1995,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
tu_autotune_fini(&device->autotune, device);
tu_bo_suballocator_finish(&device->pipeline_suballoc);
tu_bo_suballocator_finish(&device->autotune_suballoc);
util_sparse_array_finish(&device->bo_map);
u_rwlock_destroy(&device->dma_bo_lock);

View File

@@ -438,6 +438,27 @@ enum global_shader {
GLOBAL_SH_COUNT,
};
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/* Points into GPU memory */
struct tu_renderpass_samples* samples;
struct tu_suballoc_bo bo;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
uint32_t fence;
uint64_t samples_passed;
};
#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6
@@ -514,6 +535,12 @@ struct tu_device
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
/* Device-global BO suballocator for reducing BO management for small
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
*/
struct tu_suballocator autotune_suballoc;
mtx_t autotune_mutex;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
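
A closing note on the locking convention introduced above: tu_autotune_free_results_locked() is for callers that already hold dev->autotune_mutex, while tu_autotune_free_results() is the wrapper that takes the lock itself, so a loop freeing many results pays for the mutex once. A minimal, self-contained C11 sketch of that split, with hypothetical names:

#include <stdlib.h>
#include <threads.h>

struct result {
   struct result *next;
};

static mtx_t results_mutex;    /* plays the role of dev->autotune_mutex */

/* Caller must already hold results_mutex
 * (cf. tu_autotune_free_results_locked). */
static void
free_results_locked(struct result **list)
{
   while (*list) {
      struct result *r = *list;
      *list = r->next;
      free(r);
   }
}

/* Lock-taking wrapper for external callers
 * (cf. tu_autotune_free_results). */
static void
free_results(struct result **list)
{
   mtx_lock(&results_mutex);
   free_results_locked(list);
   mtx_unlock(&results_mutex);
}

int
main(void)
{
   mtx_init(&results_mutex, mtx_plain);

   struct result *list = NULL;
   for (int i = 0; i < 4; i++) {
      struct result *r = malloc(sizeof(*r));
      r->next = list;
      list = r;
   }

   free_results(&list);
   mtx_destroy(&results_mutex);
   return 0;
}

This split is what lets tu_autotune_fini() above take autotune_mutex once around its whole hash-table walk rather than once per freed history entry.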