tu: implement sysmem vs gmem autotuner

The implementation is separate from Freedreno's due to multithreading
support.

In Vulkan, an application may fill command buffers from many threads
and expects no locking to occur. We do introduce the possibility of
locking on renderpass end; however, assuming the application doesn't
have a huge number of slightly different renderpasses, there should
be little to no contention.
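
Roughly, renderpass end only ever takes a read lock for the history
lookup, while the hash table is modified only under the write lock at
submit time, which is already serialized (a simplified sketch of the
code in tu_autotune.c below):

    /* Renderpass end (possibly many threads): */
    u_rwlock_rdlock(&at->ht_lock);
    entry = _mesa_hash_table_search(at->ht, &rp_key);
    u_rwlock_rdunlock(&at->ht_lock);

    /* Queue submit (single-threaded): */
    u_rwlock_wrlock(&at->ht_lock);
    _mesa_hash_table_insert(at->ht, &history->key, history);
    u_rwlock_wrunlock(&at->ht_lock);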

Other assumptions are:
- The application submits command buffers soon after creating them.

Breaking the above may lead to some decrease in performance or to
the autotuner turning itself off.

The heuristic is too simplistic at the moment; to find a proper one
we should run a bunch of traces with both sysmem and gmem and build
a better heuristic from the gathered data.
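
For reference, the decision made in tu_autotune_use_bypass boils down
to roughly the following (simplified sketch, thresholds as in the code
added below):

    uint32_t avg_samples;
    if (get_history(at, renderpass_key, &avg_samples)) {
       if (avg_samples < 500)
          return true;                            /* sysmem */
       float sample_cost = total_drawcalls_cost / drawcall_count;
       float single_draw_cost =
          avg_samples * sample_cost / drawcall_count;
       return single_draw_cost < 6000.0;          /* sysmem if cheap */
    }
    /* No history yet: sysmem only for <= 5 draws and no MSAA. */
    return fallback_use_bypass(pass, framebuffer, cmd_buffer);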

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12128>
Danylo Piliaiev 2021-07-29 14:21:05 +03:00 committed by Marge Bot
parent d6e457c0a4
commit dbae9fa7d8
9 changed files with 948 additions and 45 deletions


@ -31,6 +31,7 @@ tu_entrypoints = custom_target(
libtu_files = files(
'tu_autotune.c',
'tu_clear_blit.c',
'tu_cmd_buffer.c',
'tu_cs.c',


@ -0,0 +1,547 @@
/*
* Copyright © 2021 Igalia S.L.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <vulkan/vulkan_core.h>
#include "tu_autotune.h"
#include "tu_private.h"
#include "tu_cs.h"
/* In Vulkan, an application may fill command buffers from many threads
* and expects no locking to occur. We do introduce the possibility of
* locking on renderpass end; however, assuming the application doesn't
* have a huge number of slightly different renderpasses, there should
* be little to no contention.
*
* Other assumptions are:
* - The application submits command buffers soon after creating them.
*
* Breaking the above may lead to some decrease in performance or to
* the autotuner turning itself off.
*/
#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries on autotuner finish; this can be used to
* gather data from traces.
*/
#define TU_AUTOTUNE_LOG_AT_FINISH 0
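/* How many recent results are kept per history entry, and for how many
* submissions an unused history entry survives before being evicted
* from the table.
*/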
#define MAX_HISTORY_RESULTS 5
#define MAX_HISTORY_LIFETIME 128
/**
* Tracks results for a given renderpass key
*/
struct tu_renderpass_history {
uint64_t key;
/* Used to determine when an old history entry can be deleted */
uint32_t last_fence;
/**
* List of recent tu_renderpass_result's
*/
struct list_head results;
uint32_t num_results;
uint32_t avg_samples;
};
/* Holds per-submission cs which writes the fence. */
struct tu_submission_fence_cs {
struct list_head node;
struct tu_cs cs;
uint32_t fence;
};
#define APPEND_TO_HASH(state, field) \
XXH64_update(state, &field, sizeof(field));
static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd)
{
XXH64_state_t hash_state;
XXH64_reset(&hash_state, 0);
APPEND_TO_HASH(&hash_state, framebuffer->width);
APPEND_TO_HASH(&hash_state, framebuffer->height);
APPEND_TO_HASH(&hash_state, framebuffer->layers);
APPEND_TO_HASH(&hash_state, pass->attachment_count);
XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0]));
for (unsigned i = 0; i < pass->attachment_count; i++) {
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk_format);
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->layer_count);
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->level_count);
}
APPEND_TO_HASH(&hash_state, pass->subpass_count);
for (unsigned i = 0; i < pass->subpass_count; i++) {
APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
}
return XXH64_digest(&hash_state);
}
static void
history_destructor(void *h)
{
struct tu_renderpass_history *history = h;
list_for_each_entry_safe(struct tu_renderpass_result, result,
&history->results, node) {
ralloc_free(result);
}
}
static void
result_destructor(void *r)
{
struct tu_renderpass_result *result = r;
/* Just in case we manage to somehow still be on the pending_results list: */
list_del(&result->node);
}
static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
bool has_history = false;
/* If lock contention is ever found in the wild,
* we could use try_lock here.
*/
u_rwlock_rdlock(&at->ht_lock);
struct hash_entry *entry =
_mesa_hash_table_search(at->ht, &rp_key);
if (entry) {
struct tu_renderpass_history *history = entry->data;
if (history->num_results > 0) {
*avg_samples = p_atomic_read(&history->avg_samples);
has_history = true;
}
}
u_rwlock_rdunlock(&at->ht_lock);
return has_history;
}
static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
struct tu_renderpass_result *result = rzalloc_size(NULL, sizeof(*result));
result->idx = p_atomic_inc_return(&at->idx_counter);
result->rp_key = rp_key;
ralloc_set_destructor(result, result_destructor);
return result;
}
static void
history_add_result(struct tu_renderpass_history *history,
struct tu_renderpass_result *result)
{
list_delinit(&result->node);
list_add(&result->node, &history->results);
if (history->num_results < MAX_HISTORY_RESULTS) {
history->num_results++;
} else {
/* Once above the limit, start popping old results off the
* tail of the list:
*/
struct tu_renderpass_result *old_result =
list_last_entry(&history->results, struct tu_renderpass_result, node);
list_delinit(&old_result->node);
ralloc_free(old_result);
}
/* Do calculations here to avoid locking history in tu_autotune_use_bypass */
uint32_t total_samples = 0;
list_for_each_entry(struct tu_renderpass_result, result,
&history->results, node) {
total_samples += result->samples_passed;
}
float avg_samples = (float)total_samples / (float)history->num_results;
p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}
static void
process_results(struct tu_autotune *at)
{
uint32_t current_fence = at->results->fence;
uint32_t min_idx = ~0;
uint32_t max_idx = 0;
list_for_each_entry_safe(struct tu_renderpass_result, result,
&at->pending_results, node) {
if (result->fence > current_fence)
break;
struct tu_renderpass_history *history = result->history;
min_idx = MIN2(min_idx, result->idx);
max_idx = MAX2(max_idx, result->idx);
uint32_t idx = result->idx % ARRAY_SIZE(at->results->result);
result->samples_passed = at->results->result[idx].samples_end -
at->results->result[idx].samples_start;
history_add_result(history, result);
}
list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs,
&at->pending_submission_cs, node) {
if (submission_cs->fence > current_fence)
break;
list_del(&submission_cs->node);
tu_cs_finish(&submission_cs->cs);
free(submission_cs);
}
if (max_idx - min_idx > TU_AUTOTUNE_MAX_RESULTS) {
/* If results start to trample each other it's better to bail out */
at->enabled = false;
mesa_logw("disabling sysmem vs gmem autotuner because results "
"are trampling each other: min_idx=%u, max_idx=%u",
min_idx, max_idx);
}
}
static struct tu_cs *
create_fence_cs(struct tu_device *dev, struct tu_autotune *at)
{
struct tu_submission_fence_cs *submission_cs =
calloc(1, sizeof(struct tu_submission_fence_cs));
submission_cs->fence = at->fence_counter;
tu_cs_init(&submission_cs->cs, dev, TU_CS_MODE_GROW, 5);
tu_cs_begin(&submission_cs->cs);
tu_cs_emit_pkt7(&submission_cs->cs, CP_EVENT_WRITE, 4);
tu_cs_emit(&submission_cs->cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
tu_cs_emit_qw(&submission_cs->cs, autotune_results_ptr(at, fence));
tu_cs_emit(&submission_cs->cs, at->fence_counter);
tu_cs_end(&submission_cs->cs);
list_addtail(&submission_cs->node, &at->pending_submission_cs);
return &submission_cs->cs;
}
struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
struct tu_autotune *at,
struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count)
{
/* We are single-threaded here */
process_results(at);
/* pre-increment so zero isn't a valid fence */
uint32_t new_fence = ++at->fence_counter;
/* Create history entries here to minimize the work and locking
* done on renderpass end.
*/
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
list_for_each_entry_safe(struct tu_renderpass_result, result,
&cmdbuf->renderpass_autotune_results, node) {
struct tu_renderpass_history *history;
struct hash_entry *entry =
_mesa_hash_table_search(at->ht, &result->rp_key);
if (!entry) {
history = rzalloc_size(NULL, sizeof(*history));
ralloc_set_destructor(history, history_destructor);
history->key = result->rp_key;
list_inithead(&history->results);
u_rwlock_wrlock(&at->ht_lock);
_mesa_hash_table_insert(at->ht, &history->key, history);
u_rwlock_wrunlock(&at->ht_lock);
} else {
history = (struct tu_renderpass_history *) entry->data;
}
history->last_fence = new_fence;
result->fence = new_fence;
result->history = history;
}
if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
list_splicetail(&cmdbuf->renderpass_autotune_results,
&at->pending_results);
list_inithead(&cmdbuf->renderpass_autotune_results);
}
}
#if TU_AUTOTUNE_DEBUG_LOG != 0
mesa_logi("Total history entries: %u", at->ht->entries);
#endif
/* Clean up old entries from the history table. The assumption
* here is that the application doesn't hold many old unsubmitted
* command buffers; otherwise this table may grow big.
*/
hash_table_foreach(at->ht, entry) {
struct tu_renderpass_history *history = entry->data;
if (history->last_fence == 0 ||
(new_fence - history->last_fence) <= MAX_HISTORY_LIFETIME)
continue;
#if TU_AUTOTUNE_DEBUG_LOG != 0
mesa_logi("Removed old history entry %016"PRIx64"", history->key);
#endif
u_rwlock_wrlock(&at->ht_lock);
_mesa_hash_table_remove_key(at->ht, &history->key);
u_rwlock_wrunlock(&at->ht_lock);
ralloc_free(history);
}
return create_fence_cs(dev, at);
}
static bool
renderpass_key_equals(const void *_a, const void *_b)
{
return *(uint64_t *)_a == *(uint64_t *)_b;
}
static uint32_t
renderpass_key_hash(const void *_a)
{
return *((uint64_t *) _a) & 0xffffffff;
}
VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
VkResult result;
at->enabled = true;
at->ht = _mesa_hash_table_create(NULL,
renderpass_key_hash,
renderpass_key_equals);
u_rwlock_init(&at->ht_lock);
at->results_bo = malloc(sizeof(struct tu_bo));
result = tu_bo_init_new(dev, at->results_bo,
sizeof(struct tu_autotune_results),
TU_BO_ALLOC_NO_FLAGS);
if (result != VK_SUCCESS) {
vk_startup_errorf(dev->instance, result, "Autotune BO init");
goto fail_bo;
}
result = tu_bo_map(dev, at->results_bo);
if (result != VK_SUCCESS) {
vk_startup_errorf(dev->instance, result, "Autotune BO map");
goto fail_map_bo;
}
at->results = at->results_bo->map;
list_inithead(&at->pending_results);
list_inithead(&at->pending_submission_cs);
return VK_SUCCESS;
fail_map_bo:
tu_bo_finish(dev, at->results_bo);
fail_bo:
free(at->results_bo);
u_rwlock_destroy(&at->ht_lock);
_mesa_hash_table_destroy(at->ht, NULL);
return result;
}
void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
#if TU_AUTOTUNE_LOG_AT_FINISH != 0
while (!list_is_empty(&at->pending_results)) {
process_results(at);
}
hash_table_foreach(at->ht, entry) {
struct tu_renderpass_history *history = entry->data;
mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u",
history->key, history->avg_samples, history->num_results);
}
#endif
tu_autotune_free_results(&at->pending_results);
hash_table_foreach(at->ht, entry) {
struct tu_renderpass_history *history = entry->data;
ralloc_free(history);
}
list_for_each_entry_safe(struct tu_submission_fence_cs, submission_cs,
&at->pending_submission_cs, node) {
tu_cs_finish(&submission_cs->cs);
free(submission_cs);
}
_mesa_hash_table_destroy(at->ht, NULL);
u_rwlock_destroy(&at->ht_lock);
tu_bo_finish(dev, at->results_bo);
free(at->results_bo);
}
bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count)
{
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
return true;
}
return false;
}
void
tu_autotune_free_results(struct list_head *results)
{
list_for_each_entry_safe(struct tu_renderpass_result, result,
results, node) {
ralloc_free(result);
}
}
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd_buffer)
{
if (cmd_buffer->state.drawcall_count > 5)
return false;
for (unsigned i = 0; i < pass->subpass_count; i++) {
if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
return false;
}
return true;
}
bool
tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
struct tu_renderpass_result **autotune_result)
{
const struct tu_render_pass *pass = cmd_buffer->state.pass;
const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
/* If we wanted to support buffers that could be submitted
* several times, we would have to copy the sample counts of renderpasses
* after each submission of such a buffer (like with u_trace support).
* This is rather messy, and since almost all apps use ONE_TIME_SUBMIT,
* we choose to unconditionally use the fallback.
*/
bool one_time_submit = cmd_buffer->usage_flags &
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
if (!at->enabled || !one_time_submit)
return fallback_use_bypass(pass, framebuffer, cmd_buffer);
/* We use a 64-bit hash as the key since we don't fear a rare hash collision:
* the worst that could happen is sysmem being selected when it should
* not have been, and with 64 bits that would be extremely rare.
*
* Q: Why not make the key from framebuffer + renderpass pointers?
* A: At least DXVK creates new framebuffers each frame while keeping
* renderpasses the same. Also we want to support replaying a single
* frame in a loop for testing.
*/
uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);
*autotune_result = create_history_result(at, renderpass_key);
uint32_t avg_samples = 0;
if (get_history(at, renderpass_key, &avg_samples)) {
/* TODO: we should account for loads/stores/clears/resolves, especially
* with a low drawcall count and ~fb_size samples passed; in D3D11 games
* we are seeing many renderpasses like:
* - color attachment load
* - single fullscreen draw
* - color attachment store
*/
/* A low sample count could mean there was only a clear, or there was
* a clear plus draws that touch no or few samples.
*/
if (avg_samples < 500) {
#if TU_AUTOTUNE_DEBUG_LOG != 0
mesa_logi("%016"PRIx64":%u\t avg_samples=%u selecting sysmem",
renderpass_key, cmd_buffer->state.drawcall_count, avg_samples);
#endif
return true;
}
/* Cost-per-sample is an estimate of the average number of reads +
* writes for a given passed sample.
*/
float sample_cost = cmd_buffer->state.total_drawcalls_cost;
sample_cost /= cmd_buffer->state.drawcall_count;
float single_draw_cost = (avg_samples * sample_cost) / cmd_buffer->state.drawcall_count;
bool select_sysmem = single_draw_cost < 6000.0;
#if TU_AUTOTUNE_DEBUG_LOG != 0
mesa_logi("%016"PRIx64":%u\t avg_samples=%u, "
"sample_cost=%f, single_draw_cost=%f selecting %s",
renderpass_key, cmd_buffer->state.drawcall_count, avg_samples,
sample_cost, single_draw_cost, select_sysmem ? "sysmem" : "gmem");
#endif
return select_sysmem;
}
return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}


@ -0,0 +1,187 @@
/*
* Copyright © 2021 Igalia S.L.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef TU_AUTOTUNE_H
#define TU_AUTOTUNE_H
#include "util/hash_table.h"
#include "util/list.h"
#include "util/rwlock.h"
#define autotune_offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))
#define autotune_results_ptr(at, member) \
(at->results_bo->iova + \
autotune_offset((at)->results, &(at)->results->member))
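/* For example (illustrative), autotune_results_ptr(at, result[n].samples_start)
* yields the GPU iova of results->result[n].samples_start within results_bo;
* this is where the ZPASS_DONE sample-counter writes are pointed, and
* autotune_results_ptr(at, fence) is used the same way for the submission fence.
*/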
struct tu_device;
struct tu_cmd_buffer;
struct tu_autotune_results;
struct tu_renderpass_history;
/**
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
* data about a given render target.
*
* In deciding which path to take there are tradeoffs, including some that
* cannot reasonably be estimated without some additional information:
*
* (1) If you know you are touching every pixel (i.e. there is a clear),
* then the GMEM path will at least not cost more memory bandwidth than
* sysmem[1]
*
* (2) If there is no clear, GMEM could potentially cost *more* bandwidth
* if there is a sysmem->GMEM restore pass.
*
* (3) If you see a high draw count, that is an indication that there will be
* enough pixels accessed multiple times to benefit from the reduced
* memory bandwidth that GMEM brings
*
* (4) But high draw count where there is not much overdraw can actually be
* faster in bypass mode if it is pushing a lot of state change, due to
* not having to go through the state changes per-tile[1]
*
* The approach taken is to measure the samples-passed for the batch in order
* to estimate the amount of overdraw and detect cases where the number of
* pixels touched is low.
*
* [1] ignoring early-tile-exit optimizations, but any draw that touches all/
* most of the tiles late in the tile-pass can defeat that
*/
struct tu_autotune {
/* We may have to disable the autotuner if there are too many
* renderpasses in flight.
*/
bool enabled;
/**
* Cache to map renderpass key to historical information about
* rendering to that particular render target.
*/
struct hash_table *ht;
struct u_rwlock ht_lock;
/**
* GPU buffer used to communicate back results to the CPU
*/
struct tu_bo *results_bo;
struct tu_autotune_results *results;
/**
* List of per-renderpass results that we are waiting for the GPU
* to finish with before reading back the results.
*/
struct list_head pending_results;
/**
* List of per-submission CS that we are waiting for the GPU
* to finish using.
*/
struct list_head pending_submission_cs;
uint32_t fence_counter;
uint32_t idx_counter;
};
#define TU_AUTOTUNE_MAX_RESULTS 256
/**
* The layout of the memory used to read back per-batch results from the
* GPU
*
* Note this struct is intentionally aligned to 4k, and the hw requires the
* sample start/stop locations to be 128b aligned.
*/
struct tu_autotune_results {
/**
* The GPU writes back a "fence" seqno value from the cmdstream after
* it finishes the submission, so that the CPU knows when
* results are valid.
*/
uint32_t fence;
uint32_t __pad0;
uint64_t __pad1;
/**
* From the cmdstream, the captured samples-passed values are recorded
* at the start and end of the batch.
*
* Note that we do the math on the CPU to avoid a WFI. But pre-emption
* may force us to revisit that.
*/
struct {
uint64_t samples_start;
uint64_t __pad0;
uint64_t samples_end;
uint64_t __pad1;
} result[TU_AUTOTUNE_MAX_RESULTS];
};
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/**
* The index/slot in tu_autotune_results::result[] to write start/end
* counter to
*/
unsigned idx;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
uint32_t fence;
uint64_t samples_passed;
};
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
bool tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct list_head *results);
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
/**
* A magic 8-ball that tells the gmem code whether we should do bypass mode
* for moar fps.
*/
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
struct tu_autotune *at,
struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
#endif /* TU_AUTOTUNE_H */


@ -592,7 +592,8 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
}
static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd)
use_sysmem_rendering(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result **autotune_result)
{
if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
return true;
@ -615,7 +616,13 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd)
if (cmd->state.disable_gmem)
return true;
return false;
bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
cmd, autotune_result);
if (*autotune_result) {
list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
}
return use_sysmem;
}
static void
@ -1210,7 +1217,50 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
}
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_autotune_begin(struct tu_cs *cs, struct tu_autotune *at,
const struct tu_renderpass_result *autotune_result)
{
if (!autotune_result)
return;
uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS;
uint64_t begin_iova = autotune_results_ptr(at, result[result_idx].samples_start);
tu_cs_emit_regs(cs,
A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
tu_cs_emit_regs(cs,
A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
tu_cs_emit(cs, ZPASS_DONE);
}
static void
tu6_autotune_end(struct tu_cs *cs, struct tu_autotune *at,
const struct tu_renderpass_result *autotune_result)
{
if (!autotune_result)
return;
uint32_t result_idx = autotune_result->idx % TU_AUTOTUNE_MAX_RESULTS;
uint64_t end_iova = autotune_results_ptr(at, result[result_idx].samples_end);
tu_cs_emit_regs(cs,
A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
tu_cs_emit_regs(cs,
A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
tu_cs_emit(cs, ZPASS_DONE);
/* A fence will be emitted at submission time */
}
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_renderpass_result *autotune_result)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
@ -1240,12 +1290,17 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
tu_cs_emit(cs, 0x0);
tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result);
tu_cs_sanity_check(cs);
}
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_renderpass_result *autotune_result)
{
tu6_autotune_end(cs, &cmd->device->autotune, autotune_result);
/* Do any resolves of the last subpass. These are handled in the
* tile_store_cs in the gmem path.
*/
@ -1262,7 +1317,8 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
}
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_renderpass_result *autotune_result)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
@ -1312,6 +1368,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
}
tu6_autotune_begin(cs, &cmd->device->autotune, autotune_result);
tu_cs_sanity_check(cs);
}
@ -1340,8 +1398,11 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
}
static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
const struct tu_renderpass_result *autotune_result)
{
tu6_autotune_end(cs, &cmd->device->autotune, autotune_result);
tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
tu_cs_emit_regs(cs,
@ -1355,11 +1416,12 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
}
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
const struct tu_renderpass_result *autotune_result)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
tu6_tile_render_begin(cmd, &cmd->cs);
tu6_tile_render_begin(cmd, &cmd->cs, autotune_result);
uint32_t pipe = 0;
for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
@ -1381,7 +1443,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
}
}
tu6_tile_render_end(cmd, &cmd->cs);
tu6_tile_render_end(cmd, &cmd->cs, autotune_result);
trace_end_render_pass(&cmd->trace, &cmd->cs, fb);
@ -1391,9 +1453,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
}
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
const struct tu_renderpass_result *autotune_result)
{
tu6_sysmem_render_begin(cmd, &cmd->cs);
tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result);
trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs);
@ -1401,7 +1464,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
tu6_sysmem_render_end(cmd, &cmd->cs);
tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result);
trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer);
}
@ -1442,7 +1505,9 @@ tu_create_cmd_buffer(struct tu_device *device,
cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
}
u_trace_init(&cmd_buffer->trace, &device->trace_context);
list_inithead(&cmd_buffer->renderpass_autotune_results);
tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
@ -1468,6 +1533,8 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
u_trace_fini(&cmd_buffer->trace);
tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
if (cmd_buffer->descriptors[i].push_set.layout)
tu_descriptor_set_layout_unref(cmd_buffer->device,
@ -1492,6 +1559,8 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
tu_cs_reset(&cmd_buffer->sub_cs);
tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
if (cmd_buffer->descriptors[i].push_set.layout)
@ -3818,6 +3887,15 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
{
const struct tu_pipeline *pipeline = cmd->state.pipeline;
/* Fill draw stats for autotuner */
cmd->state.drawcall_count++;
cmd->state.total_drawcalls_cost += cmd->state.pipeline->drawcall_base_cost;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
cmd->state.total_drawcalls_cost++;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
cmd->state.total_drawcalls_cost++;
tu_emit_cache_flush_renderpass(cmd, cs);
bool primitive_restart_enabled = pipeline->ia.primitive_restart;
@ -4584,10 +4662,11 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
if (use_sysmem_rendering(cmd_buffer))
tu_cmd_render_sysmem(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
if (use_sysmem_rendering(cmd_buffer, &autotune_result))
tu_cmd_render_sysmem(cmd_buffer, autotune_result);
else
tu_cmd_render_tiles(cmd_buffer);
tu_cmd_render_tiles(cmd_buffer, autotune_result);
/* Outside of renderpasses we assume all draw states are disabled. We do
* this outside the draw CS for the normal case where 3d gmem stores aren't
@ -4617,6 +4696,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
cmd_buffer->state.has_tess = false;
cmd_buffer->state.has_subpass_predication = false;
cmd_buffer->state.disable_gmem = false;
cmd_buffer->state.drawcall_count = 0;
cmd_buffer->state.total_drawcalls_cost = 0;
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;


@ -1810,6 +1810,11 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->mem_cache = tu_pipeline_cache_from_handle(pc);
result = tu_autotune_init(&device->autotune, device);
if (result != VK_SUCCESS) {
goto fail_timeline_cond;
}
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
@ -1891,6 +1896,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
free(device->perfcntrs_pass_cs);
}
tu_autotune_fini(&device->autotune, device);
pthread_cond_destroy(&device->timeline_cond);
vk_free(&device->vk.alloc, device->bo_list);
vk_free(&device->vk.alloc, device->bo_idx);


@ -53,6 +53,8 @@ struct tu_queue_submit
uint32_t nr_out_syncobjs;
uint32_t entry_count;
uint32_t perf_pass_index;
bool autotune_fence;
};
struct tu_u_trace_syncobj
@ -746,8 +748,14 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
}
}
memset(new_submit, 0, sizeof(struct tu_queue_submit));
new_submit->autotune_fence =
tu_autotune_submit_requires_fence(cmd_buffers, vk_submit->command_buffer_count);
if (new_submit->autotune_fence)
entry_count++;
new_submit->cmds = vk_zalloc(&queue->device->vk.alloc,
entry_count * sizeof(*new_submit->cmds), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
@ -818,9 +826,26 @@ tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit)
}
static void
tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
struct tu_queue_submit *submit)
tu_fill_msm_gem_submit(struct tu_device *dev,
struct drm_msm_gem_submit_cmd *cmd,
struct tu_cs_entry *cs_entry)
{
cmd->type = MSM_SUBMIT_CMD_BUF;
cmd->submit_idx =
dev->bo_idx[cs_entry->bo->gem_handle];
cmd->submit_offset = cs_entry->offset;
cmd->size = cs_entry->size;
cmd->pad = 0;
cmd->nr_relocs = 0;
cmd->relocs = 0;
}
static void
tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
struct tu_queue_submit *submit,
struct tu_cs *autotune_cs)
{
struct tu_device *dev = queue->device;
struct drm_msm_gem_submit_cmd *cmds = submit->cmds;
struct vk_command_buffer **vk_cmd_buffers = submit->vk_submit->command_buffers;
@ -836,45 +861,27 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
struct tu_cs_entry *perf_cs_entry =
&dev->perfcntrs_pass_cs_entries[submit->perf_pass_index];
cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
cmds[entry_idx].submit_idx =
dev->bo_idx[perf_cs_entry->bo->gem_handle];
cmds[entry_idx].submit_offset = perf_cs_entry->offset;
cmds[entry_idx].size = perf_cs_entry->size;
cmds[entry_idx].pad = 0;
cmds[entry_idx].nr_relocs = 0;
cmds[entry_idx++].relocs = 0;
tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry);
}
for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
cmds[entry_idx].submit_idx =
dev->bo_idx[cs->entries[i].bo->gem_handle];
cmds[entry_idx].submit_offset = cs->entries[i].offset;
cmds[entry_idx].size = cs->entries[i].size;
cmds[entry_idx].pad = 0;
cmds[entry_idx].nr_relocs = 0;
cmds[entry_idx].relocs = 0;
tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]);
}
if (submit->u_trace_submission_data) {
struct tu_cs *ts_cs =
submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs;
if (ts_cs) {
cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
cmds[entry_idx].submit_idx =
queue->device->bo_idx[ts_cs->entries[0].bo->gem_handle];
assert(cmds[entry_idx].submit_idx < queue->device->bo_count);
cmds[entry_idx].submit_offset = ts_cs->entries[0].offset;
cmds[entry_idx].size = ts_cs->entries[0].size;
cmds[entry_idx].pad = 0;
cmds[entry_idx].nr_relocs = 0;
cmds[entry_idx++].relocs = 0;
tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]);
}
}
}
if (autotune_cs) {
assert(autotune_cs->entry_count == 1);
tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]);
entry_idx++;
}
}
static VkResult
@ -882,6 +889,15 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
{
queue->device->submit_count++;
struct tu_cs *autotune_cs = NULL;
if (submit->autotune_fence) {
struct tu_cmd_buffer **cmd_buffers = (void *)submit->vk_submit->command_buffers;
autotune_cs = tu_autotune_on_submit(queue->device,
&queue->device->autotune,
cmd_buffers,
submit->vk_submit->command_buffer_count);
}
uint32_t flags = MSM_PIPE_3D0;
if (submit->vk_submit->wait_count)
@ -896,7 +912,7 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
* time when bo_mutex is not locked. So we build submit cmds here the real
* place to submit.
*/
tu_queue_build_msm_gem_submit_cmds(queue, submit);
tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs);
struct drm_msm_gem_submit req = {
.flags = flags,


@ -358,6 +358,10 @@ tu_QueueSubmit(VkQueue _queue,
entry_count++;
}
struct tu_cmd_buffer **cmd_buffers = (void *)submit->pCommandBuffers;
if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferCount))
entry_count++;
max_entry_count = MAX2(max_entry_count, entry_count);
}
@ -404,6 +408,22 @@ tu_QueueSubmit(VkQueue _queue,
}
}
struct tu_cmd_buffer **cmd_buffers = (void *)submit->pCommandBuffers;
if (tu_autotune_submit_requires_fence(cmd_buffers, submit->commandBufferCount)) {
struct tu_cs *autotune_cs =
tu_autotune_on_submit(queue->device,
&queue->device->autotune,
cmd_buffers,
submit->commandBufferCount);
cmds[entry_idx++] = (struct kgsl_command_object) {
.offset = autotune_cs->entries[0].offset,
.gpuaddr = autotune_cs->entries[0].bo->iova,
.size = autotune_cs->entries[0].size,
.flags = KGSL_CMDLIST_IB,
.id = autotune_cs->entries[0].bo->gem_handle,
};
}
struct tu_syncobj s = sync_merge(submit->pWaitSemaphores,
submit->waitSemaphoreCount,
true, true);


@ -1576,6 +1576,9 @@ tu6_emit_fs_outputs(struct tu_cs *cs,
(fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
pipeline->lrz.force_late_z = true;
}
pipeline->drawcall_base_cost +=
util_bitcount(fs_render_components) / util_bitcount(0xf);
}
}
@ -3121,6 +3124,10 @@ tu_pipeline_builder_parse_multisample_and_color_blend(
if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) {
pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
}
if (blendAttachment.blendEnable) {
pipeline->drawcall_base_cost++;
}
}
}


@ -77,6 +77,7 @@
#include "perfcntrs/freedreno_perfcntr.h"
#include "tu_descriptor_set.h"
#include "tu_autotune.h"
#include "tu_util.h"
#include "tu_perfetto.h"
@ -462,6 +463,8 @@ struct tu_device
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct tu_autotune autotune;
#ifdef ANDROID
const void *gralloc;
enum {
@ -1063,6 +1066,35 @@ struct tu_cmd_state
bool disable_gmem;
enum a5xx_line_mode line_mode;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* cost += num_frag_outputs;
* if (blend_enabled)
* cost += num_blend_enabled;
* if (depth_test_enabled)
* cost++;
* if (depth_write_enabled)
* cost++;
* }
*
* The idea is that each sample-passed minimally does one write
* per MRT. If blend is enabled, the hw will additionally do
* a framebuffer read per sample-passed (for each MRT with blend
* enabled). If depth-test is enabled, the hw will additionally do
* a depth buffer read. If depth-write is enabled, the hw will
* additionally do a depth buffer write.
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t total_drawcalls_cost;
struct tu_lrz_state lrz;
struct tu_draw_state depth_plane_state;
@ -1102,6 +1134,8 @@ struct tu_cmd_buffer
struct u_trace_iterator trace_renderpass_start;
struct u_trace_iterator trace_renderpass_end;
struct list_head renderpass_autotune_results;
VkCommandBufferUsageFlags usage_flags;
VkCommandBufferLevel level;
enum tu_cmd_buffer_status status;
@ -1300,6 +1334,9 @@ struct tu_pipeline
struct tu_lrz_pipeline lrz;
/* Base drawcall cost for sysmem vs gmem autotuner */
uint8_t drawcall_base_cost;
void *executables_mem_ctx;
/* tu_pipeline_executable */
struct util_dynarray executables;