/*
 * Copyright © 2021 Igalia S.L.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "tu_autotune.h"
#include "tu_private.h"
#include "tu_cs.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the sample count before and after the renderpass in
 *   GPU memory.
 * - To store the values, each command buffer holds GPU memory which
 *   grows as more renderpasses are recorded into it.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 * - Later on, the tu_renderpass_result is added to the
 *   tu_renderpass_history entry which aggregates results for a
 *   given renderpass.
 * - On submission:
 *   - Process results whose fence has been signalled.
 *   - Free per-submission data we no longer need.
 *
 *   - Create a command stream to write a fence value, so we know
 *     when it is safe to read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources, since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_results to
 *       the queue.
 *
 * Since the command buffers could be recorded on different threads
 * we have to maintain some amount of locking around the history table.
 * However, the table is only modified from a single thread at
 * submission time, so in most cases there will be no lock contention.
 */
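
/* For reference, a rough sketch of the per-renderpass sample buffer this
 * file reads and writes (the actual definition lives elsewhere in the
 * driver; treat the exact field types here as an assumption):
 *
 *    struct tu_renderpass_samples {
 *       uint64_t samples_start;   // written by ZPASS_DONE at renderpass begin
 *       uint64_t samples_end;     // written by ZPASS_DONE at renderpass end
 *    };
 *
 * samples_passed is then computed as samples_end - samples_start.
 */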

void tu_autotune_free_results_locked(struct tu_device *dev,
                                     struct list_head *results);

#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries on autotuner finish,
 * could be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we keep renderpass stats. */
#define MAX_HISTORY_LIFETIME 128

/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Used to prune old history entries */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_results
    */
   struct list_head results;
   uint32_t num_results;
   uint32_t avg_samples;
};

/* Holds the per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
   uint32_t buffers_count;
};

static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at)
{
   struct tu_submission_data *submission_data =
      calloc(1, sizeof(struct tu_submission_data));
   submission_data->fence = at->fence_counter;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_init(fence_cs, dev, TU_CS_MODE_GROW, 5);
   tu_cs_begin(fence_cs);

   tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
   tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
   tu_cs_emit(fence_cs, at->fence_counter);

   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

#define APPEND_TO_HASH(state, field) \
   XXH64_update(state, &field, sizeof(field));

static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd)
{
   XXH64_state_t hash_state;
   XXH64_reset(&hash_state, 0);

   APPEND_TO_HASH(&hash_state, framebuffer->width);
   APPEND_TO_HASH(&hash_state, framebuffer->height);
   APPEND_TO_HASH(&hash_state, framebuffer->layers);

   APPEND_TO_HASH(&hash_state, pass->attachment_count);
   XXH64_update(&hash_state, pass->attachments,
                pass->attachment_count * sizeof(pass->attachments[0]));

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.format);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.array_layers);
      APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.mip_levels);
   }

   APPEND_TO_HASH(&hash_state, pass->subpass_count);
   for (unsigned i = 0; i < pass->subpass_count; i++) {
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
      APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
   }

   return XXH64_digest(&hash_state);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}
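
/* Look up the running average of samples passed for a given renderpass key.
 * Only the read lock is taken; returns false if there is no usable history
 * for this key yet.
 */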
static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is ever observed in the wild here,
    * we could use try_lock instead.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result = calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                   struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);

      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

static void
process_results(struct tu_autotune *at)
{
   struct tu_device *dev = at->device;
   struct tu6_global *global = dev->global_bo->map;
   uint32_t current_fence = global->autotune_fence;

   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (result->fence > current_fence)
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (submission_data->fence > current_fence)
         break;

      free_submission_data(submission_data);
   }
}

static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
         VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                      &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy = malloc(sizeof(*result));
         *copy = *result;
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}
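
/* Called once per queue submission, from a single thread:
 *  - processes results whose fence has already signalled,
 *  - creates or updates history entries for the new results and tags them
 *    with the new fence value,
 *  - moves or copies the results from the command buffers to the autotuner,
 *  - prunes history entries not seen for MAX_HISTORY_LIFETIME submissions,
 *  - returns the command stream that writes the new fence value.
 */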
struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

   process_results(at);

   /* pre-increment so zero isn't a valid fence */
   uint32_t new_fence = ++at->fence_counter;

   uint32_t result_buffers = 0;

   /* Create history entries here to minimize the work and locking being
    * done on renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history = calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }

      if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
         result_buffers++;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at);
   submission_data->buffers_count = result_buffers;

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Clean up old entries from the history table. The assumption
    * here is that the application doesn't hold many old unsubmitted
    * command buffers, otherwise this table may grow large.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      if (history->last_fence == 0 ||
          (new_fence - history->last_fence) <= MAX_HISTORY_LIFETIME)
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016"PRIx64"", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         process_results(at);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history = entry->data;

         mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u",
                   history->key, history->avg_samples,
                   history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history = entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}
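
/* Whether this submission needs the fence CS from tu_autotune_on_submit:
 * true if any of the command buffers recorded autotune results, so a fence
 * value has to be written before they can be read back.
 */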
bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* sample count times the average drawcall_bandwidth_per_sample */
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum /
      state->rp.drawcall_count;
}
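
/* A worked example of the bandwidth comparison done below, with purely
 * hypothetical numbers (no real hardware tuning data implied):
 *
 *   render area 1920x1080            -> pass_pixel_count = 2,073,600
 *   sysmem_bandwidth_per_pixel = 12  -> sysmem_bandwidth  = 24,883,200
 *   gmem_bandwidth_per_pixel   = 8   -> gmem_bandwidth    = 16,588,800
 *   avg_samples = 1,000,000, average drawcall_bandwidth_per_sample = 4
 *                                    -> total_draw_call_bandwidth = 4,000,000
 *
 *   sysmem: 24,883,200 + 4,000,000             = 28,883,200
 *   gmem:   (16,588,800 * 11 + 4,000,000) / 10 = 18,647,680
 *
 *   sysmem > gmem, so GMEM (tiled) rendering would be selected here.
 */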
bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      const struct tu_subpass *subpass = &pass->subpasses[i];
      /* GMEM works much faster in this case */
      if (subpass->raster_order_attachment_access)
         return false;

      /* Would be very slow in sysmem mode because we have to enable
       * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE)
       */
      if (subpass->feedback_loop_color || subpass->feedback_loop_ds)
         return false;
   }

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * the results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64bit hash as the key since we don't fear the rare hash
    * collision: the worst that could happen is sysmem being selected when
    * it shouldn't have been, and with 64 bits that would be extremely rare.
    *
    * Q: Why not make the key from the framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    the renderpasses the same. Also we want to support replaying a
    *    single frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to
       * ignore them completely. The state changes between tiles also have
       * an overhead. The magic numbers of 11 and 10 are randomly chosen.
       */
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
                   renderpass_key,
                   cmd_buffer->state.rp.drawcall_count,
                   select_sysmem ? "sysmem" : "gmem");
         mesa_logi("  avg_samples=%u, draw_bandwidth_per_sample=%.2f, "
                   "total_draw_call_bandwidth=%" PRIu64,
                   avg_samples,
                   drawcall_bandwidth_per_sample,
                   total_draw_call_bandwidth);
         mesa_logi("  render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, "
                   "gmem_bandwidth_per_pixel=%u",
                   extent->width, extent->height,
                   pass->sysmem_bandwidth_per_pixel,
                   pass->gmem_bandwidth_per_pixel);
         mesa_logi("  sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
                   sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo,
                                       &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova +
                          offsetof(struct tu_renderpass_samples, samples_end);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}