turnip: support multipass for performance query.

To support multipass, perf counters are queried in the following
steps:

0) There is a scratch register used to select the pass index for a
   perf counter query. Command streams that set each pass index in
   this register are prepared at device creation time. See
   tu_CreateDevice in tu_device.c
1) At begin/end query, emit command streams that read all requested
   perf counters for every pass, wrapped in CP_REG_TEST/
   CP_COND_REG_EXEC, which tests the scratch register holding the
   pass index.
2) At each submit, pick the command stream that sets the proper pass
   index in the register and prepend it to the command buffer.
3) If the pass index in the register matches, the command stream
   below CP_COND_REG_EXEC is executed (see the sketch below).
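
A minimal sketch of the pattern, using only helpers added in this
change (arguments abbreviated):

   /* per-pass cs, prepared once at device creation (step 0) */
   tu_cs_emit_regs(cs, A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG, 1 << pass));

   /* emitted in begin/end query for each pass (steps 1 and 3) */
   tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
   tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
                     REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
                  A6XX_CP_REG_TEST_0_BIT(pass) |
                  A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
   /* ... select and read this pass's counters ... */
   tu_cond_exec_end(cs);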

Support for kgsl still needs to be implemented.

Signed-off-by: Hyunjun Ko <zzoon@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6808>

src/freedreno/vulkan/tu_device.c

@@ -26,6 +26,7 @@
*/
#include "tu_private.h"
#include "tu_cs.h"
#include <fcntl.h>
#include <poll.h>
@@ -1000,6 +1001,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
VkResult result;
struct tu_device *device;
bool custom_border_colors = false;
bool perf_query_pools = false;
/* Check enabled features */
if (pCreateInfo->pEnabledFeatures) {
@@ -1024,6 +1026,12 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
custom_border_colors = border_color_features->customBorderColors;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
const VkPhysicalDevicePerformanceQueryFeaturesKHR *feature =
(VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext;
perf_query_pools = feature->performanceCounterQueryPools;
break;
}
default:
break;
}
@@ -1146,6 +1154,46 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
goto fail_pipeline_cache;
}
if (perf_query_pools) {
/* Prepare command streams that set the pass index in the PERF_CNTRS_REG
 * scratch register, one for each pass from 0 to 31. The right one is
 * picked up at command submit time when the perf query is executed.
 */
struct tu_cs *cs;
if (!(device->perfcntrs_pass_cs = calloc(1, sizeof(struct tu_cs)))) {
result = vk_startup_errorf(device->instance,
VK_ERROR_OUT_OF_HOST_MEMORY, "OOM");
goto fail_perfcntrs_pass_alloc;
}
device->perfcntrs_pass_cs_entries = calloc(32, sizeof(struct tu_cs_entry));
if (!device->perfcntrs_pass_cs_entries) {
result = vk_startup_errorf(device->instance,
VK_ERROR_OUT_OF_HOST_MEMORY, "OOM");
goto fail_perfcntrs_pass_entries_alloc;
}
cs = device->perfcntrs_pass_cs;
tu_cs_init(cs, device, TU_CS_MODE_SUB_STREAM, 96);
for (unsigned i = 0; i < 32; i++) {
struct tu_cs sub_cs;
result = tu_cs_begin_sub_stream(cs, 3, &sub_cs);
if (result != VK_SUCCESS) {
vk_startup_errorf(device->instance, result,
"failed to allocate commands streams");
goto fail_prepare_perfcntrs_pass_cs;
}
tu_cs_emit_regs(&sub_cs, A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG, 1 << i));
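/* CP_WAIT_FOR_ME here presumably ensures the scratch register write has
 * landed before anything later in the submit depends on it. */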
tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0);
device->perfcntrs_pass_cs_entries[i] = tu_cs_end_sub_stream(cs, &sub_cs);
}
}
device->mem_cache = tu_pipeline_cache_from_handle(pc);
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
@@ -1156,6 +1204,13 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
*pDevice = tu_device_to_handle(device);
return VK_SUCCESS;
fail_prepare_perfcntrs_pass_cs:
free(device->perfcntrs_pass_cs_entries);
tu_cs_finish(device->perfcntrs_pass_cs);
fail_perfcntrs_pass_entries_alloc:
free(device->perfcntrs_pass_cs);
fail_perfcntrs_pass_alloc:
tu_DestroyPipelineCache(tu_device_to_handle(device), pc, NULL);
fail_pipeline_cache:
fail_global_bo_map:
tu_bo_finish(device, &device->global_bo);
@@ -1200,6 +1255,12 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
VkPipelineCache pc = tu_pipeline_cache_to_handle(device->mem_cache);
tu_DestroyPipelineCache(tu_device_to_handle(device), pc, NULL);
if (device->perfcntrs_pass_cs) {
free(device->perfcntrs_pass_cs_entries);
tu_cs_finish(device->perfcntrs_pass_cs);
free(device->perfcntrs_pass_cs);
}
vk_free(&device->vk.alloc, device->bo_list);
vk_free(&device->vk.alloc, device->bo_idx);
vk_free(&device->vk.alloc, device);

src/freedreno/vulkan/tu_drm.c

@@ -638,6 +638,11 @@ tu_QueueSubmit(VkQueue _queue,
const VkSubmitInfo *submit = pSubmits + i;
const bool last_submit = (i == submitCount - 1);
uint32_t out_syncobjs_size = submit->signalSemaphoreCount;
const VkPerformanceQuerySubmitInfoKHR *perf_info =
vk_find_struct_const(pSubmits[i].pNext,
PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
if (last_submit && fence)
out_syncobjs_size += 1;
/* note: assuming there won't be any very large semaphore counts */
@@ -671,6 +676,10 @@ tu_QueueSubmit(VkQueue _queue,
uint32_t entry_count = 0;
for (uint32_t j = 0; j < submit->commandBufferCount; ++j) {
TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]);
if (perf_info)
entry_count++;
entry_count += cmdbuf->cs.entry_count;
}
@@ -681,6 +690,20 @@ tu_QueueSubmit(VkQueue _queue,
for (uint32_t j = 0; j < submit->commandBufferCount; ++j) {
TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]);
struct tu_cs *cs = &cmdbuf->cs;
if (perf_info) {
struct tu_cs_entry *perf_cs_entry =
&cmdbuf->device->perfcntrs_pass_cs_entries[perf_info->counterPassIndex];
cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
cmds[entry_idx].submit_idx =
queue->device->bo_idx[perf_cs_entry->bo->gem_handle];
cmds[entry_idx].submit_offset = perf_cs_entry->offset;
cmds[entry_idx].size = perf_cs_entry->size;
cmds[entry_idx].pad = 0;
cmds[entry_idx].nr_relocs = 0;
cmds[entry_idx++].relocs = 0;
}
for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
cmds[entry_idx].submit_idx =

src/freedreno/vulkan/tu_private.h

@@ -394,6 +394,10 @@ struct tu_device
uint32_t *bo_idx;
uint32_t bo_count, bo_list_size, bo_idx_size;
mtx_t bo_mutex;
/* Command streams to set pass index to a scratch reg */
struct tu_cs *perfcntrs_pass_cs;
struct tu_cs_entry *perfcntrs_pass_cs_entries;
};
VkResult _tu_device_set_lost(struct tu_device *device,
@@ -1503,6 +1507,17 @@ struct tu_render_pass
struct tu_subpass subpasses[0];
};
#define PERF_CNTRS_REG 4
struct tu_perf_query_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
uint32_t cntr_reg; /* counter register within the group */
uint32_t pass; /* pass index that countables can be requested */
uint32_t app_idx; /* index provided by apps */
};
struct tu_query_pool
{
struct vk_object_base base;
@@ -1517,7 +1532,7 @@ struct tu_query_pool
const struct fd_perfcntr_group *perf_group;
uint32_t perf_group_count;
uint32_t counter_index_count;
uint32_t counter_indices[0];
struct tu_perf_query_data perf_query_data[0];
};
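/* Note: perf_query_data[0] is a flexible-array-style tail; tu_CreateQueryPool
 * sizes the pool allocation with an extra
 * sizeof(struct tu_perf_query_data) * counterIndexCount (see tu_query.c).
 */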
uint32_t

src/freedreno/vulkan/tu_query.c

@@ -195,18 +195,13 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
{
uint32_t i;
/* TODO. we should handle multipass to be able to get all countables.
* Until then apps can only use the first n countables, where n == num_counters.
*
* See tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR.
*/
for (i = 0; i < group_count; i++) {
if (group[i].num_counters > index) {
if (group[i].num_countables > index) {
*gid = i;
*cid = index;
break;
}
index -= group[i].num_counters;
index -= group[i].num_countables;
assert(index >= 0);
}
@@ -214,6 +209,13 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
assert(i < group_count);
}
static int
compare_perfcntr_pass(const void *a, const void *b)
{
return ((struct tu_perf_query_data *)a)->pass -
((struct tu_perf_query_data *)b)->pass;
}
VkResult
tu_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,
@@ -249,8 +251,9 @@ tu_CreateQueryPool(VkDevice _device,
sizeof(struct perfcntr_query_slot) *
(perf_query_info->counterIndexCount - 1);
/* Size of the array pool->counter_indices */
pool_size += sizeof(uint32_t) * perf_query_info->counterIndexCount;
/* Size of the array pool->perf_query_data */
pool_size += sizeof(struct tu_perf_query_data) *
perf_query_info->counterIndexCount;
break;
}
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
@@ -272,8 +275,47 @@ tu_CreateQueryPool(VkDevice _device,
pool->counter_index_count = perf_query_info->counterIndexCount;
for (uint32_t i = 0; i < pool->counter_index_count; i++)
pool->counter_indices[i] = perf_query_info->pCounterIndices[i];
/* Build the data for all requested perf counters, so that we can map a
 * counter index provided by the application at each command submit to the
 * correct group id, countable id, counter register and pass index.
 *
 * Also, since this data will be sorted by pass index later, we keep the
 * original indices and store perfcntr results according to them, so apps
 * get correct results for the indices they provided.
 */
uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
uint32_t gid = 0, cid = 0;
perfcntr_index(pool->perf_group, pool->perf_group_count,
perf_query_info->pCounterIndices[i], &gid, &cid);
pool->perf_query_data[i].gid = gid;
pool->perf_query_data[i].cid = cid;
pool->perf_query_data[i].app_idx = i;
/* When the counter registers in a group are exhausted (there are
 * num_counters of them), start over at register 0 on the next pass.
 */
if (regs[gid] < pool->perf_group[gid].num_counters) {
pool->perf_query_data[i].cntr_reg = regs[gid]++;
pool->perf_query_data[i].pass = pass[gid];
} else {
pool->perf_query_data[i].pass = ++pass[gid];
pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
regs[gid]++;
}
}
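/* For example, if a group has num_counters == 4 and an app requests six
 * of its countables, entries 0-3 get cntr_reg 0-3 in pass 0, and entries
 * 4-5 get cntr_reg 0-1 in pass 1.
 */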
/* Sort by pass index so we can emit the per-pass blocks in ascending
 * pass order when building the command stream.
 */
qsort(pool->perf_query_data, pool->counter_index_count,
sizeof(pool->perf_query_data[0]),
compare_perfcntr_pass);
}
VkResult result = tu_bo_init_new(device, &pool->bo,
@@ -798,44 +840,89 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, begin_iova);
}
static void
emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
{
tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
A6XX_CP_REG_TEST_0_BIT(pass) |
A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
}
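/* Each per-pass stream prepared in tu_CreateDevice writes a one-hot mask
 * (1 << pass) to PERF_CNTRS_REG, so the test above matches for exactly one
 * pass value per submit and only that pass's block is executed.
 */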
static void
emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
uint32_t query)
{
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
uint32_t gid = 0, cid = 0;
uint32_t last_pass = ~0;
/* Querying perf counters happens in these steps:
 *
 *  0) There is a scratch register used to select the pass index for a
 *     perf counter query. Command streams that set each pass index in
 *     this register are prepared at device creation time.
 *     See tu_CreateDevice in tu_device.c
 *  1) At begin/end query, emit command streams that read all requested
 *     perf counters for every pass, wrapped in CP_REG_TEST/
 *     CP_COND_REG_EXEC, which tests the scratch register holding the
 *     pass index. See emit_perfcntrs_pass_start.
 *  2) At each submit, pick the command stream that sets the proper pass
 *     index in the register and prepend it to the command buffer.
 *     See tu_QueueSubmit in tu_drm.c
 *  3) If the pass index in the register matches, the command stream
 *     below CP_COND_REG_EXEC is executed.
 */
tu_cs_emit_wfi(cs);
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
perfcntr_index(pool->perf_group, pool->perf_group_count,
pool->counter_indices[i], &gid, &cid);
struct tu_perf_query_data *data = &pool->perf_query_data[i];
if (last_pass != data->pass) {
last_pass = data->pass;
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&pool->perf_group[gid].counters[cid];
&pool->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_countable *countable =
&pool->perf_group[gid].countables[cid];
&pool->perf_group[data->gid].countables[data->cid];
tu_cs_emit_pkt4(cs, counter->select_reg, 1);
tu_cs_emit(cs, countable->selector);
}
tu_cond_exec_end(cs);
last_pass = ~0;
tu_cs_emit_wfi(cs);
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
perfcntr_index(pool->perf_group, pool->perf_group_count,
pool->counter_indices[i], &gid, &cid);
struct tu_perf_query_data *data = &pool->perf_query_data[i];
if (last_pass != data->pass) {
last_pass = data->pass;
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&pool->perf_group[gid].counters[cid];
uint64_t begin_iova = perf_query_iova(pool, query, begin, i);
&pool->perf_group[data->gid].counters[data->cntr_reg];
uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, begin_iova);
}
tu_cond_exec_end(cs);
}
static void
@@ -1035,39 +1122,56 @@ emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
uint32_t query)
{
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
uint64_t begin_iova;
uint64_t end_iova;
uint64_t available_iova = query_available_iova(pool, query);
uint64_t end_iova;
uint64_t begin_iova;
uint64_t result_iova;
uint32_t gid = 0, cid = 0;
tu_cs_emit_wfi(cs);
uint32_t last_pass = ~0;
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
perfcntr_index(pool->perf_group, pool->perf_group_count,
pool->counter_indices[i], &gid, &cid);
struct tu_perf_query_data *data = &pool->perf_query_data[i];
if (last_pass != data->pass) {
last_pass = data->pass;
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&pool->perf_group[gid].counters[cid];
end_iova = perf_query_iova(pool, query, end, i);
&pool->perf_group[data->gid].counters[data->cntr_reg];
end_iova = perf_query_iova(pool, 0, end, data->app_idx);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, end_iova);
}
tu_cond_exec_end(cs);
last_pass = ~0;
tu_cs_emit_wfi(cs);
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
perfcntr_index(pool->perf_group, pool->perf_group_count,
pool->counter_indices[i], &gid, &cid);
struct tu_perf_query_data *data = &pool->perf_query_data[i];
result_iova = query_result_iova(pool, query,
struct perfcntr_query_slot, i);
begin_iova = perf_query_iova(pool, query, begin, i);
end_iova = perf_query_iova(pool, query, end, i);
if (last_pass != data->pass) {
last_pass = data->pass;
if (data->pass != 0)
tu_cond_exec_end(cs);
emit_perfcntrs_pass_start(cs, data->pass);
}
result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
data->app_idx);
begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
end_iova = perf_query_iova(pool, 0, end, data->app_idx);
/* result += end - begin */
tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
CP_MEM_TO_MEM_0_DOUBLE |
@@ -1078,6 +1182,7 @@ emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, end_iova);
tu_cs_emit_qw(cs, begin_iova);
}
tu_cond_exec_end(cs);
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
@@ -1324,11 +1429,8 @@ tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);
/* TODO. we should handle multipass to be able to get all countables.
* Until then apps can only use the first n countables, where n == num_counters.
*/
for (int i = 0; i < group_count; i++) {
for (int j = 0; j < group[i].num_counters; j++) {
for (int j = 0; j < group[i].num_countables; j++) {
vk_outarray_append(&out, counter) {
counter->scope = VK_QUERY_SCOPE_COMMAND_BUFFER_KHR;
@@ -1366,8 +1468,28 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
uint32_t* pNumPasses)
{
/* TODO. Should support handling multipass. */
TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
uint32_t group_count = 0;
uint32_t gid = 0, cid = 0, n_passes;
const struct fd_perfcntr_group *group =
fd_perfcntrs(phydev->gpu_id, &group_count);
uint32_t counters_requested[group_count];
memset(counters_requested, 0x0, sizeof(counters_requested));
*pNumPasses = 1;
for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
perfcntr_index(group, group_count,
pPerformanceQueryCreateInfo->pCounterIndices[i],
&gid, &cid);
counters_requested[gid]++;
}
for (uint32_t i = 0; i < group_count; i++) {
n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
*pNumPasses = MAX2(*pNumPasses, n_passes);
}
}
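/* Worked example: if a single group exposes num_counters == 4 and an app
 * requests 9 of its countables, DIV_ROUND_UP(9, 4) == 3, so *pNumPasses
 * is 3.
 */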
VkResult