radv: Add caching for RT pipelines.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12592>
Commit authored by Bas Nieuwenhuizen on 2021-08-27 04:20:59 +02:00, committed by Marge Bot.
parent a22a4162d9
commit ca2d96db51
5 changed files with 139 additions and 49 deletions

View File

@ -212,8 +212,8 @@ radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline,
radv_pipeline_destroy(device, pipeline, pAllocator);
}
static uint32_t
get_hash_flags(const struct radv_device *device, bool stats)
uint32_t
radv_get_hash_flags(const struct radv_device *device, bool stats)
{
uint32_t hash_flags = 0;
@ -3348,7 +3348,7 @@ VkResult
radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
struct radv_pipeline_cache *cache, const struct radv_pipeline_key *pipeline_key,
const VkPipelineShaderStageCreateInfo **pStages,
const VkPipelineCreateFlags flags,
const VkPipelineCreateFlags flags, const uint8_t *custom_hash,
VkPipelineCreationFeedbackEXT *pipeline_feedback,
VkPipelineCreationFeedbackEXT **stage_feedbacks)
{
@ -3368,6 +3368,9 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) ||
device->keep_shader_info;
bool disable_optimizations = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT;
struct radv_pipeline_shader_stack_size **stack_sizes =
pipeline->type == RADV_PIPELINE_COMPUTE ? &pipeline->compute.rt_stack_sizes : NULL;
uint32_t *num_stack_sizes = stack_sizes ? &pipeline->compute.group_count : NULL;
radv_start_feedback(pipeline_feedback);
@ -3384,8 +3387,12 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
}
}
radv_hash_shaders(hash, pStages, pipeline->layout, pipeline_key,
get_hash_flags(device, keep_statistic_info));
if (custom_hash)
memcpy(hash, custom_hash, 20);
else {
radv_hash_shaders(hash, pStages, pipeline->layout, pipeline_key,
radv_get_hash_flags(device, keep_statistic_info));
}
memcpy(gs_copy_hash, hash, 20);
gs_copy_hash[0] ^= 1;
@ -3394,13 +3401,14 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
bool found_in_application_cache = true;
if (modules[MESA_SHADER_GEOMETRY] && !keep_executable_info) {
struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0};
radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants,
&found_in_application_cache);
radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants, NULL,
NULL, &found_in_application_cache);
pipeline->gs_copy_shader = variants[MESA_SHADER_GEOMETRY];
}
if (!keep_executable_info &&
radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders,
stack_sizes, num_stack_sizes,
&found_in_application_cache) &&
(!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader)) {
radv_stop_feedback(pipeline_feedback, found_in_application_cache);
@ -3629,7 +3637,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
gs_binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary;
gs_variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader;
radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries);
radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries,
NULL, 0);
pipeline->gs_copy_shader = gs_variants[MESA_SHADER_GEOMETRY];
}
@ -3698,7 +3707,9 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
}
if (!keep_executable_info) {
radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries);
radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries,
stack_sizes ? *stack_sizes : NULL,
num_stack_sizes ? *num_stack_sizes : 0);
}
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
@ -5519,7 +5530,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device,
struct radv_pipeline_key key =
radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend);
result = radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags,
result = radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, NULL,
pipeline_feedback, stage_feedbacks);
if (result != VK_SUCCESS)
return result;
@ -5746,7 +5757,9 @@ radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
VkResult
radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
const VkComputePipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
const VkAllocationCallbacks *pAllocator, const uint8_t *custom_hash,
struct radv_pipeline_shader_stack_size *rt_stack_sizes,
uint32_t rt_group_count, VkPipeline *pPipeline)
{
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
@ -5759,8 +5772,10 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (pipeline == NULL)
if (pipeline == NULL) {
free(rt_stack_sizes);
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
pipeline->type = RADV_PIPELINE_COMPUTE;
@ -5768,6 +5783,8 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
pipeline->device = device;
pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE;
pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout);
pipeline->compute.rt_stack_sizes = rt_stack_sizes;
pipeline->compute.group_count = rt_group_count;
assert(pipeline->layout);
const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
@ -5784,7 +5801,7 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo);
result = radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags,
pipeline_feedback, stage_feedbacks);
custom_hash, pipeline_feedback, stage_feedbacks);
if (result != VK_SUCCESS) {
radv_pipeline_destroy(device, pipeline, pAllocator);
return result;
@ -5813,8 +5830,8 @@ radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uin
unsigned i = 0;
for (; i < count; i++) {
VkResult r;
r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator,
&pPipelines[i]);
r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, NULL,
NULL, 0, &pPipelines[i]);
if (r != VK_SUCCESS) {
result = r;
pPipelines[i] = VK_NULL_HANDLE;

View File

@ -37,6 +37,7 @@ struct cache_entry {
uint32_t sha1_dw[5];
};
uint32_t binary_sizes[MESA_SHADER_STAGES];
uint32_t num_stack_sizes;
struct radv_shader_variant *variants[MESA_SHADER_STAGES];
char code[0];
};
@ -139,6 +140,39 @@ radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **s
_mesa_sha1_final(&ctx, hash);
}
/* Compute a SHA-1 cache key identifying a ray-tracing pipeline.
 *
 * The hash folds in everything that affects RT shader compilation: the
 * pipeline layout, every stage's module hash / entry point / specialization
 * constants, the shader-group table, the max recursion depth (only when the
 * stack size is not dynamic), and the device-level hash flags. The resulting
 * key is passed as `custom_hash` into radv_create_shaders() so the compiled
 * RT compute shader (and its per-group stack sizes) can be looked up in the
 * pipeline cache without first generating the NIR.
 *
 * hash:        out — receives the 20-byte SHA-1 digest.
 * pCreateInfo: the application's RT pipeline create info being hashed.
 * flags:       radv_get_hash_flags() result; mixed in so cache entries built
 *              under different debug/statistics settings do not collide.
 */
void
radv_hash_rt_shaders(unsigned char *hash, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
uint32_t flags)
{
RADV_FROM_HANDLE(radv_pipeline_layout, layout, pCreateInfo->layout);
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
/* The layout contributes its own precomputed SHA-1 (descriptor/push-constant
 * interface affects codegen). */
if (layout)
_mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
/* Per stage: module content hash, entry-point name, and (if present) the
 * specialization map entries plus the raw specialization data. */
for (uint32_t i = 0; i < pCreateInfo->stageCount; ++i) {
RADV_FROM_HANDLE(vk_shader_module, module, pCreateInfo->pStages[i].module);
const VkSpecializationInfo *spec_info = pCreateInfo->pStages[i].pSpecializationInfo;
_mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
_mesa_sha1_update(&ctx, pCreateInfo->pStages[i].pName, strlen(pCreateInfo->pStages[i].pName));
if (spec_info && spec_info->mapEntryCount) {
_mesa_sha1_update(&ctx, spec_info->pMapEntries,
spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
_mesa_sha1_update(&ctx, spec_info->pData, spec_info->dataSize);
}
}
/* The group table maps groups to stages, so it is part of the key. */
_mesa_sha1_update(&ctx, pCreateInfo->pGroups,
pCreateInfo->groupCount * sizeof(*pCreateInfo->pGroups));
/* Recursion depth only influences the compiled stack layout when the stack
 * size is baked in; with a dynamic stack size it must NOT perturb the key. */
if (!radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo))
_mesa_sha1_update(&ctx, &pCreateInfo->maxPipelineRayRecursionDepth, 4);
_mesa_sha1_update(&ctx, &flags, 4);
_mesa_sha1_final(&ctx, hash);
}
static struct cache_entry *
radv_pipeline_cache_search_unlocked(struct radv_pipeline_cache *cache, const unsigned char *sha1)
{
@ -253,11 +287,10 @@ radv_is_cache_disabled(struct radv_device *device)
}
bool
radv_create_shader_variants_from_pipeline_cache(struct radv_device *device,
struct radv_pipeline_cache *cache,
const unsigned char *sha1,
struct radv_shader_variant **variants,
bool *found_in_application_cache)
radv_create_shader_variants_from_pipeline_cache(
struct radv_device *device, struct radv_pipeline_cache *cache, const unsigned char *sha1,
struct radv_shader_variant **variants, struct radv_pipeline_shader_stack_size **stack_sizes,
uint32_t *num_stack_sizes, bool *found_in_application_cache)
{
struct cache_entry *entry;
@ -325,6 +358,14 @@ radv_create_shader_variants_from_pipeline_cache(struct radv_device *device,
memcpy(variants, entry->variants, sizeof(entry->variants));
if (num_stack_sizes) {
*num_stack_sizes = entry->num_stack_sizes;
if (entry->num_stack_sizes) {
*stack_sizes = malloc(entry->num_stack_sizes * sizeof(**stack_sizes));
memcpy(*stack_sizes, p, entry->num_stack_sizes * sizeof(**stack_sizes));
}
}
if (device->instance->debug_flags & RADV_DEBUG_NO_MEMORY_CACHE && cache == device->mem_cache)
vk_free(&cache->alloc, entry);
else {
@ -340,7 +381,9 @@ radv_create_shader_variants_from_pipeline_cache(struct radv_device *device,
void
radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipeline_cache *cache,
const unsigned char *sha1, struct radv_shader_variant **variants,
struct radv_shader_binary *const *binaries)
struct radv_shader_binary *const *binaries,
const struct radv_pipeline_shader_stack_size *stack_sizes,
uint32_t num_stack_sizes)
{
if (!cache)
cache = device->mem_cache;
@ -370,7 +413,7 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel
return;
}
size_t size = sizeof(*entry);
size_t size = sizeof(*entry) + sizeof(*stack_sizes) * num_stack_sizes;
for (int i = 0; i < MESA_SHADER_STAGES; ++i)
if (variants[i])
size += binaries[i]->total_size;
@ -398,6 +441,12 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel
p += binaries[i]->total_size;
}
if (num_stack_sizes) {
memcpy(p, stack_sizes, sizeof(*stack_sizes) * num_stack_sizes);
p += sizeof(*stack_sizes) * num_stack_sizes;
}
entry->num_stack_sizes = num_stack_sizes;
// Make valgrind happy by filling the alignment hole at the end.
assert(p == (char *)entry + size_without_align);
assert(sizeof(*entry) + (p - entry->code) == size_without_align);

View File

@ -22,6 +22,7 @@
*/
#include "radv_acceleration_structure.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader.h"
@ -1899,6 +1900,11 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache,
VkResult result;
struct radv_pipeline *pipeline = NULL;
struct radv_pipeline_shader_stack_size *stack_sizes = NULL;
uint8_t hash[20];
nir_shader *shader = NULL;
bool keep_statistic_info =
(pCreateInfo->flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) ||
(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) || device->keep_shader_info;
if (pCreateInfo->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR)
return radv_rt_pipeline_library_create(_device, _cache, pCreateInfo, pAllocator, pPipeline);
@ -1910,30 +1916,44 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache,
goto fail;
}
stack_sizes = calloc(sizeof(*stack_sizes), local_create_info.groupCount);
if (!stack_sizes) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail;
}
radv_hash_rt_shaders(hash, &local_create_info, radv_get_hash_flags(device, keep_statistic_info));
struct vk_shader_module module = {.base.type = VK_OBJECT_TYPE_SHADER_MODULE};
nir_shader *shader = create_rt_shader(device, &local_create_info, stack_sizes);
VkComputePipelineCreateInfo compute_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.pNext = NULL,
.flags = pCreateInfo->flags,
.flags = pCreateInfo->flags | VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT,
.stage =
{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(shader),
.module = vk_shader_module_to_handle(&module),
.pName = "main",
},
.layout = pCreateInfo->layout,
};
result = radv_compute_pipeline_create(_device, _cache, &compute_info, pAllocator, pPipeline);
if (result != VK_SUCCESS)
goto shader_fail;
/* First check if we can get things from the cache before we take the expensive step of
* generating the nir. */
result = radv_compute_pipeline_create(_device, _cache, &compute_info, pAllocator, hash,
stack_sizes, local_create_info.groupCount, pPipeline);
if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) {
stack_sizes = calloc(sizeof(*stack_sizes), local_create_info.groupCount);
if (!stack_sizes) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail;
}
shader = create_rt_shader(device, &local_create_info, stack_sizes);
module.nir = shader;
compute_info.flags = pCreateInfo->flags;
result = radv_compute_pipeline_create(_device, _cache, &compute_info, pAllocator, hash,
stack_sizes, local_create_info.groupCount, pPipeline);
stack_sizes = NULL;
if (result != VK_SUCCESS)
goto shader_fail;
}
pipeline = radv_pipeline_from_handle(*pPipeline);
pipeline->compute.rt_group_handles =
@ -1943,10 +1963,7 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache,
goto shader_fail;
}
pipeline->compute.rt_stack_sizes = stack_sizes;
stack_sizes = NULL;
pipeline->compute.dynamic_stack_size = has_dynamic_stack_size(pCreateInfo);
pipeline->compute.dynamic_stack_size = radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo);
for (unsigned i = 0; i < local_create_info.groupCount; ++i) {
const VkRayTracingShaderGroupCreateInfoKHR *group_info = &local_create_info.pGroups[i];

View File

@ -375,22 +375,21 @@ struct radv_pipeline_key {
struct radv_shader_binary;
struct radv_shader_variant;
struct radv_pipeline_shader_stack_size;
void radv_pipeline_cache_init(struct radv_pipeline_cache *cache, struct radv_device *device);
void radv_pipeline_cache_finish(struct radv_pipeline_cache *cache);
bool radv_pipeline_cache_load(struct radv_pipeline_cache *cache, const void *data, size_t size);
bool radv_create_shader_variants_from_pipeline_cache(struct radv_device *device,
struct radv_pipeline_cache *cache,
const unsigned char *sha1,
struct radv_shader_variant **variants,
bool *found_in_application_cache);
bool radv_create_shader_variants_from_pipeline_cache(
struct radv_device *device, struct radv_pipeline_cache *cache, const unsigned char *sha1,
struct radv_shader_variant **variants, struct radv_pipeline_shader_stack_size **stack_sizes,
uint32_t *num_stack_sizes, bool *found_in_application_cache);
void radv_pipeline_cache_insert_shaders(struct radv_device *device,
struct radv_pipeline_cache *cache,
const unsigned char *sha1,
struct radv_shader_variant **variants,
struct radv_shader_binary *const *binaries);
void radv_pipeline_cache_insert_shaders(
struct radv_device *device, struct radv_pipeline_cache *cache, const unsigned char *sha1,
struct radv_shader_variant **variants, struct radv_shader_binary *const *binaries,
const struct radv_pipeline_shader_stack_size *stack_sizes, uint32_t num_stack_sizes);
enum radv_blit_ds_layout {
RADV_BLIT_DS_LAYOUT_TILE_ENABLE,
@ -1690,6 +1689,11 @@ void radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInf
const struct radv_pipeline_layout *layout,
const struct radv_pipeline_key *key, uint32_t flags);
void radv_hash_rt_shaders(unsigned char *hash, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
uint32_t flags);
uint32_t radv_get_hash_flags(const struct radv_device *device, bool stats);
bool radv_rt_pipeline_has_dynamic_stack_size(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo);
#define RADV_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1)
@ -1819,6 +1823,7 @@ struct radv_pipeline {
struct radv_pipeline_group_handle *rt_group_handles;
struct radv_pipeline_shader_stack_size *rt_stack_sizes;
bool dynamic_stack_size;
uint32_t group_count;
} compute;
struct {
unsigned stage_count;
@ -1878,7 +1883,9 @@ VkResult radv_graphics_pipeline_create(VkDevice device, VkPipelineCache cache,
VkResult radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
const VkComputePipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipeline);
const uint8_t *custom_hash,
struct radv_pipeline_shader_stack_size *rt_stack_sizes,
uint32_t rt_group_count, VkPipeline *pPipeline);
void radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
const VkAllocationCallbacks *allocator);

View File

@ -449,7 +449,7 @@ void radv_destroy_shader_slabs(struct radv_device *device);
VkResult radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
struct radv_pipeline_cache *cache, const struct radv_pipeline_key *key,
const VkPipelineShaderStageCreateInfo **pStages,
const VkPipelineCreateFlags flags,
const VkPipelineCreateFlags flags, const uint8_t *custom_hash,
VkPipelineCreationFeedbackEXT *pipeline_feedback,
VkPipelineCreationFeedbackEXT **stage_feedbacks);