anv: Compile ray-tracing shaders

This doesn't look too different from other compile functions we have in
anv_pipeline.c.  The primary difference is that ray-tracing pipelines
have this weird two-stage thing where you have "stages" which are
individual shaders and "groups" which are sort of mini pipelines that
are used to handle hits.  For any given ray intersection, only the hit
and intersection shaders from the same group get used together.  You
can't have an intersection shader from group A used with an any-hit from
group B.  This results in a weird two-step compile.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8637>
This commit is contained in:
Jason Ekstrand 2021-01-21 15:28:28 -06:00 committed by Marge Bot
parent 379b9bb7b0
commit e104555851
2 changed files with 327 additions and 11 deletions

View File

@ -34,6 +34,7 @@
#include "common/intel_sample_positions.h"
#include "anv_private.h"
#include "compiler/brw_nir.h"
#include "compiler/brw_nir_rt.h"
#include "anv_nir.h"
#include "nir/nir_xfb_info.h"
#include "spirv/nir_spirv.h"
@ -154,6 +155,7 @@ anv_shader_compile_to_nir(struct anv_device *device,
.post_depth_coverage = pdevice->info.ver >= 9,
.runtime_descriptor_array = true,
.float_controls = pdevice->info.ver >= 8,
.ray_tracing = pdevice->info.has_ray_tracing,
.shader_clock = true,
.shader_viewport_index_layer = true,
.stencil_export = pdevice->info.ver >= 9,
@ -245,7 +247,8 @@ anv_shader_compile_to_nir(struct anv_device *device,
NIR_PASS_V(nir, nir_split_per_member_structs);
NIR_PASS_V(nir, nir_remove_dead_variables,
nir_var_shader_in | nir_var_shader_out | nir_var_system_value,
nir_var_shader_in | nir_var_shader_out | nir_var_system_value |
nir_var_shader_call_data | nir_var_ray_hit_attrib,
NULL);
NIR_PASS_V(nir, nir_propagate_invariant, false);
@ -603,6 +606,17 @@ populate_cs_prog_key(const struct intel_device_info *devinfo,
}
}
/* Fill out the compile key for a bindless (ray-tracing) shader stage.
 *
 * Unlike the graphics/compute key helpers above, there is no stage-specific
 * state to capture yet, so the key is just the common base key.
 */
static void
populate_bs_prog_key(const struct intel_device_info *devinfo,
                     VkPipelineShaderStageCreateFlags flags,
                     bool robust_buffer_access,
                     struct brw_bs_prog_key *key)
{
   /* memset (rather than a designated initializer) so that any padding
    * bytes are zeroed too; the key gets hashed for the pipeline cache.
    */
   memset(key, 0, sizeof *key);

   populate_base_prog_key(devinfo, flags, robust_buffer_access, &key->base);
}
struct anv_pipeline_stage {
gl_shader_stage stage;
@ -634,6 +648,8 @@ struct anv_pipeline_stage {
VkPipelineCreationFeedbackEXT feedback;
const unsigned *code;
struct anv_shader_bin *bin;
};
static void
@ -2404,6 +2420,265 @@ anv_graphics_pipeline_init(struct anv_graphics_pipeline *pipeline,
return VK_SUCCESS;
}
/* Compile a single ray-tracing NIR shader and upload the kernel.
 *
 * Lowers shader calls (producing any resume shaders), lowers RT intrinsics,
 * compiles everything with brw_compile_bs(), and uploads the result as an
 * anv_shader_bin which is appended to pipeline->shaders.  On success the
 * bin is also returned through shader_out.
 *
 * All transient allocations are parented to mem_ctx; the caller owns it.
 */
static VkResult
compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
nir_shader *nir,
struct anv_pipeline_stage *stage,
struct anv_shader_bin **shader_out,
void *mem_ctx)
{
const struct brw_compiler *compiler =
pipeline->base.device->physical->compiler;
const struct intel_device_info *devinfo = compiler->devinfo;
/* Use a timestamp as the key to ensure we never get a cache hit */
int64_t time_ns = os_time_get_nano() ^ (uintptr_t)nir;
uint8_t key[20];
_mesa_sha1_compute(&time_ns, sizeof(time_ns), key);
/* Shader-call lowering splits the shader at trace/call sites; each piece
 * after a call becomes a separate "resume" shader compiled alongside the
 * main one below.
 */
nir_shader **resume_shaders = NULL;
uint32_t num_resume_shaders = 0;
if (nir->info.stage != MESA_SHADER_COMPUTE) {
NIR_PASS_V(nir, brw_nir_lower_shader_calls,
&resume_shaders, &num_resume_shaders, mem_ctx);
}
/* RT intrinsic lowering must run on the main shader and every resume
 * shader produced above.
 */
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
for (unsigned i = 0; i < num_resume_shaders; i++)
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo);
struct brw_bs_prog_data prog_data = { };
struct brw_compile_stats stats = { };
const unsigned *code =
brw_compile_bs(compiler, pipeline->base.device, mem_ctx,
&stage->key.bs, &prog_data, nir,
num_resume_shaders, resume_shaders, &stats, NULL);
if (code == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
/* Ray-tracing shaders don't have a "real" bind map */
struct anv_pipeline_bind_map empty_bind_map = {};
const unsigned code_size = prog_data.base.program_size;
/* Note: prog_data pointer is to .base but the size is of the full
 * brw_bs_prog_data so the whole thing gets copied into the bin.
 */
struct anv_shader_bin *bin =
anv_device_upload_kernel(pipeline->base.device,
NULL, /* TODO: Caching is disabled for now */
stage->stage,
key, sizeof(key),
code, code_size,
&prog_data.base, sizeof(prog_data),
&stats, 1,
NULL, &empty_bind_map);
if (bin == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
/* TODO: Figure out executables for resume shaders */
anv_pipeline_add_executables(&pipeline->base, stage, bin);
/* The pipeline takes ownership of this reference; it is unref'd in the
 * pipeline's destroy/fail paths.
 */
util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, bin);
*shader_out = bin;
return VK_SUCCESS;
}
/* Compile all stages and groups of a ray-tracing pipeline.
 *
 * Compilation happens in two steps: first each non-intersection stage is
 * lowered and compiled on its own; then, per group, intersection shaders
 * are combined with the group's any-hit shader (if any) and compiled, since
 * an intersection shader may only ever run with the any-hit shader from its
 * own group.
 *
 * Returns VK_SUCCESS or an error; on error, pipeline_ctx is freed but any
 * shader bins already appended to pipeline->shaders are left for the
 * caller's clean-up path to unref.
 */
static VkResult
anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
                                 struct anv_pipeline_cache *cache,
                                 const VkRayTracingPipelineCreateInfoKHR *info)
{
   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   VkResult result;

   VkPipelineCreationFeedbackEXT pipeline_feedback = {
      .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
   };
   int64_t pipeline_start = os_time_get_nano();

   ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);

   void *pipeline_ctx = ralloc_context(NULL);

   /* rzalloc_array so that any skipped (NULL-module) entries stay zeroed;
    * the loops below detect them via stages[i].entrypoint == NULL.
    */
   struct anv_pipeline_stage *stages =
      rzalloc_array(pipeline_ctx, struct anv_pipeline_stage, info->stageCount);

   /* Step 0: fetch NIR and run the common lowering for every stage. */
   for (uint32_t i = 0; i < info->stageCount; i++) {
      const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i];
      if (sinfo->module == VK_NULL_HANDLE)
         continue;

      int64_t stage_start = os_time_get_nano();

      stages[i] = (struct anv_pipeline_stage) {
         .stage = vk_to_mesa_shader_stage(sinfo->stage),
         .module = vk_shader_module_from_handle(sinfo->module),
         .entrypoint = sinfo->pName,
         .spec_info = sinfo->pSpecializationInfo,
         .cache_key = {
            .stage = vk_to_mesa_shader_stage(sinfo->stage),
         },
         .feedback = {
            .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
         },
      };

      anv_pipeline_hash_shader(stages[i].module,
                               stages[i].entrypoint,
                               stages[i].stage,
                               stages[i].spec_info,
                               stages[i].shader_sha1);

      populate_bs_prog_key(&pipeline->base.device->info, sinfo->flags,
                           pipeline->base.device->robust_buffer_access,
                           &stages[i].key.bs);

      stages[i].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache,
                                                 pipeline_ctx, &stages[i]);
      if (stages[i].nir == NULL) {
         ralloc_free(pipeline_ctx);
         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      anv_pipeline_lower_nir(&pipeline->base, pipeline_ctx, &stages[i], layout);

      stages[i].feedback.duration += os_time_get_nano() - stage_start;
      if (false /* cache_hit */) {
         stages[i].feedback.flags |=
            VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
      }
   }

   /* Step 1: compile every stage that stands on its own. */
   for (uint32_t i = 0; i < info->stageCount; i++) {
      if (!stages[i].entrypoint)
         continue;

      /* We handle intersection shaders as part of the group */
      if (stages[i].stage == MESA_SHADER_INTERSECTION)
         continue;

      int64_t stage_start = os_time_get_nano();

      void *stage_ctx = ralloc_context(pipeline_ctx);

      /* Clone so the pristine NIR in pipeline_ctx stays available for other
       * groups (e.g. an any-hit shader shared with an intersection shader).
       */
      nir_shader *nir = nir_shader_clone(stage_ctx, stages[i].nir);
      switch (stages[i].stage) {
      case MESA_SHADER_RAYGEN:
         brw_nir_lower_raygen(nir);
         break;

      case MESA_SHADER_ANY_HIT:
         brw_nir_lower_any_hit(nir, devinfo);
         break;

      case MESA_SHADER_CLOSEST_HIT:
         brw_nir_lower_closest_hit(nir);
         break;

      case MESA_SHADER_MISS:
         brw_nir_lower_miss(nir);
         break;

      case MESA_SHADER_INTERSECTION:
         unreachable("These are handled later");

      case MESA_SHADER_CALLABLE:
         brw_nir_lower_callable(nir);
         break;

      default:
         unreachable("Invalid ray-tracing shader stage");
      }

      result = compile_upload_rt_shader(pipeline, nir, &stages[i],
                                        &stages[i].bin, stage_ctx);
      if (result != VK_SUCCESS) {
         ralloc_free(pipeline_ctx);
         return result;
      }

      ralloc_free(stage_ctx);

      stages[i].feedback.duration += os_time_get_nano() - stage_start;
   }

   /* Step 2: resolve groups, compiling intersection+any-hit combos. */
   for (uint32_t i = 0; i < info->groupCount; i++) {
      const VkRayTracingShaderGroupCreateInfoKHR *ginfo = &info->pGroups[i];
      struct anv_rt_shader_group *group = &pipeline->groups[i];
      group->type = ginfo->type;
      switch (ginfo->type) {
      case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
         assert(ginfo->generalShader < info->stageCount);
         group->general = stages[ginfo->generalShader].bin;
         break;

      case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
         /* Unused shader slots are VK_SHADER_UNUSED_KHR (~0u), which fails
          * the bounds check and leaves the bin NULL.
          */
         if (ginfo->anyHitShader < info->stageCount)
            group->any_hit = stages[ginfo->anyHitShader].bin;
         if (ginfo->closestHitShader < info->stageCount)
            group->closest_hit = stages[ginfo->closestHitShader].bin;
         break;

      case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
         if (ginfo->closestHitShader < info->stageCount)
            group->closest_hit = stages[ginfo->closestHitShader].bin;

         /* The any-hit and intersection shader have to be combined */
         void *group_ctx = ralloc_context(pipeline_ctx);

         uint32_t intersection_idx = info->pGroups[i].intersectionShader;
         assert(intersection_idx < info->stageCount);
         nir_shader *intersection =
            nir_shader_clone(group_ctx, stages[intersection_idx].nir);

         uint32_t any_hit_idx = info->pGroups[i].anyHitShader;
         const nir_shader *any_hit = NULL;
         if (any_hit_idx < info->stageCount)
            any_hit = stages[any_hit_idx].nir;

         brw_nir_lower_combined_intersection_any_hit(intersection, any_hit,
                                                     devinfo);

         result = compile_upload_rt_shader(pipeline, intersection,
                                           &stages[intersection_idx],
                                           &group->intersection,
                                           group_ctx);
         if (result != VK_SUCCESS) {
            ralloc_free(pipeline_ctx);
            return result;
         }

         ralloc_free(group_ctx);
         break;
      }

      default:
         unreachable("Invalid ray tracing shader group type");
      }
   }

   ralloc_free(pipeline_ctx);

   if (false /* cache_hit */) {
      pipeline_feedback.flags |=
         VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
   }
   pipeline_feedback.duration = os_time_get_nano() - pipeline_start;

   const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
      vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
   if (create_feedback) {
      *create_feedback->pPipelineCreationFeedback = pipeline_feedback;

      assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount);
      for (uint32_t i = 0; i < info->stageCount; i++) {
         /* stages[] is indexed by create-info stage index, not by
          * gl_shader_stage; indexing with the enum value would read out of
          * bounds of this stageCount-sized array.
          */
         create_feedback->pPipelineStageCreationFeedbacks[i] =
            stages[i].feedback;
      }
   }

   return VK_SUCCESS;
}
/* Initialize a ray-tracing pipeline: zero the group table, compile all
 * stages/groups, and set up the L3 config.  On compile failure, every
 * shader bin already uploaded is unref'd before returning the error.
 */
VkResult
anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline,
struct anv_device *device,
@ -2411,7 +2686,28 @@ anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline,
const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
const VkAllocationCallbacks *alloc)
{
VkResult result;
/* Zero things out so our clean-up works */
memset(pipeline->groups, 0,
pipeline->group_count * sizeof(*pipeline->groups));
/* Owned references to compiled shader bins; compile_upload_rt_shader()
 * appends to this array as it goes.
 */
util_dynarray_init(&pipeline->shaders, pipeline->base.mem_ctx);
result = anv_pipeline_compile_ray_tracing(pipeline, cache, pCreateInfo);
if (result != VK_SUCCESS)
goto fail;
/* Ray-tracing shaders never use shared local memory. */
anv_pipeline_setup_l3_config(&pipeline->base, /* needs_slm */ false);
return VK_SUCCESS;
fail:
/* Drop the references taken for every successfully-uploaded shader. */
util_dynarray_foreach(&pipeline->shaders,
struct anv_shader_bin *, shader) {
anv_shader_bin_unref(device, *shader);
}
return result;
}
#define WRITE_STR(field, ...) ({ \

View File

@ -82,18 +82,38 @@ anv_shader_bin_create(struct anv_device *device,
shader->kernel.offset +
prog_data_in->const_data_offset;
struct brw_shader_reloc_value reloc_values[] = {
{
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
.value = shader_data_addr,
},
{
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
.value = shader_data_addr >> 32,
},
int rv_count = 0;
struct brw_shader_reloc_value reloc_values[5];
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
.value = shader_data_addr,
};
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
.value = shader_data_addr >> 32,
};
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_SHADER_START_OFFSET,
.value = shader->kernel.offset,
};
if (brw_shader_stage_is_bindless(stage)) {
const struct brw_bs_prog_data *bs_prog_data =
brw_bs_prog_data_const(prog_data_in);
uint64_t resume_sbt_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS +
shader->kernel.offset +
bs_prog_data->resume_sbt_offset;
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW,
.value = resume_sbt_addr,
};
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH,
.value = resume_sbt_addr >> 32,
};
}
brw_write_shader_relocs(&device->info, shader->kernel.map, prog_data_in,
reloc_values, ARRAY_SIZE(reloc_values));
reloc_values, rv_count);
memcpy(prog_data, prog_data_in, prog_data_size);
typed_memcpy(prog_data_relocs, prog_data_in->relocs,