intel,anv,iris,crocus: Drop subgroup size from the shader key

Use nir->info.subgroup_size instead.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17337>
Jason Ekstrand, 2022-07-07 14:39:19 -05:00, committed by Marge Bot
parent e9b2862c1a, commit 530de844ef
10 changed files with 58 additions and 153 deletions
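
In short: instead of threading an enum brw_subgroup_size_type through every
brw_*_prog_key, the backend now reads the subgroup-size policy the shader
already carries in nir->info.subgroup_size. A minimal sketch of what that
means at a driver-side call site (illustrative only; setup_cs_key is a
hypothetical helper, not code from this commit):

   /* Illustrative helper: the policy moves off the key, onto the shader. */
   void
   setup_cs_key(nir_shader *nir, struct brw_cs_prog_key *key)
   {
      memset(key, 0, sizeof(*key));
      /* Before: key->base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM; */
      nir->info.subgroup_size = SUBGROUP_SIZE_UNIFORM;
   }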

View File

@@ -49,7 +49,6 @@
 #include "nir/tgsi_to_nir.h"
 
 #define KEY_INIT_NO_ID() \
-   .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
    .base.tex.swizzles[0 ... BRW_MAX_SAMPLERS - 1] = 0x688, \
    .base.tex.compressed_multisample_layout_mask = ~0
 #define KEY_INIT() \

View File

@@ -54,7 +54,6 @@
 #define BRW_KEY_INIT(gen, prog_id, limit_trig_input) \
    .base.program_string_id = prog_id, \
    .base.limit_trig_input_range = limit_trig_input, \
-   .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
    .base.tex.swizzles[0 ... BRW_MAX_SAMPLERS - 1] = 0x688, \
    .base.tex.compressed_multisample_layout_mask = ~0, \
    .base.tex.msaa_16 = (gen >= 9 ? ~0 : 0)
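
Note that the two GL drivers only lose the key field in this view; the code
that tags their shaders with the uniform policy is not among the hunks shown.
Presumably they (or a shared NIR setup path) now do the equivalent of the
following line, which is an assumption, not a quote from this commit:

   nir->info.subgroup_size = SUBGROUP_SIZE_UNIFORM;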

View File

@@ -230,26 +230,9 @@ struct brw_sampler_prog_key_data {
    float scale_factors[BRW_MAX_SAMPLERS];
 };
 
-/** An enum representing what kind of input gl_SubgroupSize is. */
-enum PACKED brw_subgroup_size_type
-{
-   BRW_SUBGROUP_SIZE_API_CONSTANT, /**< Default Vulkan behavior */
-   BRW_SUBGROUP_SIZE_UNIFORM,      /**< OpenGL behavior */
-   BRW_SUBGROUP_SIZE_VARYING,      /**< VK_EXT_subgroup_size_control */
-
-   /* These enums are specifically chosen so that the value of the enum is
-    * also the subgroup size.  If any new values are added, they must respect
-    * this invariant.
-    */
-   BRW_SUBGROUP_SIZE_REQUIRE_8  = 8,  /**< VK_EXT_subgroup_size_control */
-   BRW_SUBGROUP_SIZE_REQUIRE_16 = 16, /**< VK_EXT_subgroup_size_control */
-   BRW_SUBGROUP_SIZE_REQUIRE_32 = 32, /**< VK_EXT_subgroup_size_control */
-};
-
 struct brw_base_prog_key {
    unsigned program_string_id;
 
-   enum brw_subgroup_size_type subgroup_size_type;
-
    bool robust_buffer_access;
 
    /**

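For reference, the replacement type already lives on shader_info: enum
gl_subgroup_size in src/compiler/shader_enums.h. Paraphrased here rather
than quoted from this diff (the exact numeric values of the first four
enumerants are an assumption), it preserves the same invariant that a
REQUIRE_* enumerant's value is the size it requires, which is what keeps
the switch in brw_nir.c and the >= comparison in brw_simd_selection.c
below working:

   enum gl_subgroup_size {
      SUBGROUP_SIZE_API_CONSTANT   = 0,
      SUBGROUP_SIZE_UNIFORM        = 1,
      SUBGROUP_SIZE_VARYING        = 2,
      SUBGROUP_SIZE_FULL_SUBGROUPS = 3,
      SUBGROUP_SIZE_REQUIRE_8      = 8,
      SUBGROUP_SIZE_REQUIRE_16     = 16,
      SUBGROUP_SIZE_REQUIRE_32     = 32,
      SUBGROUP_SIZE_REQUIRE_64     = 64,
      SUBGROUP_SIZE_REQUIRE_128    = 128,
   };
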
View File

@@ -7759,7 +7759,7 @@ brw_compile_cs(const struct brw_compiler *compiler,
    }
 
    const unsigned required_dispatch_width =
-      brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
+      brw_required_dispatch_width(&nir->info);
 
    fs_visitor *v[3] = {0};
    const char *error[3] = {0};

View File

@@ -439,9 +439,7 @@ brw_kernel_from_spirv(struct brw_compiler *compiler,
    NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
    NIR_PASS_V(nir, lower_kernel_intrinsics);
 
-   struct brw_cs_prog_key key = {
-      .base.subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING,
-   };
+   struct brw_cs_prog_key key = { };
 
    memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
    kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);

View File

@@ -217,7 +217,7 @@ brw_compile_task(const struct brw_compiler *compiler,
    NIR_PASS_V(nir, brw_nir_lower_tue_outputs, &prog_data->map);
 
    const unsigned required_dispatch_width =
-      brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
+      brw_required_dispatch_width(&nir->info);
 
    fs_visitor *v[3] = {0};
    const char *error[3] = {0};
@@ -715,7 +715,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
    NIR_PASS_V(nir, brw_nir_lower_mue_outputs, &prog_data->map);
 
    const unsigned required_dispatch_width =
-      brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
+      brw_required_dispatch_width(&nir->info);
 
    fs_visitor *v[3] = {0};
    const char *error[3] = {0};

View File

@@ -1353,16 +1353,14 @@ brw_nir_apply_sampler_key(nir_shader *nir,
 }
 
 static unsigned
-get_subgroup_size(gl_shader_stage stage,
-                  const struct brw_base_prog_key *key,
-                  unsigned max_subgroup_size)
+get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
 {
-   switch (key->subgroup_size_type) {
-   case BRW_SUBGROUP_SIZE_API_CONSTANT:
+   switch (info->subgroup_size) {
+   case SUBGROUP_SIZE_API_CONSTANT:
       /* We have to use the global constant size. */
       return BRW_SUBGROUP_SIZE;
 
-   case BRW_SUBGROUP_SIZE_UNIFORM:
+   case SUBGROUP_SIZE_UNIFORM:
       /* It has to be uniform across all invocations but can vary per stage
        * if we want.  This gives us a bit more freedom.
        *
@@ -1373,7 +1371,7 @@ get_subgroup_size(gl_shader_stage stage,
        */
       return max_subgroup_size;
 
-   case BRW_SUBGROUP_SIZE_VARYING:
+   case SUBGROUP_SIZE_VARYING:
       /* The subgroup size is allowed to be fully varying.  For geometry
        * stages, we know it's always 8 which is max_subgroup_size so we can
        * return that.  For compute, brw_nir_apply_key is called once per
@@ -1384,16 +1382,21 @@ get_subgroup_size(gl_shader_stage stage,
        * that's a risk the client took when it asked for a varying subgroup
        * size.
        */
-      return stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
+      return info->stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
 
-   case BRW_SUBGROUP_SIZE_REQUIRE_8:
-   case BRW_SUBGROUP_SIZE_REQUIRE_16:
-   case BRW_SUBGROUP_SIZE_REQUIRE_32:
-      assert(gl_shader_stage_uses_workgroup(stage));
+   case SUBGROUP_SIZE_REQUIRE_8:
+   case SUBGROUP_SIZE_REQUIRE_16:
+   case SUBGROUP_SIZE_REQUIRE_32:
+      assert(gl_shader_stage_uses_workgroup(info->stage));
       /* These enum values are expressly chosen to be equal to the subgroup
        * size that they require.
        */
-      return key->subgroup_size_type;
+      return info->subgroup_size;
+
+   case SUBGROUP_SIZE_FULL_SUBGROUPS:
+   case SUBGROUP_SIZE_REQUIRE_64:
+   case SUBGROUP_SIZE_REQUIRE_128:
+      break;
    }
 
    unreachable("Invalid subgroup size type");
@@ -1411,8 +1414,7 @@ brw_nir_apply_key(nir_shader *nir,
    OPT(brw_nir_apply_sampler_key, compiler, &key->tex);
 
    const nir_lower_subgroups_options subgroups_options = {
-      .subgroup_size = get_subgroup_size(nir->info.stage, key,
-                                         max_subgroup_size),
+      .subgroup_size = get_subgroup_size(&nir->info, max_subgroup_size),
       .ballot_bit_size = 32,
       .ballot_components = 1,
       .lower_subgroup_masks = true,

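As a worked example of the rewritten switch (hypothetical values, not lines
from the diff): a compute shader created with a required subgroup size of 16
arrives here with nir->info.subgroup_size == SUBGROUP_SIZE_REQUIRE_16, whose
numeric value is 16 by the invariant noted above, so:

   /* Hypothetical trace; get_subgroup_size() is static to brw_nir.c. */
   nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_16;
   assert(get_subgroup_size(&nir->info, max_subgroup_size) == 16);
   /* nir_lower_subgroups can then fold gl_SubgroupSize to a constant 16. */
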
View File

@@ -31,8 +31,7 @@
 extern "C" {
 #endif
 
-unsigned brw_required_dispatch_width(const struct shader_info *info,
-                                     enum brw_subgroup_size_type subgroup_size_type);
+unsigned brw_required_dispatch_width(const struct shader_info *info);
 
 bool brw_simd_should_compile(void *mem_ctx,
                              unsigned simd,

View File

@@ -28,26 +28,17 @@
 #include "util/ralloc.h"
 
 unsigned
-brw_required_dispatch_width(const struct shader_info *info,
-                            enum brw_subgroup_size_type subgroup_size_type)
+brw_required_dispatch_width(const struct shader_info *info)
 {
-   unsigned required = 0;
-
-   if ((int)subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
+   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
       assert(gl_shader_stage_uses_workgroup(info->stage));
       /* These enum values are expressly chosen to be equal to the subgroup
        * size that they require.
        */
-      required = (unsigned)subgroup_size_type;
+      return (unsigned)info->subgroup_size;
+   } else {
+      return 0;
    }
-
-   if (gl_shader_stage_is_compute(info->stage) &&
-       info->subgroup_size >= SUBGROUP_SIZE_REQUIRE_8) {
-      assert(required == 0 || required == info->subgroup_size);
-      required = info->subgroup_size;
-   }
-
-   return required;
 }
 
 static inline bool

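The simplified helper now answers "which SIMD width, if any, is mandatory"
from shader_info alone. A sketch of the caller pattern visible in brw_fs.cpp
and brw_mesh.cpp above (paraphrased, not a verbatim excerpt):

   const unsigned required_dispatch_width =
      brw_required_dispatch_width(&nir->info);

   for (unsigned simd = 0; simd < 3; simd++) {
      const unsigned width = 8u << simd;   /* SIMD8, SIMD16, SIMD32 */
      /* A non-zero requirement pins compilation to exactly that width. */
      if (required_dispatch_width && width != required_dispatch_width)
         continue;
      /* ...try compiling the shader at this width... */
   }
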
View File

@@ -315,11 +315,9 @@ populate_sampler_prog_key(const struct intel_device_info *devinfo,
 static void
 populate_base_prog_key(const struct anv_device *device,
-                       enum brw_subgroup_size_type subgroup_size_type,
                        bool robust_buffer_acccess,
                        struct brw_base_prog_key *key)
 {
-   key->subgroup_size_type = subgroup_size_type;
    key->robust_buffer_access = robust_buffer_acccess;
    key->limit_trig_input_range =
       device->physical->instance->limit_trig_input_range;
@@ -329,14 +327,12 @@ populate_base_prog_key(const struct anv_device *device,
 static void
 populate_vs_prog_key(const struct anv_device *device,
-                     enum brw_subgroup_size_type subgroup_size_type,
                      bool robust_buffer_acccess,
                      struct brw_vs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 
    /* XXX: Handle vertex input work-arounds */
@@ -345,41 +341,35 @@ populate_vs_prog_key(const struct anv_device *device,
 static void
 populate_tcs_prog_key(const struct anv_device *device,
-                      enum brw_subgroup_size_type subgroup_size_type,
                       bool robust_buffer_acccess,
                       unsigned input_vertices,
                       struct brw_tcs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 
    key->input_vertices = input_vertices;
 }
 
 static void
 populate_tes_prog_key(const struct anv_device *device,
-                      enum brw_subgroup_size_type subgroup_size_type,
                       bool robust_buffer_acccess,
                       struct brw_tes_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 }
 
 static void
 populate_gs_prog_key(const struct anv_device *device,
-                     enum brw_subgroup_size_type subgroup_size_type,
                      bool robust_buffer_acccess,
                      struct brw_gs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 }
 
 static bool
@@ -439,29 +429,26 @@ pipeline_has_coarse_pixel(const struct anv_graphics_pipeline *pipeline,
 static void
 populate_task_prog_key(const struct anv_device *device,
-                       enum brw_subgroup_size_type subgroup_size_type,
                        bool robust_buffer_access,
                        struct brw_task_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, subgroup_size_type, robust_buffer_access, &key->base);
+   populate_base_prog_key(device, robust_buffer_access, &key->base);
 }
 
 static void
 populate_mesh_prog_key(const struct anv_device *device,
-                       enum brw_subgroup_size_type subgroup_size_type,
                        bool robust_buffer_access,
                        struct brw_mesh_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, subgroup_size_type, robust_buffer_access, &key->base);
+   populate_base_prog_key(device, robust_buffer_access, &key->base);
 }
 
 static void
 populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
-                     VkPipelineShaderStageCreateFlags flags,
                      bool robust_buffer_acccess,
                      const VkPipelineMultisampleStateCreateInfo *ms_info,
                      const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_info,
@@ -472,7 +459,7 @@ populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, flags, robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 
    /* We set this to 0 here and set to the actual value before we call
     * brw_compile_fs.
@@ -520,25 +507,22 @@ populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
 static void
 populate_cs_prog_key(const struct anv_device *device,
-                     enum brw_subgroup_size_type subgroup_size_type,
                      bool robust_buffer_acccess,
                      struct brw_cs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 }
 
 static void
 populate_bs_prog_key(const struct anv_device *device,
-                     VkPipelineShaderStageCreateFlags flags,
                      bool robust_buffer_access,
                      struct brw_bs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));
 
-   populate_base_prog_key(device, flags, robust_buffer_access, &key->base);
+   populate_base_prog_key(device, robust_buffer_access, &key->base);
 }
 
 struct anv_pipeline_stage {
@@ -1323,45 +1307,6 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline,
    pipeline->ray_queries = MAX2(pipeline->ray_queries, bin->prog_data->ray_queries);
 }
 
-static enum brw_subgroup_size_type
-anv_subgroup_size_type(gl_shader_stage stage,
-                       const struct vk_shader_module *module,
-                       VkPipelineShaderStageCreateFlags flags,
-                       const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info)
-{
-   enum brw_subgroup_size_type subgroup_size_type;
-
-   const bool allow_varying =
-      flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT ||
-      vk_shader_module_spirv_version(module) >= 0x10600;
-
-   if (rss_info) {
-      assert(gl_shader_stage_uses_workgroup(stage));
-      /* These enum values are expressly chosen to be equal to the subgroup
-       * size that they require.
-       */
-      assert(rss_info->requiredSubgroupSize == 8 ||
-             rss_info->requiredSubgroupSize == 16 ||
-             rss_info->requiredSubgroupSize == 32);
-      subgroup_size_type = rss_info->requiredSubgroupSize;
-   } else if (allow_varying) {
-      subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING;
-   } else if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
-      assert(stage == MESA_SHADER_COMPUTE);
-      /* If the client expressly requests full subgroups and they don't
-       * specify a subgroup size neither allow varying subgroups, we need to
-       * pick one.  So we specify the API value of 32.  Performance will
-       * likely be terrible in this case but there's nothing we can do about
-       * that.  The client should have chosen a size.
-       */
-      subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_32;
-   } else {
-      subgroup_size_type = BRW_SUBGROUP_SIZE_API_CONSTANT;
-   }
-
-   return subgroup_size_type;
-}
-
 static void
 anv_pipeline_init_from_cached_graphics(struct anv_graphics_pipeline *pipeline)
 {
@@ -1404,7 +1349,6 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
    VkResult result;
 
    for (uint32_t i = 0; i < info->stageCount; i++) {
       const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i];
-      VK_FROM_HANDLE(vk_shader_module, module, sinfo->module);
       gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
 
       int64_t stage_start = os_time_get_nano();
@@ -1413,33 +1357,26 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
       stages[stage].info = sinfo;
 
       vk_pipeline_hash_shader_stage(&info->pStages[i], stages[stage].shader_sha1);
 
-      const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info =
-         vk_find_struct_const(sinfo->pNext,
-                              PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
-
-      enum brw_subgroup_size_type subgroup_size_type =
-         anv_subgroup_size_type(stage, module, sinfo->flags, rss_info);
-
       const struct anv_device *device = pipeline->base.device;
       switch (stage) {
       case MESA_SHADER_VERTEX:
-         populate_vs_prog_key(device, subgroup_size_type,
+         populate_vs_prog_key(device,
                               pipeline->base.device->robust_buffer_access,
                               &stages[stage].key.vs);
          break;
       case MESA_SHADER_TESS_CTRL:
-         populate_tcs_prog_key(device, subgroup_size_type,
+         populate_tcs_prog_key(device,
                                pipeline->base.device->robust_buffer_access,
                                info->pTessellationState->patchControlPoints,
                                &stages[stage].key.tcs);
          break;
       case MESA_SHADER_TESS_EVAL:
-         populate_tes_prog_key(device, subgroup_size_type,
+         populate_tes_prog_key(device,
                                pipeline->base.device->robust_buffer_access,
                                &stages[stage].key.tes);
          break;
       case MESA_SHADER_GEOMETRY:
-         populate_gs_prog_key(device, subgroup_size_type,
+         populate_gs_prog_key(device,
                               pipeline->base.device->robust_buffer_access,
                               &stages[stage].key.gs);
          break;
@@ -1447,7 +1384,7 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
          const bool raster_enabled =
             !info->pRasterizationState->rasterizerDiscardEnable ||
             dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
-         populate_wm_prog_key(pipeline, subgroup_size_type,
+         populate_wm_prog_key(pipeline,
                               pipeline->base.device->robust_buffer_access,
                               raster_enabled ? info->pMultisampleState : NULL,
                               vk_find_struct_const(info->pNext,
@@ -1457,12 +1394,12 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
          break;
       }
       case MESA_SHADER_TASK:
-         populate_task_prog_key(device, subgroup_size_type,
+         populate_task_prog_key(device,
                                 pipeline->base.device->robust_buffer_access,
                                 &stages[stage].key.task);
          break;
       case MESA_SHADER_MESH:
-         populate_mesh_prog_key(device, subgroup_size_type,
+         populate_mesh_prog_key(device,
                                 pipeline->base.device->robust_buffer_access,
                                 &stages[stage].key.mesh);
          break;
@@ -1849,7 +1786,6 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
                         const VkComputePipelineCreateInfo *info)
 {
    const VkPipelineShaderStageCreateInfo *sinfo = &info->stage;
-   VK_FROM_HANDLE(vk_shader_module, module, sinfo->module);
    assert(sinfo->stage == VK_SHADER_STAGE_COMPUTE_BIT);
 
    VkPipelineCreationFeedbackEXT pipeline_feedback = {
@@ -1874,16 +1810,7 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
    struct anv_shader_bin *bin = NULL;
 
-   const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *rss_info =
-      vk_find_struct_const(info->stage.pNext,
-                           PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
-
-   const enum brw_subgroup_size_type subgroup_size_type =
-      anv_subgroup_size_type(MESA_SHADER_COMPUTE, module, info->stage.flags, rss_info);
-
-   populate_cs_prog_key(device, subgroup_size_type,
-                        device->robust_buffer_access,
-                        &stage.key.cs);
+   populate_cs_prog_key(device, device->robust_buffer_access, &stage.key.cs);
 
    ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
@@ -1939,10 +1866,19 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
     */
    if (device->physical->instance->assume_full_subgroups &&
        stage.nir->info.cs.uses_wide_subgroup_intrinsics &&
-       subgroup_size_type == BRW_SUBGROUP_SIZE_API_CONSTANT &&
+       stage.nir->info.subgroup_size == SUBGROUP_SIZE_API_CONSTANT &&
        local_size &&
        local_size % BRW_SUBGROUP_SIZE == 0)
-      stage.key.base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_32;
+      stage.nir->info.subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
+
+   /* If the client requests that we dispatch full subgroups but doesn't
+    * allow us to pick a subgroup size, we have to smash it to the API
+    * value of 32.  Performance will likely be terrible in this case but
+    * there's nothing we can do about that.  The client should have chosen
+    * a size.
+    */
+   if (stage.nir->info.subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS)
+      stage.nir->info.subgroup_size = BRW_SUBGROUP_SIZE;
 
    stage.num_stats = 1;
@@ -2693,7 +2629,7 @@ anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
       },
    };
 
-   populate_bs_prog_key(pipeline->base.device, sinfo->flags,
+   populate_bs_prog_key(pipeline->base.device,
                         pipeline->base.device->robust_buffer_access,
                         &stages[i].key.bs);
@@ -2997,10 +2933,6 @@ anv_device_init_rt_shaders(struct anv_device *device)
       struct brw_cs_prog_key key;
    } trampoline_key = {
       .name = "rt-trampoline",
-      .key = {
-         /* TODO: Other subgroup sizes? */
-         .base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_8,
-      },
    };
 
    device->rt_trampoline =
      anv_device_search_for_kernel(device, device->internal_cache,
@@ -3012,6 +2944,8 @@ anv_device_init_rt_shaders(struct anv_device *device)
    nir_shader *trampoline_nir =
       brw_nir_create_raygen_trampoline(device->physical->compiler, tmp_ctx);
 
+   trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_8;
+
    struct anv_pipeline_bind_map bind_map = {
       .surface_count = 0,
       .sampler_count = 0,
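
With anv_subgroup_size_type() deleted, the policy it encoded must be applied
wherever nir->info.subgroup_size is first populated (presumably the common
SPIR-V-to-NIR path from earlier in this merge request, which is not part of
this commit). A hedged re-expression of the deleted logic in shader_info
terms; set_subgroup_size and its parameters are hypothetical:

   static void
   set_subgroup_size(struct shader_info *info,
                     VkPipelineShaderStageCreateFlags flags,
                     const uint32_t *required_size, /* from pNext, if any */
                     bool spirv_1_6_or_later)
   {
      if (required_size) {
         assert(*required_size == 8 || *required_size == 16 ||
                *required_size == 32);
         info->subgroup_size = (enum gl_subgroup_size)*required_size;
      } else if ((flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ||
                 spirv_1_6_or_later) {
         info->subgroup_size = SUBGROUP_SIZE_VARYING;
      } else if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
         /* anv_pipeline_compile_cs() above later smashes FULL_SUBGROUPS to
          * the API value of 32 if nothing narrows it first. */
         info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
      } else {
         info->subgroup_size = SUBGROUP_SIZE_API_CONSTANT;
      }
   }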