radv: Support NGG culling with new perftest environment variable.

Currently we don't enable it on any chip by default, but we plan to enable it soon on GFX10.3 when we are comfortable with its performance. RADV_PERFTEST=nggc environment variable enables it on GFX10+ GPUs. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
2021-06-08 12:32:35 +02:00 · 2021-06-08 12:32:35 +02:00 · f30e4351de
parent 182d9b1e60
commit f30e4351de
8 changed files with 64 additions and 5 deletions
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@ -647,6 +647,8 @@ RADV driver environment variables
      disable optimizations that get enabled when all VRAM is CPU visible.
   ``pswave32``
      enable wave32 for pixel shaders (GFX10+)
+   ``nggc``
+      enable NGG culling on GFX10+ GPUs.
   ``rt``
      enable rt extensions whose implementation is still experimental.
   ``sam``
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@ -15,6 +15,7 @@ VK_EXT_multi_draw on ANV, lavapipe, and RADV
 VK_KHR_separate_depth_stencil_layouts on lavapipe
 VK_EXT_separate_stencil_usage on lavapipe
 VK_EXT_extended_dynamic_state2 on lavapipe
+NGG shader based primitive culling is now supported by RADV.
 Panfrost supports OpenGL ES 3.1
 New Asahi driver for the Apple M1
 GL_ARB_sample_locations on zink
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@ -74,6 +74,7 @@ enum {
   RADV_PERFTEST_NO_SAM = 1u << 6,
   RADV_PERFTEST_SAM = 1u << 7,
   RADV_PERFTEST_RT = 1u << 8,
+   RADV_PERFTEST_NGGC = 1u << 9,
 };

 bool radv_init_trace(struct radv_device *device);
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@ -830,6 +830,7 @@ static const struct debug_control radv_perftest_options[] = {{"localbos", RADV_P
                                                             {"nosam", RADV_PERFTEST_NO_SAM},
                                                             {"sam", RADV_PERFTEST_SAM},
                                                             {"rt", RADV_PERFTEST_RT},
+                                                             {"nggc", RADV_PERFTEST_NGGC},
                                                             {NULL, 0}};

 const char *
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@ -211,6 +211,8 @@ get_hash_flags(const struct radv_device *device, bool stats)

   if (device->instance->debug_flags & RADV_DEBUG_NO_NGG)
      hash_flags |= RADV_HASH_SHADER_NO_NGG;
+   if (device->instance->perftest_flags & RADV_PERFTEST_NGGC)
+      hash_flags |= RADV_HASH_SHADER_FORCE_NGG_CULLING;
   if (device->physical_device->cs_wave_size == 32)
      hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
   if (device->physical_device->ps_wave_size == 32)
@ -3451,8 +3453,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
         bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
         bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
                            !radv_use_llvm_for_stage(device, i);
-         if (lowered_ngg)
-            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i]);
+         if (lowered_ngg) {
+            uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
+            bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
+            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i], consider_culling);
+         }

         radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);

--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@ -1672,6 +1672,7 @@ struct radv_event {
 #define RADV_HASH_SHADER_FORCE_VRS_2x2     (1 << 9)
 #define RADV_HASH_SHADER_FORCE_VRS_2x1     (1 << 10)
 #define RADV_HASH_SHADER_FORCE_VRS_1x2     (1 << 11)
+#define RADV_HASH_SHADER_FORCE_NGG_CULLING (1 << 13)

 void radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **stages,
                       const struct radv_pipeline_layout *layout,
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -907,10 +907,44 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
   return false;
 }

+bool
+radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
+                      uint64_t ps_inputs_read)
+{
+   /* Culling doesn't make sense for meta shaders. */
+   if (!!nir->info.name)
+      return false;
+
+   /* TODO: enable by default on GFX10.3 when we're confident about performance. */
+   bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC;
+
+   if (!culling_enabled)
+      return false;
+
+   /* Shader based culling efficiency can depend on PS throughput.
+    * Estimate an upper limit for PS input param count based on GPU info.
+    */
+   unsigned max_ps_params;
+   unsigned max_render_backends = device->physical_device->rad_info.max_render_backends;
+   unsigned max_se = device->physical_device->rad_info.max_se;
+
+   if (max_render_backends < 2)
+      return false; /* Don't use NGG culling on 1 RB chips. */
+   else if (max_render_backends / max_se == 4)
+      max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */
+   else
+      max_ps_params = 4; /* Navi 1x. */
+
+   /* TODO: consider other heuristics here, such as PS execution time */
+
+   return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
+}
+
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                    struct radv_shader_info *info,
                    const struct radv_pipeline_key *pl_key,
-                    struct radv_shader_variant_key *key)
+                    struct radv_shader_variant_key *key,
+                    bool consider_culling)
 {
   /* TODO: support the LLVM backend with the NIR lowering */
   assert(!radv_use_llvm_for_stage(device, nir->info.stage));
@ -930,9 +964,19 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
         num_vertices_per_prim = 1;
      else if (nir->info.tess.primitive_mode == GL_ISOLINES)
         num_vertices_per_prim = 2;
+
+      /* Manually mark the primitive ID used, so the shader can repack it. */
+      if (key->vs_common_out.export_prim_id)
+         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
+
   } else if (nir->info.stage == MESA_SHADER_VERTEX) {
      /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
      num_vertices_per_prim = key->vs.outprim + 1;
+
+      /* Manually mark the instance ID used, so the shader can repack it. */
+      if (key->vs.instance_rate_inputs)
+         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      num_vertices_per_prim = nir->info.gs.vertices_in;
   } else {
@ -964,7 +1008,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
            num_vertices_per_prim,
            max_workgroup_size,
            info->wave_size,
-            false,
+            consider_culling,
            key->vs_common_out.as_ngg_passthrough,
            key->vs_common_out.export_prim_id,
            key->vs.provoking_vtx_last);
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@ -569,6 +569,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                    struct radv_shader_info *info,
                    const struct radv_pipeline_key *pl_key,
-                    struct radv_shader_variant_key *key);
+                    struct radv_shader_variant_key *key,
+                    bool consider_culling);
+
+bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
+                           uint64_t ps_inputs_read);

 #endif