ir3, tu: Add compiler flag for robust UBO behavior

This needs to live in the compiler because the compiler is the only
piece we always have access to in all the places ir3_optimize_loop()
is called, and the feature is only ever enabled for the whole Vulkan
device. Right now the flag is only used to constrain vectorization,
but the next commit adds another use.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7573>
Connor Abbott 2020-11-10 17:59:03 +01:00
parent 8f54028479
commit c68ea960a7
12 changed files with 37 additions and 18 deletions
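For context, a minimal sketch (not part of this commit) of how a Vulkan application opts into the robustBufferAccess2 behavior that the turnip change below picks up. The helper name create_robust_device and the caller-supplied queue_info are assumptions for illustration; extension support is assumed to have been queried beforehand.

#include <vulkan/vulkan.h>

/* Hypothetical helper: create a VkDevice with VK_EXT_robustness2's
 * robustBufferAccess2 feature enabled. tu_CreateDevice (see the diff below)
 * walks the pNext chain of VkDeviceCreateInfo, finds this struct, and
 * forwards the robustBufferAccess2 bit into ir3_compiler_create(). */
static VkResult
create_robust_device(VkPhysicalDevice physical_device,
                     const VkDeviceQueueCreateInfo *queue_info,
                     VkDevice *out_device)
{
   VkPhysicalDeviceRobustness2FeaturesEXT robustness2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT,
      .robustBufferAccess2 = VK_TRUE,
   };
   const char *exts[] = { VK_EXT_ROBUSTNESS_2_EXTENSION_NAME };

   VkDeviceCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
      .pNext = &robustness2,
      .queueCreateInfoCount = 1,
      .pQueueCreateInfos = queue_info,
      .enabledExtensionCount = 1,
      .ppEnabledExtensionNames = exts,
   };

   return vkCreateDevice(physical_device, &info, NULL, out_device);
}

Because robustBufferAccess2 is requested per device, the flag is known at device creation time and can simply be baked into the ir3_compiler instance, which is then available everywhere ir3_optimize_loop() runs.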

View File

@@ -490,7 +490,7 @@ a6xx_init(struct fd_device *dev, uint32_t gpu_id)
 .read_perfcntrs = a6xx_read_perfcntrs,
 };
-a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id);
+a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id, false);
 a6xx_backend->dev = dev;
 a6xx_backend->control_mem = fd_bo_new(dev, 0x1000,

View File

@@ -63,7 +63,7 @@ ir3_compiler_destroy(struct ir3_compiler *compiler)
 }
 struct ir3_compiler *
-ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
+ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_access)
 {
 struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
@@ -77,6 +77,7 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 compiler->dev = dev;
 compiler->gpu_id = gpu_id;
+compiler->robust_ubo_access = robust_ubo_access;
 compiler->set = ir3_ra_alloc_reg_set(compiler, false);
 /* All known GPU's have 32k local memory (aka shared) */

View File

@@ -44,6 +44,11 @@ struct ir3_compiler {
 struct disk_cache *disk_cache;
+/* If true, UBO accesses are assumed to be bounds-checked as defined by
+ * VK_EXT_robustness2 and optimizations may have to be more conservative.
+ */
+bool robust_ubo_access;
 /*
  * Configuration options for things that are handled differently on
  * different generations:
@@ -153,7 +158,8 @@ struct ir3_compiler {
 };
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
-struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
+bool robust_ubo_access);
 void ir3_disk_cache_init(struct ir3_compiler *compiler);
 void ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
@@ -190,6 +196,9 @@ enum ir3_shader_debug {
 /* DEBUG-only options: */
 IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
 IR3_DBG_RAMSGS = BITFIELD_BIT(21),
+/* Only used for the disk-caching logic: */
+IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
 };
 extern enum ir3_shader_debug ir3_shader_debug;

View File

@@ -67,7 +67,9 @@ ir3_disk_cache_init(struct ir3_compiler *compiler)
 char timestamp[41];
 _mesa_sha1_format(timestamp, id_sha1);
-const uint64_t driver_flags = ir3_shader_debug;
+uint64_t driver_flags = ir3_shader_debug;
+if (compiler->robust_ubo_access)
+driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
 compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
 }

View File

@@ -190,7 +190,7 @@ ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 void
-ir3_optimize_loop(nir_shader *s)
+ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
 {
 bool progress;
 unsigned lower_flrp =
@@ -227,7 +227,7 @@ ir3_optimize_loop(nir_shader *s)
 nir_load_store_vectorize_options vectorize_opts = {
 .modes = nir_var_mem_ubo,
 .callback = ir3_nir_should_vectorize_mem,
-.robust_modes = 0,
+.robust_modes = compiler->robust_ubo_access ? nir_var_mem_ubo : 0,
 };
 progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
@@ -315,7 +315,7 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
 if (compiler->gpu_id < 500)
 OPT_V(s, ir3_nir_lower_tg4_to_tex);
-ir3_optimize_loop(s);
+ir3_optimize_loop(compiler, s);
 /* do idiv lowering after first opt loop to get a chance to propagate
  * constants for divide by immed power-of-two:
@@ -327,7 +327,7 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
 const bool idiv_progress = OPT(s, nir_lower_idiv, &idiv_options);
 if (idiv_progress)
-ir3_optimize_loop(s);
+ir3_optimize_loop(compiler, s);
 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
@@ -375,7 +375,7 @@ ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s)
  */
 OPT_V(s, ir3_nir_apply_trig_workarounds);
-ir3_optimize_loop(s);
+ir3_optimize_loop(compiler, s);
 }
 static bool
@@ -523,14 +523,14 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
 OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);
 if (progress)
-ir3_optimize_loop(s);
+ir3_optimize_loop(so->shader->compiler, s);
 /* Fixup indirect load_uniform's which end up with a const base offset
  * which is too large to encode. Do this late(ish) so we actually
  * can differentiate indirect vs non-indirect.
  */
 if (OPT(s, ir3_nir_fixup_load_uniform))
-ir3_optimize_loop(s);
+ir3_optimize_loop(so->shader->compiler, s);
 /* Do late algebraic optimization to turn add(a, neg(b)) back into
  * subs, then the mandatory cleanup after algebraic. Note that it may
View File

@@ -52,7 +52,7 @@ void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, u
 void ir3_nir_lower_gs(nir_shader *shader);
 const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
-void ir3_optimize_loop(nir_shader *s);
+void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
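To illustrate why the robust_modes change in ir3_optimize_loop() above has to constrain vectorization, here is a hypothetical, self-contained C model (not driver code) of per-access bounds checking under robustBufferAccess2-style semantics: merging two scalar UBO loads into one wider load can change what an out-of-bounds component returns.

#include <stdint.h>
#include <stdio.h>

/* Toy model of a bounds-checked scalar UBO read: an out-of-bounds dword
 * reads back as zero, as robustBufferAccess2 requires. */
static uint32_t
robust_load_dword(const uint32_t *ubo, unsigned size_dwords, unsigned offset)
{
   return offset < size_dwords ? ubo[offset] : 0;
}

int
main(void)
{
   uint32_t ubo[3] = { 10, 20, 30 };

   /* Two scalar loads: offset 2 is in bounds, offset 3 is not. */
   uint32_t x = robust_load_dword(ubo, 3, 2);  /* -> 30 */
   uint32_t y = robust_load_dword(ubo, 3, 3);  /* -> 0 (bounds-checked) */

   /* A vectorized vec2 load at offset 2 would do a single bounds check for
    * both components, so it cannot produce the required {30, 0} result:
    * either the whole vector is rejected or the load reads past the buffer.
    * Roughly speaking, that is why setting robust_modes = nir_var_mem_ubo
    * tells the NIR vectorizer to be conservative about merging UBO loads. */
   printf("%u %u\n", (unsigned)x, (unsigned)y);
   return 0;
}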

View File

@@ -181,7 +181,7 @@ main(int argc, char **argv)
 struct ir3_compiler *c;
 int result = 0;
-c = ir3_compiler_create(NULL, 630);
+c = ir3_compiler_create(NULL, 630, false);
 for (int i = 0; i < ARRAY_SIZE(tests); i++) {
 const struct test *test = &tests[i];

View File

@@ -393,7 +393,7 @@ main(int argc, char **argv)
 unsigned gen = test->gpu_id / 100;
 if (!compilers[gen]) {
-compilers[gen] = ir3_compiler_create(NULL, test->gpu_id);
+compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
 }
 FILE *fasm = fmemopen((void *)test->expected, strlen(test->expected), "r");

View File

@@ -1078,6 +1078,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 struct tu_device *device;
 bool custom_border_colors = false;
 bool perf_query_pools = false;
+bool robust_buffer_access2 = false;
 /* Check enabled features */
 if (pCreateInfo->pEnabledFeatures) {
@@ -1110,6 +1111,11 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 perf_query_pools = feature->performanceCounterQueryPools;
 break;
 }
+case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: {
+VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext;
+robust_buffer_access2 = features->robustBufferAccess2;
+break;
+}
 default:
 break;
 }
@@ -1166,7 +1172,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 }
 }
-device->compiler = ir3_compiler_create(NULL, physical_device->gpu_id);
+device->compiler = ir3_compiler_create(NULL, physical_device->gpu_id,
+robust_buffer_access2);
 if (!device->compiler) {
 result = vk_startup_errorf(physical_device->instance,
 VK_ERROR_INITIALIZATION_FAILED,

View File

@@ -192,7 +192,7 @@ tu_spirv_to_nir(struct tu_device *dev,
 NIR_PASS_V(nir, nir_lower_frexp);
-ir3_optimize_loop(nir);
+ir3_optimize_loop(dev->compiler, nir);
 return nir;
 }

View File

@@ -362,7 +362,7 @@ main(int argc, char **argv)
 nir_shader *nir;
-compiler = ir3_compiler_create(NULL, gpu_id);
+compiler = ir3_compiler_create(NULL, gpu_id, false);
 if (from_tgsi) {
 struct tgsi_token toks[65536];

View File

@@ -515,7 +515,7 @@ ir3_screen_init(struct pipe_screen *pscreen)
 {
 struct fd_screen *screen = fd_screen(pscreen);
-screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id);
+screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id, false);
 /* TODO do we want to limit things to # of fast cores, or just limit
  * based on total # of both big and little cores. The little cores