ir3, tu: Add compiler flag for robust UBO behavior

This needs to live in the compiler because the compiler is the only
piece we always have access to in all the places ir3_optimize_loop()
is called, and the feature is only ever enabled for the whole Vulkan
device. Right now the flag is only used to constrain vectorization,
but the next commit adds another use.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7573>
Connor Abbott 2020-11-10 17:59:03 +01:00
parent 8f54028479
commit c68ea960a7
12 changed files with 37 additions and 18 deletions
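For context, a minimal sketch (not part of this commit) of how a Vulkan application opts into the robustBufferAccess2 behavior that the turnip change below picks up. The helper name create_robust_device and the caller-supplied queue_info are assumptions for illustration; extension support is assumed to have been queried beforehand.

#include <vulkan/vulkan.h>

/* Hypothetical helper: create a VkDevice with VK_EXT_robustness2's
 * robustBufferAccess2 feature enabled. tu_CreateDevice (see the diff below)
 * walks the pNext chain of VkDeviceCreateInfo, finds this struct, and
 * forwards the robustBufferAccess2 bit into ir3_compiler_create(). */
static VkResult
create_robust_device(VkPhysicalDevice physical_device,
                     const VkDeviceQueueCreateInfo *queue_info,
                     VkDevice *out_device)
{
   VkPhysicalDeviceRobustness2FeaturesEXT robustness2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT,
      .robustBufferAccess2 = VK_TRUE,
   };
   const char *exts[] = { VK_EXT_ROBUSTNESS_2_EXTENSION_NAME };

   VkDeviceCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
      .pNext = &robustness2,
      .queueCreateInfoCount = 1,
      .pQueueCreateInfos = queue_info,
      .enabledExtensionCount = 1,
      .ppEnabledExtensionNames = exts,
   };

   return vkCreateDevice(physical_device, &info, NULL, out_device);
}

Because robustBufferAccess2 is requested per device, the flag is known at device creation time and can simply be baked into the ir3_compiler instance, which is then available everywhere ir3_optimize_loop() runs.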

View File

@@ -490,7 +490,7 @@ a6xx_init(struct fd_device *dev, uint32_t gpu_id)
 .read_perfcntrs = a6xx_read_perfcntrs,
 };
-a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id);
+a6xx_backend->compiler = ir3_compiler_create(dev, gpu_id, false);
 a6xx_backend->dev = dev;
 a6xx_backend->control_mem = fd_bo_new(dev, 0x1000,

View File

@@ -63,7 +63,7 @@ ir3_compiler_destroy(struct ir3_compiler *compiler)
 }
 struct ir3_compiler *
-ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
+ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_access)
 {
 struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
@@ -77,6 +77,7 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 compiler->dev = dev;
 compiler->gpu_id = gpu_id;
+compiler->robust_ubo_access = robust_ubo_access;
 compiler->set = ir3_ra_alloc_reg_set(compiler, false);
 /* All known GPU's have 32k local memory (aka shared) */

View File

@@ -44,6 +44,11 @@ struct ir3_compiler {
 struct disk_cache *disk_cache;
+/* If true, UBO accesses are assumed to be bounds-checked as defined by
+ * VK_EXT_robustness2 and optimizations may have to be more conservative.
+ */
+bool robust_ubo_access;
 /*
  * Configuration options for things that are handled differently on
  * different generations:
@@ -153,7 +158,8 @@ struct ir3_compiler {
 };
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
-struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
+bool robust_ubo_access);
 void ir3_disk_cache_init(struct ir3_compiler *compiler);
 void ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
@@ -190,6 +196,9 @@ enum ir3_shader_debug {
 /* DEBUG-only options: */
 IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
 IR3_DBG_RAMSGS = BITFIELD_BIT(21),
+/* Only used for the disk-caching logic: */
+IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
 };
 extern enum ir3_shader_debug ir3_shader_debug;

View File

@@ -67,7 +67,9 @@ ir3_disk_cache_init(struct ir3_compiler *compiler)
 char timestamp[41];
 _mesa_sha1_format(timestamp, id_sha1);
-const uint64_t driver_flags = ir3_shader_debug;
+uint64_t driver_flags = ir3_shader_debug;
+if (compiler->robust_ubo_access)
+driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
 compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
 }

View File

@@ -190,7 +190,7 @@ ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 void
-ir3_optimize_loop(nir_shader *s)
+ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
 {
 bool progress;
 unsigned lower_flrp =
@@ -227,7 +227,7 @@ ir3_optimize_loop(nir_shader *s)
 nir_load_store_vectorize_options vectorize_opts = {
 .modes = nir_var_mem_ubo,
 .callback = ir3_nir_should_vectorize_mem,
-.robust_modes = 0,
+.robust_modes = compiler->robust_ubo_access ? nir_var_mem_ubo : 0,
 };
 progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts);
@@ -315,7 +315,7 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
 if (compiler->gpu_id < 500)
 OPT_V(s, ir3_nir_lower_tg4_to_tex);
-ir3_optimize_loop(s);
+ir3_optimize_loop(compiler, s);
 /* do idiv lowering after first opt loop to get a chance to propagate
  * constants for divide by immed power-of-two:
@@ -327,7 +327,7 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
 const bool idiv_progress = OPT(s, nir_lower_idiv, &idiv_options);
 if (idiv_progress)
-ir3_optimize_loop(s);
+ir3_optimize_loop(compiler, s);
 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
@@ -375,7 +375,7 @@ ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s)
  */
 OPT_V(s, ir3_nir_apply_trig_workarounds);
-ir3_optimize_loop(s);
+ir3_optimize_loop(compiler, s);
 }
 static bool
@@ -523,14 +523,14 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
 OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);
 if (progress)
-ir3_optimize_loop(s);
+ir3_optimize_loop(so->shader->compiler, s);
 /* Fixup indirect load_uniform's which end up with a const base offset
  * which is too large to encode. Do this late(ish) so we actually
  * can differentiate indirect vs non-indirect.
  */
 if (OPT(s, ir3_nir_fixup_load_uniform))
-ir3_optimize_loop(s);
+ir3_optimize_loop(so->shader->compiler, s);
 /* Do late algebraic optimization to turn add(a, neg(b)) back into
  * subs, then the mandatory cleanup after algebraic. Note that it may
View File

@@ -52,7 +52,7 @@ void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, u
 void ir3_nir_lower_gs(nir_shader *shader);
 const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
-void ir3_optimize_loop(nir_shader *s);
+void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
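To illustrate why the robust_modes change in ir3_optimize_loop() above has to constrain vectorization, here is a hypothetical, self-contained C model (not driver code) of per-access bounds checking under robustBufferAccess2-style semantics: merging two scalar UBO loads into one wider load can change what an out-of-bounds component returns.

#include <stdint.h>
#include <stdio.h>

/* Toy model of a bounds-checked scalar UBO read: an out-of-bounds dword
 * reads back as zero, as robustBufferAccess2 requires. */
static uint32_t
robust_load_dword(const uint32_t *ubo, unsigned size_dwords, unsigned offset)
{
   return offset < size_dwords ? ubo[offset] : 0;
}

int
main(void)
{
   uint32_t ubo[3] = { 10, 20, 30 };

   /* Two scalar loads: offset 2 is in bounds, offset 3 is not. */
   uint32_t x = robust_load_dword(ubo, 3, 2);  /* -> 30 */
   uint32_t y = robust_load_dword(ubo, 3, 3);  /* -> 0 (bounds-checked) */

   /* A vectorized vec2 load at offset 2 would do a single bounds check for
    * both components, so it cannot produce the required {30, 0} result:
    * either the whole vector is rejected or the load reads past the buffer.
    * Roughly speaking, that is why setting robust_modes = nir_var_mem_ubo
    * tells the NIR vectorizer to be conservative about merging UBO loads. */
   printf("%u %u\n", (unsigned)x, (unsigned)y);
   return 0;
}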

View File

@@ -181,7 +181,7 @@ main(int argc, char **argv)
 struct ir3_compiler *c;
 int result = 0;
-c = ir3_compiler_create(NULL, 630);
+c = ir3_compiler_create(NULL, 630, false);
 for (int i = 0; i < ARRAY_SIZE(tests); i++) {
 const struct test *test = &tests[i];

View File

@@ -393,7 +393,7 @@ main(int argc, char **argv)
 unsigned gen = test->gpu_id / 100;
 if (!compilers[gen]) {
-compilers[gen] = ir3_compiler_create(NULL, test->gpu_id);
+compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
 }
 FILE *fasm = fmemopen((void *)test->expected, strlen(test->expected), "r");

View File

@@ -1078,6 +1078,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 struct tu_device *device;
 bool custom_border_colors = false;
 bool perf_query_pools = false;
+bool robust_buffer_access2 = false;
 /* Check enabled features */
 if (pCreateInfo->pEnabledFeatures) {
@@ -1110,6 +1111,11 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 perf_query_pools = feature->performanceCounterQueryPools;
 break;
 }
+case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: {
+VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext;
+robust_buffer_access2 = features->robustBufferAccess2;
+break;
+}
 default:
 break;
 }
@@ -1166,7 +1172,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 }
 }
-device->compiler = ir3_compiler_create(NULL, physical_device->gpu_id);
+device->compiler = ir3_compiler_create(NULL, physical_device->gpu_id,
+robust_buffer_access2);
 if (!device->compiler) {
 result = vk_startup_errorf(physical_device->instance,
 VK_ERROR_INITIALIZATION_FAILED,

View File

@@ -192,7 +192,7 @@ tu_spirv_to_nir(struct tu_device *dev,
 NIR_PASS_V(nir, nir_lower_frexp);
-ir3_optimize_loop(nir);
+ir3_optimize_loop(dev->compiler, nir);
 return nir;
 }

View File

@@ -362,7 +362,7 @@ main(int argc, char **argv)
 nir_shader *nir;
-compiler = ir3_compiler_create(NULL, gpu_id);
+compiler = ir3_compiler_create(NULL, gpu_id, false);
 if (from_tgsi) {
 struct tgsi_token toks[65536];

View File

@@ -515,7 +515,7 @@ ir3_screen_init(struct pipe_screen *pscreen)
 {
 struct fd_screen *screen = fd_screen(pscreen);
-screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id);
+screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id, false);
 /* TODO do we want to limit things to # of fast cores, or just limit
  * based on total # of both big and little cores. The little cores