intel/cs: Re-run final NIR optimizations for each SIMD size
With the advent of SPIR-V subgroup operations, compute shaders will have to be slightly different depending on the SIMD size at which they execute. In order to allow us to do dispatch-width specific things in NIR, we re-run the final NIR stages for each SIMD width. One side-effect of this change is that we start rallocing fs_visitors which means we need DECLARE_RALLOC_CXX_OPERATORS. Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
This commit is contained in:
parent
4e79a77cdc
commit
6411defdcd
|
@ -6824,6 +6824,20 @@ cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
|
||||||
cs_prog_data->threads = (group_size + size - 1) / size;
|
cs_prog_data->threads = (group_size + size - 1) / size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static nir_shader *
|
||||||
|
compile_cs_to_nir(const struct brw_compiler *compiler,
|
||||||
|
void *mem_ctx,
|
||||||
|
const struct brw_cs_prog_key *key,
|
||||||
|
struct brw_cs_prog_data *prog_data,
|
||||||
|
const nir_shader *src_shader,
|
||||||
|
unsigned dispatch_width)
|
||||||
|
{
|
||||||
|
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
|
||||||
|
shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
|
||||||
|
brw_nir_lower_cs_intrinsics(shader);
|
||||||
|
return brw_postprocess_nir(shader, compiler, true);
|
||||||
|
}
|
||||||
|
|
||||||
const unsigned *
|
const unsigned *
|
||||||
brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
|
brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
|
||||||
void *mem_ctx,
|
void *mem_ctx,
|
||||||
|
@ -6833,17 +6847,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
|
||||||
int shader_time_index,
|
int shader_time_index,
|
||||||
char **error_str)
|
char **error_str)
|
||||||
{
|
{
|
||||||
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
|
prog_data->local_size[0] = src_shader->info.cs.local_size[0];
|
||||||
shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
|
prog_data->local_size[1] = src_shader->info.cs.local_size[1];
|
||||||
brw_nir_lower_cs_intrinsics(shader);
|
prog_data->local_size[2] = src_shader->info.cs.local_size[2];
|
||||||
shader = brw_postprocess_nir(shader, compiler, true);
|
|
||||||
|
|
||||||
prog_data->local_size[0] = shader->info.cs.local_size[0];
|
|
||||||
prog_data->local_size[1] = shader->info.cs.local_size[1];
|
|
||||||
prog_data->local_size[2] = shader->info.cs.local_size[2];
|
|
||||||
unsigned local_workgroup_size =
|
unsigned local_workgroup_size =
|
||||||
shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
|
src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
|
||||||
shader->info.cs.local_size[2];
|
src_shader->info.cs.local_size[2];
|
||||||
|
|
||||||
unsigned min_dispatch_width =
|
unsigned min_dispatch_width =
|
||||||
DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
|
DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
|
||||||
|
@ -6851,39 +6860,47 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
|
||||||
min_dispatch_width = util_next_power_of_two(min_dispatch_width);
|
min_dispatch_width = util_next_power_of_two(min_dispatch_width);
|
||||||
assert(min_dispatch_width <= 32);
|
assert(min_dispatch_width <= 32);
|
||||||
|
|
||||||
|
fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
|
||||||
cfg_t *cfg = NULL;
|
cfg_t *cfg = NULL;
|
||||||
const char *fail_msg = NULL;
|
const char *fail_msg = NULL;
|
||||||
|
unsigned promoted_constants;
|
||||||
|
|
||||||
/* Now the main event: Visit the shader IR and generate our CS IR for it.
|
/* Now the main event: Visit the shader IR and generate our CS IR for it.
|
||||||
*/
|
*/
|
||||||
fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
|
|
||||||
NULL, /* Never used in core profile */
|
|
||||||
shader, 8, shader_time_index);
|
|
||||||
if (min_dispatch_width <= 8) {
|
if (min_dispatch_width <= 8) {
|
||||||
if (!v8.run_cs(min_dispatch_width)) {
|
nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
|
||||||
fail_msg = v8.fail_msg;
|
prog_data, src_shader, 8);
|
||||||
|
v8 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
|
||||||
|
NULL, /* Never used in core profile */
|
||||||
|
nir8, 8, shader_time_index);
|
||||||
|
if (!v8->run_cs(min_dispatch_width)) {
|
||||||
|
fail_msg = v8->fail_msg;
|
||||||
} else {
|
} else {
|
||||||
/* We should always be able to do SIMD32 for compute shaders */
|
/* We should always be able to do SIMD32 for compute shaders */
|
||||||
assert(v8.max_dispatch_width >= 32);
|
assert(v8->max_dispatch_width >= 32);
|
||||||
|
|
||||||
cfg = v8.cfg;
|
cfg = v8->cfg;
|
||||||
cs_set_simd_size(prog_data, 8);
|
cs_set_simd_size(prog_data, 8);
|
||||||
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
||||||
|
promoted_constants = v8->promoted_constants;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
|
|
||||||
NULL, /* Never used in core profile */
|
|
||||||
shader, 16, shader_time_index);
|
|
||||||
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
|
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
|
||||||
!fail_msg && min_dispatch_width <= 16) {
|
!fail_msg && min_dispatch_width <= 16) {
|
||||||
/* Try a SIMD16 compile */
|
/* Try a SIMD16 compile */
|
||||||
if (min_dispatch_width <= 8)
|
nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
|
||||||
v16.import_uniforms(&v8);
|
prog_data, src_shader, 16);
|
||||||
if (!v16.run_cs(min_dispatch_width)) {
|
v16 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
|
||||||
|
NULL, /* Never used in core profile */
|
||||||
|
nir16, 16, shader_time_index);
|
||||||
|
if (v8)
|
||||||
|
v16->import_uniforms(v8);
|
||||||
|
|
||||||
|
if (!v16->run_cs(min_dispatch_width)) {
|
||||||
compiler->shader_perf_log(log_data,
|
compiler->shader_perf_log(log_data,
|
||||||
"SIMD16 shader failed to compile: %s",
|
"SIMD16 shader failed to compile: %s",
|
||||||
v16.fail_msg);
|
v16->fail_msg);
|
||||||
if (!cfg) {
|
if (!cfg) {
|
||||||
fail_msg =
|
fail_msg =
|
||||||
"Couldn't generate SIMD16 program and not "
|
"Couldn't generate SIMD16 program and not "
|
||||||
|
@ -6891,37 +6908,44 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* We should always be able to do SIMD32 for compute shaders */
|
/* We should always be able to do SIMD32 for compute shaders */
|
||||||
assert(v16.max_dispatch_width >= 32);
|
assert(v16->max_dispatch_width >= 32);
|
||||||
|
|
||||||
cfg = v16.cfg;
|
cfg = v16->cfg;
|
||||||
cs_set_simd_size(prog_data, 16);
|
cs_set_simd_size(prog_data, 16);
|
||||||
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
||||||
|
promoted_constants = v16->promoted_constants;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base,
|
/* We should always be able to do SIMD32 for compute shaders */
|
||||||
NULL, /* Never used in core profile */
|
assert(!v16 || v16->max_dispatch_width >= 32);
|
||||||
shader, 32, shader_time_index);
|
|
||||||
if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
|
if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
|
||||||
/* Try a SIMD32 compile */
|
/* Try a SIMD32 compile */
|
||||||
if (min_dispatch_width <= 8)
|
nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
|
||||||
v32.import_uniforms(&v8);
|
prog_data, src_shader, 32);
|
||||||
else if (min_dispatch_width <= 16)
|
v32 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
|
||||||
v32.import_uniforms(&v16);
|
NULL, /* Never used in core profile */
|
||||||
|
nir32, 32, shader_time_index);
|
||||||
|
if (v8)
|
||||||
|
v32->import_uniforms(v8);
|
||||||
|
else if (v16)
|
||||||
|
v32->import_uniforms(v16);
|
||||||
|
|
||||||
if (!v32.run_cs(min_dispatch_width)) {
|
if (!v32->run_cs(min_dispatch_width)) {
|
||||||
compiler->shader_perf_log(log_data,
|
compiler->shader_perf_log(log_data,
|
||||||
"SIMD32 shader failed to compile: %s",
|
"SIMD32 shader failed to compile: %s",
|
||||||
v16.fail_msg);
|
v16->fail_msg);
|
||||||
if (!cfg) {
|
if (!cfg) {
|
||||||
fail_msg =
|
fail_msg =
|
||||||
"Couldn't generate SIMD32 program and not "
|
"Couldn't generate SIMD32 program and not "
|
||||||
"enough threads for SIMD16";
|
"enough threads for SIMD16";
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
cfg = v32.cfg;
|
cfg = v32->cfg;
|
||||||
cs_set_simd_size(prog_data, 32);
|
cs_set_simd_size(prog_data, 32);
|
||||||
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
cs_fill_push_const_info(compiler->devinfo, prog_data);
|
||||||
|
promoted_constants = v32->promoted_constants;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6932,12 +6956,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
|
||||||
*error_str = ralloc_strdup(mem_ctx, fail_msg);
|
*error_str = ralloc_strdup(mem_ctx, fail_msg);
|
||||||
} else {
|
} else {
|
||||||
fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
|
fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
|
||||||
v8.promoted_constants, false, MESA_SHADER_COMPUTE);
|
promoted_constants, false, MESA_SHADER_COMPUTE);
|
||||||
if (INTEL_DEBUG & DEBUG_CS) {
|
if (INTEL_DEBUG & DEBUG_CS) {
|
||||||
char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
|
char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
|
||||||
shader->info.label ? shader->info.label :
|
src_shader->info.label ?
|
||||||
"unnamed",
|
src_shader->info.label : "unnamed",
|
||||||
shader->info.name);
|
src_shader->info.name);
|
||||||
g.enable_debug(name);
|
g.enable_debug(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6946,6 +6970,10 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
|
||||||
ret = g.get_assembly(&prog_data->base.program_size);
|
ret = g.get_assembly(&prog_data->base.program_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
delete v8;
|
||||||
|
delete v16;
|
||||||
|
delete v32;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue