iris: Implement ARB_compute_variable_group_size
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4794>
This commit is contained in:
parent
e645bc6939
commit
33c61eb2f1
|
@ -0,0 +1 @@
|
||||||
|
GL_ARB_compute_variable_group_size on Iris.
|
|
@ -96,6 +96,7 @@ iris_lost_context_state(struct iris_batch *batch)
|
||||||
|
|
||||||
ice->state.dirty = ~0ull;
|
ice->state.dirty = ~0ull;
|
||||||
ice->state.current_hash_scale = 0;
|
ice->state.current_hash_scale = 0;
|
||||||
|
memset(ice->state.last_block, 0, sizeof(ice->state.last_block));
|
||||||
memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
|
memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
|
||||||
batch->last_surface_base_address = ~0ull;
|
batch->last_surface_base_address = ~0ull;
|
||||||
batch->last_aux_map_state = 0;
|
batch->last_aux_map_state = 0;
|
||||||
|
|
|
@ -639,6 +639,9 @@ struct iris_context {
|
||||||
|
|
||||||
bool window_space_position;
|
bool window_space_position;
|
||||||
|
|
||||||
|
/** The last compute group size */
|
||||||
|
uint32_t last_block[3];
|
||||||
|
|
||||||
/** The last compute grid size */
|
/** The last compute grid size */
|
||||||
uint32_t last_grid[3];
|
uint32_t last_grid[3];
|
||||||
/** Reference to the BO containing the compute grid size */
|
/** Reference to the BO containing the compute grid size */
|
||||||
|
|
|
@ -355,6 +355,12 @@ iris_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid)
|
||||||
|
|
||||||
iris_update_compiled_compute_shader(ice);
|
iris_update_compiled_compute_shader(ice);
|
||||||
|
|
||||||
|
if (memcmp(ice->state.last_block, grid->block, sizeof(grid->block)) != 0) {
|
||||||
|
memcpy(ice->state.last_block, grid->block, sizeof(grid->block));
|
||||||
|
ice->state.dirty |= IRIS_DIRTY_CONSTANTS_CS;
|
||||||
|
ice->state.shaders[MESA_SHADER_COMPUTE].sysvals_need_upload = true;
|
||||||
|
}
|
||||||
|
|
||||||
iris_update_grid_size_resource(ice, grid);
|
iris_update_grid_size_resource(ice, grid);
|
||||||
|
|
||||||
iris_binder_reserve_compute(ice);
|
iris_binder_reserve_compute(ice);
|
||||||
|
|
|
@ -393,6 +393,7 @@ iris_setup_uniforms(const struct brw_compiler *compiler,
|
||||||
unsigned patch_vert_idx = -1;
|
unsigned patch_vert_idx = -1;
|
||||||
unsigned ucp_idx[IRIS_MAX_CLIP_PLANES];
|
unsigned ucp_idx[IRIS_MAX_CLIP_PLANES];
|
||||||
unsigned img_idx[PIPE_MAX_SHADER_IMAGES];
|
unsigned img_idx[PIPE_MAX_SHADER_IMAGES];
|
||||||
|
unsigned variable_group_size_idx = -1;
|
||||||
memset(ucp_idx, -1, sizeof(ucp_idx));
|
memset(ucp_idx, -1, sizeof(ucp_idx));
|
||||||
memset(img_idx, -1, sizeof(img_idx));
|
memset(img_idx, -1, sizeof(img_idx));
|
||||||
|
|
||||||
|
@ -516,6 +517,21 @@ iris_setup_uniforms(const struct brw_compiler *compiler,
|
||||||
nir_intrinsic_base(intrin) * 16));
|
nir_intrinsic_base(intrin) * 16));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case nir_intrinsic_load_local_group_size: {
|
||||||
|
assert(nir->info.cs.local_size_variable);
|
||||||
|
if (variable_group_size_idx == -1) {
|
||||||
|
variable_group_size_idx = num_system_values;
|
||||||
|
num_system_values += 3;
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
system_values[variable_group_size_idx + i] =
|
||||||
|
BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
b.cursor = nir_before_instr(instr);
|
||||||
|
offset = nir_imm_int(&b, variable_group_size_idx * sizeof(uint32_t));
|
||||||
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1947,6 +1963,11 @@ iris_compile_cs(struct iris_context *ice,
|
||||||
|
|
||||||
nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
|
nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
|
||||||
|
|
||||||
|
if (nir->info.cs.local_size_variable) {
|
||||||
|
nir->info.cs.max_variable_local_size =
|
||||||
|
iris_get_max_var_invocations(screen);
|
||||||
|
}
|
||||||
|
|
||||||
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
|
NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
|
||||||
|
|
||||||
iris_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
|
iris_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
|
||||||
|
|
|
@ -443,6 +443,32 @@ iris_get_shader_param(struct pipe_screen *pscreen,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static unsigned
|
||||||
|
get_max_threads(const struct gen_device_info *devinfo)
|
||||||
|
{
|
||||||
|
/* Limit max_threads to 64 for the GPGPU_WALKER command. */
|
||||||
|
return MIN2(64, devinfo->max_cs_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t
|
||||||
|
iris_get_max_var_invocations(const struct iris_screen *screen)
|
||||||
|
{
|
||||||
|
const unsigned max_threads = get_max_threads(&screen->devinfo);
|
||||||
|
|
||||||
|
/* Constants used for ARB_compute_variable_group_size. The compiler will
|
||||||
|
* use the maximum to decide which SIMDs can be used. If we top this like
|
||||||
|
* max_invocations, that would prevent SIMD8 / SIMD16 to be considered.
|
||||||
|
*
|
||||||
|
* TODO: To avoid the trade off above between having the lower maximum
|
||||||
|
* vs. always using SIMD32, keep all three shader variants (for each SIMD)
|
||||||
|
* and select a suitable one at dispatch time.
|
||||||
|
*/
|
||||||
|
const uint32_t max_var_invocations =
|
||||||
|
(max_threads >= 64 ? 8 : (max_threads >= 32 ? 16 : 32)) * max_threads;
|
||||||
|
assert(max_var_invocations >= 512);
|
||||||
|
return max_var_invocations;
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
iris_get_compute_param(struct pipe_screen *pscreen,
|
iris_get_compute_param(struct pipe_screen *pscreen,
|
||||||
enum pipe_shader_ir ir_type,
|
enum pipe_shader_ir ir_type,
|
||||||
|
@ -450,9 +476,8 @@ iris_get_compute_param(struct pipe_screen *pscreen,
|
||||||
void *ret)
|
void *ret)
|
||||||
{
|
{
|
||||||
struct iris_screen *screen = (struct iris_screen *)pscreen;
|
struct iris_screen *screen = (struct iris_screen *)pscreen;
|
||||||
const struct gen_device_info *devinfo = &screen->devinfo;
|
|
||||||
|
|
||||||
const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
|
const unsigned max_threads = get_max_threads(&screen->devinfo);
|
||||||
const uint32_t max_invocations = 32 * max_threads;
|
const uint32_t max_invocations = 32 * max_threads;
|
||||||
|
|
||||||
#define RET(x) do { \
|
#define RET(x) do { \
|
||||||
|
@ -494,13 +519,16 @@ iris_get_compute_param(struct pipe_screen *pscreen,
|
||||||
case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
|
case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
|
||||||
RET((uint32_t []) { BRW_SUBGROUP_SIZE });
|
RET((uint32_t []) { BRW_SUBGROUP_SIZE });
|
||||||
|
|
||||||
|
case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
|
||||||
|
/* MaxComputeVariableGroupInvocations */
|
||||||
|
RET((uint64_t []) { iris_get_max_var_invocations(screen) });
|
||||||
|
|
||||||
case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
|
case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
|
||||||
case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
|
case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
|
||||||
case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
|
case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
|
||||||
case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
|
case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
|
||||||
case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
|
case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
|
||||||
case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
|
case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
|
||||||
case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
|
|
||||||
// XXX: I think these are for Clover...
|
// XXX: I think these are for Clover...
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
|
@ -227,4 +227,6 @@ iris_is_format_supported(struct pipe_screen *pscreen,
|
||||||
|
|
||||||
void iris_disk_cache_init(struct iris_screen *screen);
|
void iris_disk_cache_init(struct iris_screen *screen);
|
||||||
|
|
||||||
|
uint32_t iris_get_max_var_invocations(const struct iris_screen *screen);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -3214,6 +3214,10 @@ upload_sysvals(struct iris_context *ice,
|
||||||
value = fui(ice->state.default_inner_level[0]);
|
value = fui(ice->state.default_inner_level[0]);
|
||||||
} else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
|
} else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
|
||||||
value = fui(ice->state.default_inner_level[1]);
|
value = fui(ice->state.default_inner_level[1]);
|
||||||
|
} else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
|
||||||
|
sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
|
||||||
|
unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
|
||||||
|
value = ice->state.last_block[i];
|
||||||
} else {
|
} else {
|
||||||
assert(!"unhandled system value");
|
assert(!"unhandled system value");
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue