i965: Implement ARB_compute_variable_group_size
This patch adds the implementation of ARB_compute_variable_group_size for i965. We do this by storing the local group size in a push constant.

Additional changes made by Caio Marcelo de Oliveira Filho.

Signed-off-by: Plamena Manolova <plamena.manolova@intel.com>
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4504>
parent c77dc51203
commit 5664bd6db3
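For illustration (not part of the commit): a hypothetical application-side sketch of what the new dispatch path looks like. The libepoxy include, the compile_compute_program() helper, and the shader source are assumptions for the example. The shader opts into a variable local size with layout(local_size_variable), so the group size is only known when glDispatchComputeGroupSizeARB() is called; that is why the driver has to hand it to the shader as a push constant instead of baking it into the compiled program.

/* Hypothetical usage sketch, not part of this commit. */
#include <epoxy/gl.h>

GLuint compile_compute_program(const char *src);   /* assumed helper, not shown */

static const char *variable_size_cs_src =
   "#version 430\n"
   "#extension GL_ARB_compute_variable_group_size : require\n"
   "layout(local_size_variable) in;\n"
   "layout(std430, binding = 0) buffer Data { float values[]; };\n"
   "void main() {\n"
   "   values[gl_GlobalInvocationID.x] *= 2.0;\n"
   "}\n";

void
dispatch_with_variable_group_size(void)
{
   GLuint prog = compile_compute_program(variable_size_cs_src);
   glUseProgram(prog);

   /* 16 work groups in X, each 64x1x1 invocations.  The local size is only
    * chosen here, at dispatch time, so the driver reads it back and uploads
    * it as a push constant. */
   glDispatchComputeGroupSizeARB(16, 1, 1, 64, 1, 1);
   glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
}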
@@ -299,7 +299,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
   GL_ARB_bindless_texture                               DONE (nvc0, radeonsi)
   GL_ARB_cl_event                                       not started
-  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
+  GL_ARB_compute_variable_group_size                    DONE (i965/gen7+, nvc0, radeonsi)
   GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+, radeonsi, virgl)
   GL_ARB_fragment_shader_interlock                      DONE (i965)
   GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
@@ -1,3 +1,4 @@
+GL_ARB_compute_variable_group_size on i965.
 GL_EXT_texture_shadow_lod on radeonsi.
 GL_NV_copy_image on all gallium drivers.
 VK_KHR_shader_non_semantic_info on Intel, RADV.
@@ -101,6 +101,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups) {

    brw->compute.num_work_groups_bo = NULL;
    brw->compute.num_work_groups = num_groups;
+   brw->compute.group_size = NULL;
    ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

    brw_dispatch_compute_common(ctx);
@@ -120,6 +121,22 @@ brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect)
    brw->compute.num_work_groups_bo = bo;
    brw->compute.num_work_groups_offset = indirect;
    brw->compute.num_work_groups = indirect_group_counts;
+   brw->compute.group_size = NULL;
    ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

    brw_dispatch_compute_common(ctx);
 }

+static void
+brw_dispatch_compute_group_size(struct gl_context *ctx,
+                                const GLuint *num_groups,
+                                const GLuint *group_size)
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   brw->compute.num_work_groups_bo = NULL;
+   brw->compute.num_work_groups = num_groups;
+   brw->compute.group_size = group_size;
+   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
+
+   brw_dispatch_compute_common(ctx);
+}
@@ -130,4 +147,5 @@ brw_init_compute_functions(struct dd_function_table *functions)
 {
    functions->DispatchCompute = brw_dispatch_compute;
    functions->DispatchComputeIndirect = brw_dispatch_compute_indirect;
+   functions->DispatchComputeGroupSize = brw_dispatch_compute_group_size;
 }
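For reference, DispatchComputeGroupSize is the dd_function_table hook that core Mesa invokes for glDispatchComputeGroupSizeARB, so registering it here is what routes a dispatch like the sketch above into the new brw_dispatch_compute_group_size path.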
@@ -843,6 +843,24 @@ brw_initialize_cs_context_constants(struct brw_context *brw)
    ctx->Const.MaxComputeWorkGroupSize[2] = max_invocations;
    ctx->Const.MaxComputeWorkGroupInvocations = max_invocations;
    ctx->Const.MaxComputeSharedMemorySize = 64 * 1024;
+
+   /* Constants used for ARB_compute_variable_group_size. The compiler will
+    * use the maximum to decide which SIMD widths can be used. If we capped
+    * this at max_invocations, SIMD8 / SIMD16 could never be considered.
+    *
+    * TODO: To avoid the trade-off above between the lower maximum and always
+    * using SIMD32, keep all three shader variants (one per SIMD width) and
+    * select a suitable one at dispatch time.
+    */
+   if (devinfo->gen >= 7) {
+      const uint32_t max_var_invocations =
+         (max_threads >= 64 ? 8 : (max_threads >= 32 ? 16 : 32)) * max_threads;
+      assert(max_var_invocations >= 512);
+      ctx->Const.MaxComputeVariableGroupSize[0] = max_var_invocations;
+      ctx->Const.MaxComputeVariableGroupSize[1] = max_var_invocations;
+      ctx->Const.MaxComputeVariableGroupSize[2] = max_var_invocations;
+      ctx->Const.MaxComputeVariableGroupInvocations = max_var_invocations;
+   }
 }

 /**
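To make the trade-off above concrete (hypothetical numbers, not from the patch): a part with max_threads = 64 takes the SIMD8 branch of the expression and advertises 8 * 64 = 512 for MaxComputeVariableGroupInvocations, exactly the minimum the assert requires, while a part with max_threads = 36 takes the SIMD16 branch and advertises 16 * 36 = 576. Advertising the larger max_invocations instead would force the compiler to assume SIMD32 for every variable-group-size shader.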
@@ -925,6 +925,11 @@ struct brw_context
       struct brw_bo *num_work_groups_bo;
       GLintptr num_work_groups_offset;
       const GLuint *num_work_groups;
+      /**
+       * This is only used alongside ARB_compute_variable_group_size when the
+       * local work group size is variable, otherwise it's NULL.
+       */
+      const GLuint *group_size;
    } compute;

    struct {
@@ -32,6 +32,27 @@
 #include "brw_program.h"
 #include "compiler/glsl/ir_uniform.h"

+uint32_t
+brw_cs_group_size(const struct brw_context *brw)
+{
+   assert(brw->cs.base.prog_data);
+   struct brw_cs_prog_data *cs_prog_data =
+      brw_cs_prog_data(brw->cs.base.prog_data);
+
+   if (brw->compute.group_size) {
+      /* With ARB_compute_variable_group_size the group size is set at
+       * dispatch time, so we can't use the one provided by the compiler.
+       */
+      return brw->compute.group_size[0] *
+             brw->compute.group_size[1] *
+             brw->compute.group_size[2];
+   } else {
+      return cs_prog_data->local_size[0] *
+             cs_prog_data->local_size[1] *
+             cs_prog_data->local_size[2];
+   }
+}
+
 static void
 assign_cs_binding_table_offsets(const struct gen_device_info *devinfo,
                                 const struct gl_program *prog,
@@ -58,6 +79,7 @@ brw_codegen_cs_prog(struct brw_context *brw,
    struct brw_cs_prog_data prog_data;
    bool start_busy = false;
    double start_time = 0;
+   struct gl_context *gl_ctx = &brw->ctx;
    nir_shader *nir = nir_shader_clone(mem_ctx, cp->program.nir);

    memset(&prog_data, 0, sizeof(prog_data));
@@ -88,6 +110,17 @@ brw_codegen_cs_prog(struct brw_context *brw,
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
       st_index = brw_get_shader_time_index(brw, &cp->program, ST_CS, true);

+   /* If the work group size is variable we set it to the maximum here since
+    * the actual size is not known until the dispatch command is issued.
+    */
+   if (nir->info.cs.local_size_variable) {
+      prog_data.uses_variable_group_size = true;
+      nir->info.cs.max_variable_local_size =
+         gl_ctx->Const.MaxComputeWorkGroupSize[2];
+   } else {
+      prog_data.uses_variable_group_size = false;
+   }
+
    char *error_str;
    program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key,
                             &prog_data, nir, st_index, NULL, &error_str);
@@ -29,6 +29,9 @@
 extern "C" {
 #endif

+uint32_t
+brw_cs_group_size(const struct brw_context *brw);
+
 void
 brw_upload_cs_prog(struct brw_context *brw);

@@ -22,6 +22,7 @@
  */

 #include "brw_context.h"
+#include "brw_cs.h"
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_program.h"
@@ -62,6 +63,10 @@ brw_param_value(struct brw_context *brw,
       return f_as_u32(ctx->TessCtrlProgram.patch_default_inner_level[0]);
    } else if (param == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
       return f_as_u32(ctx->TessCtrlProgram.patch_default_inner_level[1]);
+   } else if (param >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
+              param <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
+      unsigned i = param - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
+      return brw->compute.group_size[i];
    } else {
       unreachable("Invalid param builtin");
    }
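This is where the "local group size in a push constant" from the commit message surfaces: for variable-group-size shaders the compiled program presumably lists BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X..Z among its parameters (the compiler-side change is not part of this diff), and brw_param_value resolves them from brw->compute.group_size, i.e. from the values recorded by brw_dispatch_compute_group_size at dispatch time.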
@@ -303,8 +308,11 @@ brw_upload_cs_push_constants(struct brw_context *brw,
    /* XXX: Should this happen somewhere before to get our state flag set? */
    _mesa_load_state_parameters(ctx, prog->Parameters);

+   const unsigned threads =
+      DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size);
    const unsigned push_const_size =
-      brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads);
+      brw_cs_push_const_total_size(cs_prog_data, threads);

    if (push_const_size == 0) {
       stage_state->push_const_size = 0;
       return;
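A quick sanity check on the new threads computation, with hypothetical numbers: a dispatch-time group size of 40x2x1 gives brw_cs_group_size() = 80 invocations, so at SIMD16 the group needs DIV_ROUND_UP(80, 16) = 5 hardware threads. The precomputed cs_prog_data->threads can no longer be used for variable-group-size shaders, since there is no fixed compile-time local size to derive it from.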
@@ -330,7 +338,7 @@ brw_upload_cs_push_constants(struct brw_context *brw,
    }

    if (cs_prog_data->push.per_thread.size > 0) {
-      for (unsigned t = 0; t < cs_prog_data->threads; t++) {
+      for (unsigned t = 0; t < threads; t++) {
         unsigned dst =
            8 * (cs_prog_data->push.per_thread.regs * t +
                 cs_prog_data->push.cross_thread.regs);
@@ -37,6 +37,7 @@
 #include "genX_boilerplate.h"

 #include "brw_context.h"
+#include "brw_cs.h"
 #include "brw_draw.h"
 #include "brw_multisample_state.h"
 #include "brw_state.h"
@@ -4263,6 +4264,12 @@ genX(upload_cs_state)(struct brw_context *brw)
    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
    const struct gen_device_info *devinfo = &brw->screen->devinfo;

+   const unsigned threads =
+      DIV_ROUND_UP(brw_cs_group_size(brw), cs_prog_data->simd_size);
+
+   if (!cs_prog_data->uses_variable_group_size)
+      assert(cs_prog_data->threads == threads);
+
    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
       brw_emit_buffer_surface_state(
          brw, &stage_state->surf_offset[
@@ -4353,13 +4360,13 @@ genX(upload_cs_state)(struct brw_context *brw)
       vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;

       const uint32_t vfe_curbe_allocation =
-         ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
+         ALIGN(cs_prog_data->push.per_thread.regs * threads +
               cs_prog_data->push.cross_thread.regs, 2);
       vfe.CURBEAllocationSize = vfe_curbe_allocation;
    }

    const unsigned push_const_size =
-      brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads);
+      brw_cs_push_const_total_size(cs_prog_data, threads);
    if (push_const_size > 0) {
       brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
@@ -4378,7 +4385,7 @@ genX(upload_cs_state)(struct brw_context *brw)
          DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
       .BindingTablePointer = stage_state->bind_bo_offset,
       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
-      .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
+      .NumberofThreadsinGPGPUThreadGroup = threads,
       .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
                                                prog_data->total_shared),
       .BarrierEnable = cs_prog_data->uses_barrier,
@@ -4484,9 +4491,9 @@ genX(emit_gpgpu_walker)(struct brw_context *brw)
    if (indirect)
       prepare_indirect_gpgpu_walker(brw);

+   const unsigned group_size = brw_cs_group_size(brw);
    const unsigned simd_size = prog_data->simd_size;
-   unsigned group_size = prog_data->local_size[0] *
-      prog_data->local_size[1] * prog_data->local_size[2];
    unsigned thread_width_max = DIV_ROUND_UP(group_size, simd_size);

    uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
    const unsigned right_non_aligned = group_size & (simd_size - 1);
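A worked example for the walker math above, with hypothetical numbers: a dispatch-time group size totalling 100 invocations at simd_size = 32 gives thread_width_max = DIV_ROUND_UP(100, 32) = 4 and right_non_aligned = 100 & 31 = 4, so each work group is walked as four threads, and the execution-mask setup that follows this hunk (not shown here) would leave only four channels enabled in the rightmost thread.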
@@ -4499,7 +4506,7 @@ genX(emit_gpgpu_walker)(struct brw_context *brw)
       ggw.SIMDSize = prog_data->simd_size / 16;
       ggw.ThreadDepthCounterMaximum = 0;
       ggw.ThreadHeightCounterMaximum = 0;
-      ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
+      ggw.ThreadWidthCounterMaximum = thread_width_max - 1;
       ggw.ThreadGroupIDXDimension = num_groups[0];
       ggw.ThreadGroupIDYDimension = num_groups[1];
       ggw.ThreadGroupIDZDimension = num_groups[2];
@@ -271,6 +271,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_ES3_1_compatibility =
          devinfo->gen >= 8 || devinfo->is_haswell;
       ctx->Extensions.NV_compute_shader_derivatives = true;
+      ctx->Extensions.ARB_compute_variable_group_size = true;
    }

    if (can_do_predicate_writes(brw->screen)) {