intel/blorp: Emit compute program based on BLORP_BATCH_USE_COMPUTE
Reworks: * Don't pack params, just memcpy param struct (s-b Jason) * Old subject: "intel/blorp: Emit compute program if params.cs_prog_data is set" * Various cleanups of push-const size/alignment (s-b Jason) * Fix subslice count by moving to devinfo (s-b Ken) * Simplify cw.InterfaceDescriptor code (s-b Ken) * Drop some comments from i965 (s-b Ken) Signed-off-by: Jordan Justen <jordan.l.justen@intel.com> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11564>
This commit is contained in:
parent
3e13c4ccf2
commit
90a39cac87
|
@ -78,6 +78,11 @@ enum blorp_batch_flags {
|
|||
* color buffer.
|
||||
*/
|
||||
BLORP_BATCH_NO_UPDATE_CLEAR_COLOR = (1 << 2),
|
||||
|
||||
/* This flag indicates that blorp should use a compute program for the
|
||||
* operation.
|
||||
*/
|
||||
BLORP_BATCH_USE_COMPUTE = (1 << 3),
|
||||
};
|
||||
|
||||
struct blorp_batch {
|
||||
|
|
|
@ -1970,17 +1970,8 @@ blorp_update_clear_color(UNUSED struct blorp_batch *batch,
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Execute a blit or render pass operation.
|
||||
*
|
||||
* To execute the operation, this function manually constructs and emits a
|
||||
* batch to draw a rectangle primitive. The batchbuffer is flushed before
|
||||
* constructing and after emitting the batch.
|
||||
*
|
||||
* This function alters no GL state.
|
||||
*/
|
||||
static void
|
||||
blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
|
||||
blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
|
||||
{
|
||||
if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
|
||||
blorp_update_clear_color(batch, ¶ms->dst, params->fast_clear_op);
|
||||
|
@ -2017,4 +2008,262 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
|
|||
}
|
||||
}
|
||||
|
||||
#if GFX_VER >= 7
|
||||
|
||||
static void
|
||||
blorp_get_compute_push_const(struct blorp_batch *batch,
|
||||
const struct blorp_params *params,
|
||||
uint32_t threads,
|
||||
uint32_t *state_offset,
|
||||
unsigned *state_size)
|
||||
{
|
||||
const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
|
||||
const unsigned push_const_size =
|
||||
ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
|
||||
assert(cs_prog_data->push.cross_thread.size +
|
||||
cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));
|
||||
|
||||
if (push_const_size == 0) {
|
||||
*state_offset = 0;
|
||||
*state_size = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t push_const_offset;
|
||||
uint32_t *push_const =
|
||||
GFX_VERx10 >= 125 ?
|
||||
blorp_alloc_general_state(batch, push_const_size, 64,
|
||||
&push_const_offset) :
|
||||
blorp_alloc_dynamic_state(batch, push_const_size, 64,
|
||||
&push_const_offset);
|
||||
memset(push_const, 0x0, push_const_size);
|
||||
|
||||
void *dst = push_const;
|
||||
const void *src = (char *)¶ms->wm_inputs;
|
||||
|
||||
if (cs_prog_data->push.cross_thread.size > 0) {
|
||||
memcpy(dst, src, cs_prog_data->push.cross_thread.size);
|
||||
dst += cs_prog_data->push.cross_thread.size;
|
||||
src += cs_prog_data->push.cross_thread.size;
|
||||
}
|
||||
|
||||
assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0);
|
||||
#if GFX_VERx10 < 125
|
||||
if (cs_prog_data->push.per_thread.size > 0) {
|
||||
for (unsigned t = 0; t < threads; t++) {
|
||||
memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);
|
||||
|
||||
uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
|
||||
*subgroup_id = t;
|
||||
|
||||
dst += cs_prog_data->push.per_thread.size;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
*state_offset = push_const_offset;
|
||||
*state_size = push_const_size;
|
||||
}
|
||||
|
||||
#endif /* GFX_VER >= 7 */
|
||||
|
||||
static void
|
||||
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
|
||||
{
|
||||
assert(!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR));
|
||||
assert(params->hiz_op == ISL_AUX_OP_NONE);
|
||||
|
||||
#if GFX_VER >= 7
|
||||
|
||||
const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
|
||||
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
|
||||
const struct brw_cs_dispatch_info dispatch =
|
||||
brw_cs_get_dispatch_info(batch->blorp->compiler->devinfo, cs_prog_data,
|
||||
NULL);
|
||||
const struct intel_device_info *devinfo = batch->blorp->compiler->devinfo;
|
||||
|
||||
uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
|
||||
uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
|
||||
uint32_t group_z0 = params->dst.z_offset;
|
||||
uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
|
||||
uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
|
||||
assert(params->num_layers >= 1);
|
||||
uint32_t group_z1 = params->dst.z_offset + params->num_layers;
|
||||
assert(cs_prog_data->local_size[2] == 1);
|
||||
|
||||
#endif /* GFX_VER >= 7 */
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
|
||||
blorp_emit(batch, GENX(CFE_STATE), cfe) {
|
||||
cfe.MaximumNumberofThreads =
|
||||
devinfo->max_cs_threads * devinfo->subslice_total - 1;
|
||||
}
|
||||
|
||||
assert(cs_prog_data->push.per_thread.regs == 0);
|
||||
blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
|
||||
cw.SIMDSize = dispatch.simd_size / 16;
|
||||
cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
|
||||
cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
|
||||
cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
|
||||
cw.ThreadGroupIDStartingX = group_x0;
|
||||
cw.ThreadGroupIDStartingY = group_y0;
|
||||
cw.ThreadGroupIDStartingZ = group_z0;
|
||||
cw.ThreadGroupIDXDimension = group_x1;
|
||||
cw.ThreadGroupIDYDimension = group_y1;
|
||||
cw.ThreadGroupIDZDimension = group_z1;
|
||||
cw.ExecutionMask = 0xffffffff;
|
||||
|
||||
uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);
|
||||
|
||||
uint32_t samplers_offset =
|
||||
params->src.enabled ? blorp_emit_sampler_state(batch) : 0;
|
||||
|
||||
uint32_t push_const_offset;
|
||||
unsigned push_const_size;
|
||||
blorp_get_compute_push_const(batch, params, dispatch.threads,
|
||||
&push_const_offset, &push_const_size);
|
||||
cw.IndirectDataStartAddress = push_const_offset;
|
||||
cw.IndirectDataLength = push_const_size;
|
||||
|
||||
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
|
||||
.KernelStartPointer = params->cs_prog_kernel,
|
||||
.SamplerStatePointer = samplers_offset,
|
||||
.SamplerCount = params->src.enabled ? 1 : 0,
|
||||
.BindingTableEntryCount = params->src.enabled ? 2 : 1,
|
||||
.BindingTablePointer = surfaces_offset,
|
||||
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
|
||||
.SharedLocalMemorySize =
|
||||
encode_slm_size(GFX_VER, prog_data->total_shared),
|
||||
.NumberOfBarriers = cs_prog_data->uses_barrier,
|
||||
};
|
||||
}
|
||||
|
||||
#elif GFX_VER >= 7
|
||||
|
||||
/* The MEDIA_VFE_STATE documentation for Gfx8+ says:
|
||||
*
|
||||
* "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
|
||||
* the only bits that are changed are scoreboard related: Scoreboard
|
||||
* Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
|
||||
* these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
|
||||
*
|
||||
* Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
|
||||
* but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
|
||||
*/
|
||||
blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
|
||||
pc.CommandStreamerStallEnable = true;
|
||||
pc.StallAtPixelScoreboard = true;
|
||||
}
|
||||
|
||||
blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
|
||||
assert(prog_data->total_scratch == 0);
|
||||
vfe.MaximumNumberofThreads =
|
||||
devinfo->max_cs_threads * devinfo->subslice_total - 1;
|
||||
vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0;
|
||||
#if GFX_VER < 11
|
||||
vfe.ResetGatewayTimer =
|
||||
Resettingrelativetimerandlatchingtheglobaltimestamp;
|
||||
#endif
|
||||
#if GFX_VER < 9
|
||||
vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
|
||||
#endif
|
||||
#if GFX_VER == 7
|
||||
vfe.GPGPUMode = 1;
|
||||
#endif
|
||||
vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0;
|
||||
|
||||
const uint32_t vfe_curbe_allocation =
|
||||
ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
|
||||
cs_prog_data->push.cross_thread.regs, 2);
|
||||
vfe.CURBEAllocationSize = vfe_curbe_allocation;
|
||||
}
|
||||
|
||||
uint32_t push_const_offset;
|
||||
unsigned push_const_size;
|
||||
blorp_get_compute_push_const(batch, params, dispatch.threads,
|
||||
&push_const_offset, &push_const_size);
|
||||
|
||||
blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
|
||||
curbe.CURBETotalDataLength = push_const_size;
|
||||
curbe.CURBEDataStartAddress = push_const_offset;
|
||||
}
|
||||
|
||||
uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);
|
||||
|
||||
uint32_t samplers_offset =
|
||||
params->src.enabled ? blorp_emit_sampler_state(batch) : 0;
|
||||
|
||||
struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
|
||||
.KernelStartPointer = params->cs_prog_kernel,
|
||||
.SamplerStatePointer = samplers_offset,
|
||||
.SamplerCount = params->src.enabled ? 1 : 0,
|
||||
.BindingTableEntryCount = params->src.enabled ? 2 : 1,
|
||||
.BindingTablePointer = surfaces_offset,
|
||||
.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
|
||||
.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
|
||||
.SharedLocalMemorySize = encode_slm_size(GFX_VER,
|
||||
prog_data->total_shared),
|
||||
.BarrierEnable = cs_prog_data->uses_barrier,
|
||||
#if GFX_VER >= 8 || GEN_IS_HASWELL
|
||||
.CrossThreadConstantDataReadLength =
|
||||
cs_prog_data->push.cross_thread.regs,
|
||||
#endif
|
||||
};
|
||||
|
||||
uint32_t idd_offset;
|
||||
uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
|
||||
void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
|
||||
GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);
|
||||
|
||||
blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
|
||||
mid.InterfaceDescriptorTotalLength = size;
|
||||
mid.InterfaceDescriptorDataStartAddress = idd_offset;
|
||||
}
|
||||
|
||||
blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
|
||||
ggw.SIMDSize = dispatch.simd_size / 16;
|
||||
ggw.ThreadDepthCounterMaximum = 0;
|
||||
ggw.ThreadHeightCounterMaximum = 0;
|
||||
ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
|
||||
ggw.ThreadGroupIDStartingX = group_x0;
|
||||
ggw.ThreadGroupIDStartingY = group_y0;
|
||||
#if GFX_VER >= 8
|
||||
ggw.ThreadGroupIDStartingResumeZ = group_z0;
|
||||
#else
|
||||
ggw.ThreadGroupIDStartingZ = group_z0;
|
||||
#endif
|
||||
ggw.ThreadGroupIDXDimension = group_x1;
|
||||
ggw.ThreadGroupIDYDimension = group_y1;
|
||||
ggw.ThreadGroupIDZDimension = group_z1;
|
||||
ggw.RightExecutionMask = dispatch.right_mask;
|
||||
ggw.BottomExecutionMask = 0xffffffff;
|
||||
}
|
||||
|
||||
#else /* GFX_VER >= 7 */
|
||||
|
||||
unreachable("Compute blorp is not supported on SNB and earlier");
|
||||
|
||||
#endif /* GFX_VER >= 7 */
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* \brief Execute a blit or render pass operation.
|
||||
*
|
||||
* To execute the operation, this function manually constructs and emits a
|
||||
* batch to draw a rectangle primitive. The batchbuffer is flushed before
|
||||
* constructing and after emitting the batch.
|
||||
*
|
||||
* This function alters no GL state.
|
||||
*/
|
||||
static void
|
||||
blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
|
||||
{
|
||||
if (batch->flags & BLORP_BATCH_USE_COMPUTE)
|
||||
blorp_exec_compute(batch, params);
|
||||
else
|
||||
blorp_exec_3d(batch, params);
|
||||
}
|
||||
|
||||
#endif /* BLORP_GENX_EXEC_H */
|
||||
|
|
Loading…
Reference in New Issue