From 90a39cac87f415375a70e1cb2f7ba2c486f941e4 Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Fri, 26 Oct 2018 12:52:44 -0700 Subject: [PATCH] intel/blorp: Emit compute program based on BLORP_BATCH_USE_COMPUTE Reworks: * Don't pack params, just memcpy param struct (s-b Jason) * Old subject: "intel/blorp: Emit compute program if params.cs_prog_data is set" * Various cleanups of push-const size/alignment (s-b Jason) * Fix subslice count by moving to devinfo (s-b Ken) * Simplify cw.InterfaceDescriptor code (s-b Ken) * Drop some comments from i965 (s-b Ken) Signed-off-by: Jordan Justen Reviewed-by: Jason Ekstrand Part-of: --- src/intel/blorp/blorp.h | 5 + src/intel/blorp/blorp_genX_exec.h | 269 ++++++++++++++++++++++++++++-- 2 files changed, 264 insertions(+), 10 deletions(-) diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h index 4b5d5950b59..1e40b4191ae 100644 --- a/src/intel/blorp/blorp.h +++ b/src/intel/blorp/blorp.h @@ -78,6 +78,11 @@ enum blorp_batch_flags { * color buffer. */ BLORP_BATCH_NO_UPDATE_CLEAR_COLOR = (1 << 2), + + /* This flag indicates that blorp should use a compute program for the + * operation. + */ + BLORP_BATCH_USE_COMPUTE = (1 << 3), }; struct blorp_batch { diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h index 6bb0b63e474..9cb89224466 100644 --- a/src/intel/blorp/blorp_genX_exec.h +++ b/src/intel/blorp/blorp_genX_exec.h @@ -1970,17 +1970,8 @@ blorp_update_clear_color(UNUSED struct blorp_batch *batch, } } -/** - * \brief Execute a blit or render pass operation. - * - * To execute the operation, this function manually constructs and emits a - * batch to draw a rectangle primitive. The batchbuffer is flushed before - * constructing and after emitting the batch. - * - * This function alters no GL state. - */ static void -blorp_exec(struct blorp_batch *batch, const struct blorp_params *params) +blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params) { if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) { blorp_update_clear_color(batch, ¶ms->dst, params->fast_clear_op); @@ -2017,4 +2008,262 @@ blorp_exec(struct blorp_batch *batch, const struct blorp_params *params) } } +#if GFX_VER >= 7 + +static void +blorp_get_compute_push_const(struct blorp_batch *batch, + const struct blorp_params *params, + uint32_t threads, + uint32_t *state_offset, + unsigned *state_size) +{ + const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data; + const unsigned push_const_size = + ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64); + assert(cs_prog_data->push.cross_thread.size + + cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs)); + + if (push_const_size == 0) { + *state_offset = 0; + *state_size = 0; + return; + } + + uint32_t push_const_offset; + uint32_t *push_const = + GFX_VERx10 >= 125 ? + blorp_alloc_general_state(batch, push_const_size, 64, + &push_const_offset) : + blorp_alloc_dynamic_state(batch, push_const_size, 64, + &push_const_offset); + memset(push_const, 0x0, push_const_size); + + void *dst = push_const; + const void *src = (char *)¶ms->wm_inputs; + + if (cs_prog_data->push.cross_thread.size > 0) { + memcpy(dst, src, cs_prog_data->push.cross_thread.size); + dst += cs_prog_data->push.cross_thread.size; + src += cs_prog_data->push.cross_thread.size; + } + + assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0); +#if GFX_VERx10 < 125 + if (cs_prog_data->push.per_thread.size > 0) { + for (unsigned t = 0; t < threads; t++) { + memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4); + + uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4; + *subgroup_id = t; + + dst += cs_prog_data->push.per_thread.size; + } + } +#endif + + *state_offset = push_const_offset; + *state_size = push_const_size; +} + +#endif /* GFX_VER >= 7 */ + +static void +blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) +{ + assert(!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)); + assert(params->hiz_op == ISL_AUX_OP_NONE); + +#if GFX_VER >= 7 + + const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data; + const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(batch->blorp->compiler->devinfo, cs_prog_data, + NULL); + const struct intel_device_info *devinfo = batch->blorp->compiler->devinfo; + + uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0]; + uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1]; + uint32_t group_z0 = params->dst.z_offset; + uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]); + uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]); + assert(params->num_layers >= 1); + uint32_t group_z1 = params->dst.z_offset + params->num_layers; + assert(cs_prog_data->local_size[2] == 1); + +#endif /* GFX_VER >= 7 */ + +#if GFX_VERx10 >= 125 + + blorp_emit(batch, GENX(CFE_STATE), cfe) { + cfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total - 1; + } + + assert(cs_prog_data->push.per_thread.regs == 0); + blorp_emit(batch, GENX(COMPUTE_WALKER), cw) { + cw.SIMDSize = dispatch.simd_size / 16; + cw.LocalXMaximum = cs_prog_data->local_size[0] - 1; + cw.LocalYMaximum = cs_prog_data->local_size[1] - 1; + cw.LocalZMaximum = cs_prog_data->local_size[2] - 1; + cw.ThreadGroupIDStartingX = group_x0; + cw.ThreadGroupIDStartingY = group_y0; + cw.ThreadGroupIDStartingZ = group_z0; + cw.ThreadGroupIDXDimension = group_x1; + cw.ThreadGroupIDYDimension = group_y1; + cw.ThreadGroupIDZDimension = group_z1; + cw.ExecutionMask = 0xffffffff; + + uint32_t surfaces_offset = blorp_setup_binding_table(batch, params); + + uint32_t samplers_offset = + params->src.enabled ? blorp_emit_sampler_state(batch) : 0; + + uint32_t push_const_offset; + unsigned push_const_size; + blorp_get_compute_push_const(batch, params, dispatch.threads, + &push_const_offset, &push_const_size); + cw.IndirectDataStartAddress = push_const_offset; + cw.IndirectDataLength = push_const_size; + + cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .KernelStartPointer = params->cs_prog_kernel, + .SamplerStatePointer = samplers_offset, + .SamplerCount = params->src.enabled ? 1 : 0, + .BindingTableEntryCount = params->src.enabled ? 2 : 1, + .BindingTablePointer = surfaces_offset, + .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, + .SharedLocalMemorySize = + encode_slm_size(GFX_VER, prog_data->total_shared), + .NumberOfBarriers = cs_prog_data->uses_barrier, + }; + } + +#elif GFX_VER >= 7 + + /* The MEDIA_VFE_STATE documentation for Gfx8+ says: + * + * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless + * the only bits that are changed are scoreboard related: Scoreboard + * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For + * these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient." + * + * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL", + * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL. + */ + blorp_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) { + assert(prog_data->total_scratch == 0); + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total - 1; + vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0; +#if GFX_VER < 11 + vfe.ResetGatewayTimer = + Resettingrelativetimerandlatchingtheglobaltimestamp; +#endif +#if GFX_VER < 9 + vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol; +#endif +#if GFX_VER == 7 + vfe.GPGPUMode = 1; +#endif + vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0; + + const uint32_t vfe_curbe_allocation = + ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads + + cs_prog_data->push.cross_thread.regs, 2); + vfe.CURBEAllocationSize = vfe_curbe_allocation; + } + + uint32_t push_const_offset; + unsigned push_const_size; + blorp_get_compute_push_const(batch, params, dispatch.threads, + &push_const_offset, &push_const_size); + + blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) { + curbe.CURBETotalDataLength = push_const_size; + curbe.CURBEDataStartAddress = push_const_offset; + } + + uint32_t surfaces_offset = blorp_setup_binding_table(batch, params); + + uint32_t samplers_offset = + params->src.enabled ? blorp_emit_sampler_state(batch) : 0; + + struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = { + .KernelStartPointer = params->cs_prog_kernel, + .SamplerStatePointer = samplers_offset, + .SamplerCount = params->src.enabled ? 1 : 0, + .BindingTableEntryCount = params->src.enabled ? 2 : 1, + .BindingTablePointer = surfaces_offset, + .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, + .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, + .SharedLocalMemorySize = encode_slm_size(GFX_VER, + prog_data->total_shared), + .BarrierEnable = cs_prog_data->uses_barrier, +#if GFX_VER >= 8 || GEN_IS_HASWELL + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs, +#endif + }; + + uint32_t idd_offset; + uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset); + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd); + + blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { + mid.InterfaceDescriptorTotalLength = size; + mid.InterfaceDescriptorDataStartAddress = idd_offset; + } + + blorp_emit(batch, GENX(GPGPU_WALKER), ggw) { + ggw.SIMDSize = dispatch.simd_size / 16; + ggw.ThreadDepthCounterMaximum = 0; + ggw.ThreadHeightCounterMaximum = 0; + ggw.ThreadWidthCounterMaximum = dispatch.threads - 1; + ggw.ThreadGroupIDStartingX = group_x0; + ggw.ThreadGroupIDStartingY = group_y0; +#if GFX_VER >= 8 + ggw.ThreadGroupIDStartingResumeZ = group_z0; +#else + ggw.ThreadGroupIDStartingZ = group_z0; +#endif + ggw.ThreadGroupIDXDimension = group_x1; + ggw.ThreadGroupIDYDimension = group_y1; + ggw.ThreadGroupIDZDimension = group_z1; + ggw.RightExecutionMask = dispatch.right_mask; + ggw.BottomExecutionMask = 0xffffffff; + } + +#else /* GFX_VER >= 7 */ + + unreachable("Compute blorp is not supported on SNB and earlier"); + +#endif /* GFX_VER >= 7 */ +} + + +/** + * \brief Execute a blit or render pass operation. + * + * To execute the operation, this function manually constructs and emits a + * batch to draw a rectangle primitive. The batchbuffer is flushed before + * constructing and after emitting the batch. + * + * This function alters no GL state. + */ +static void +blorp_exec(struct blorp_batch *batch, const struct blorp_params *params) +{ + if (batch->flags & BLORP_BATCH_USE_COMPUTE) + blorp_exec_compute(batch, params); + else + blorp_exec_3d(batch, params); +} + #endif /* BLORP_GENX_EXEC_H */