turnip: Skip load/stores for tiles with no geometry

When HW binning is used, tile loads/stores can be skipped
if there is no geometry in the tile.

Loads can be skipped when:
- The attachment won't be resolved; otherwise, skipping the load
  would leave holes in the resolved attachment;
- There is no vkCmdClearAttachments afterwards, since such a clear is
  likely partial and done via a 2d blit (a 2d blit doesn't produce
  geometry).

Stores can be skipped when:
- The attachment was not cleared, either by load_op or by
  vkCmdClearAttachments;
- The store is not a resolve.

I chose to predicate each load/store separately so that loads/stores
can still be skipped when only some attachments are cleared or
resolved; the sketch below condenses the conditions.
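
In pseudocode (condensed from the hunks below; names shortened, not
literal driver code):

    /* cond_exec_allowed is true only for the per-tile load/store
     * paths, and only when HW binning is used. */
    bool load_cond_exec  = cond_exec_allowed &&
                           !att->clear_mask &&                /* no load_op clear */
                           !state->attachment_cmd_clear[a] && /* no vkCmdClearAttachments */
                           !att->will_be_resolved;
    bool store_cond_exec = cond_exec_allowed &&
                           !att->clear_mask &&
                           !state->attachment_cmd_clear[a];
    /* Resolve call sites simply pass cond_exec_allowed = false. */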

GMEM loads are moved into a separate cs because whether to emit
CP_COND_REG_EXEC depends on HW binning being enabled and on whether
vkCmdClearAttachments is used, which is only known at the end of the
render pass.

The CP_COND_REG_EXEC predicate can be changed during draw_cs only by
a perf query; in that case the predicate has to be re-emitted.
(At the moment it is always re-emitted before the stores.)
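
Roughly, the resulting per-tile stream looks like this (a sketch, not
literal packet output):

    CP_REG_TEST(VSC_STATE_REG(pipe), bit=slot) ; predicate = "tile has geometry"
    tile_load_cs    ; each predicated load wrapped in CP_COND_REG_EXEC (PRED_TEST)
    draw_cs         ; perf queries may overwrite the predicate here
    CP_REG_TEST(...)  ; re-emitted (currently unconditionally) before stores
    tile_store_cs   ; each predicated store wrapped in CP_COND_REG_EXEC (PRED_TEST)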

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15974>


@@ -2280,6 +2280,8 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
}
}
cmd->state.attachment_cmd_clear[a] = true;
}
/* We may not know the multisample count if there are no attachments, so
@@ -2551,6 +2553,8 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
if (a == VK_ATTACHMENT_UNUSED)
continue;
cmd->state.attachment_cmd_clear[a] = true;
tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
&attachments[j].clearValue);
}
@@ -2799,24 +2803,64 @@ blit_can_resolve(VkFormat format)
return true;
}
static void
tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
struct tu_cs *cs, bool load)
{
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
}
static void
tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
struct tu_cs *cs, bool load)
{
tu_cond_exec_end(cs);
}
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
bool cond_exec_allowed,
bool force_load)
{
const struct tu_image_view *iview = cmd->state.attachments[a];
const struct tu_render_pass_attachment *attachment =
&cmd->state.pass->attachments[a];
bool load_common = attachment->load || force_load;
bool load_stencil =
attachment->load_stencil ||
(attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
if (!load_common && !load_stencil)
return;
trace_start_gmem_load(&cmd->trace, cs);
if (attachment->load || force_load)
/* If the attachment will be cleared by vkCmdClearAttachments, it is
* likely to be only partially cleared, and since such a clear is done
* by a 2d blit it doesn't produce geometry, so we have to load
* unconditionally.
*
* To simplify the conditions, treat a partially cleared separate DS
* as fully cleared and don't emit cond_exec.
*/
bool cond_exec = cond_exec_allowed &&
!attachment->clear_mask &&
!cmd->state.attachment_cmd_clear[a] &&
!attachment->will_be_resolved;
if (cond_exec)
tu_begin_load_store_cond_exec(cmd, cs, true);
if (load_common)
tu_emit_blit(cmd, cs, iview, attachment, false, false);
if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
if (load_stencil)
tu_emit_blit(cmd, cs, iview, attachment, false, true);
if (cond_exec)
tu_end_load_store_cond_exec(cmd, cs, true);
trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
}
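
tu_begin_load_store_cond_exec/tu_end_load_store_cond_exec above are thin
wrappers around tu_cond_exec_start/tu_cond_exec_end (defined elsewhere in
turnip, not in this diff). A sketch of the bracketing effect, assuming
the usual CP_COND_REG_EXEC behavior:

    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
    /* emits CP_COND_REG_EXEC with a placeholder dword count */
    ... load/store blit packets ...
    tu_cond_exec_end(cs);
    /* patches the count; the CP skips that many dwords when the
     * predicate set by the preceding CP_REG_TEST is false */
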
@@ -2919,7 +2963,8 @@ void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
uint32_t gmem_a)
uint32_t gmem_a,
bool cond_exec_allowed)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const VkRect2D *render_area = &cmd->state.render_area;
@@ -2930,6 +2975,15 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
if (!dst->store && !dst->store_stencil)
return;
bool was_cleared = src->clear_mask || cmd->state.attachment_cmd_clear[a];
/* An unconditional store should happen only if the attachment was
* cleared, either by load_op or via vkCmdClearAttachments.
*/
bool cond_exec = cond_exec_allowed && !was_cleared;
if (cond_exec) {
tu_begin_load_store_cond_exec(cmd, cs, false);
}
uint32_t x1 = render_area->offset.x;
uint32_t y1 = render_area->offset.y;
uint32_t x2 = x1 + render_area->extent.width;
@@ -2971,6 +3025,10 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
if (store_separate_stencil)
tu_emit_blit(cmd, cs, iview, src, true, true);
if (cond_exec) {
tu_end_load_store_cond_exec(cmd, cs, false);
}
trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
return;
}
@@ -3011,5 +3069,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
}
}
if (cond_exec) {
tu_end_load_store_cond_exec(cmd, cs, false);
}
trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
}
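
For reference, the call sites introduced later in this commit; only the
per-tile paths allow predication:

    tu_load_gmem_attachment(cmd, cs, i, use_hw_binning(cmd), false);  /* tile_load_cs */
    tu_store_gmem_attachment(cmd, cs, a, a, use_hw_binning(cmd));     /* tile_store_cs */
    tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);              /* resolve stores */
    tu_load_gmem_attachment(cmd, cs, a, false, true);                 /* forced load for the
                                                                         GMEM->GMEM resolve workaround */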


@@ -632,6 +632,25 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
return use_sysmem;
}
/* Optimization: there is no reason to load GMEM if there is no
* geometry to process. The COND_REG_EXEC predicate is set here,
* but the actual skip happens in tile_load_cs and tile_store_cs,
* separately for each blit.
*/
static void
tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
uint32_t pipe, uint32_t slot, bool wfm)
{
if (use_hw_binning(cmd)) {
tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) |
A6XX_CP_REG_TEST_0_BIT(slot) |
COND(wfm, A6XX_CP_REG_TEST_0_WAIT_FOR_ME));
} else {
/* COND_REG_EXECs are not emitted in non-binning case */
}
}
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
@@ -664,6 +683,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu_cs_emit(cs, pipe * 4);
tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);
tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, true);
tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
tu_cs_emit(cs, 0x0);
@@ -740,6 +761,15 @@ tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
}
}
static void
tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
tu6_emit_blit_scissor(cmd, cs, true);
for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
tu_load_gmem_attachment(cmd, cs, i, use_hw_binning(cmd), false);
}
static void
tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
@@ -756,7 +786,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
for (uint32_t a = 0; a < pass->attachment_count; ++a) {
if (pass->attachments[a].gmem_offset >= 0)
tu_store_gmem_attachment(cmd, cs, a, a);
tu_store_gmem_attachment(cmd, cs, a, a, use_hw_binning(cmd));
}
if (subpass->resolve_attachments) {
@@ -764,7 +794,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
uint32_t a = subpass->resolve_attachments[i].attachment;
if (a != VK_ATTACHMENT_UNUSED) {
uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
tu_store_gmem_attachment(cmd, cs, a, gmem_a);
tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);
}
}
}
@@ -1220,11 +1250,6 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
tu6_emit_blit_scissor(cmd, cs, true);
for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
tu_load_gmem_attachment(cmd, cs, i, false);
tu6_emit_blit_scissor(cmd, cs, false);
for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
@@ -1356,8 +1381,10 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
}
static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
uint32_t pipe, uint32_t slot)
{
tu_cs_emit_call(cs, &cmd->tile_load_cs);
tu_cs_emit_call(cs, &cmd->draw_cs);
if (use_hw_binning(cmd)) {
@@ -1365,6 +1392,10 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
}
/* Predicate is changed in draw_cs so we have to re-emit it */
if (cmd->state.draw_cs_writes_to_cond_pred)
tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
tu_cs_emit_call(cs, &cmd->tile_store_cs);
if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) {
@@ -1418,7 +1449,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
tu6_render_tile(cmd, &cmd->cs);
tu6_render_tile(cmd, &cmd->cs, pipe, slot);
trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs);
}
}
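
Note the wfm argument to tu6_emit_cond_for_load_stores:
tu6_emit_tile_select passes true, while the re-emission after draw_cs
passes false. Presumably the first CP_REG_TEST of a tile needs
WAIT_FOR_ME so the CP has caught up with the preceding register writes
before testing VSC state, and by the re-emission point that wait has
already happened:

    tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, true);   /* in tu6_emit_tile_select */
    tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);  /* re-emit before tile_store_cs */
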
@@ -1491,6 +1522,7 @@ tu_create_cmd_buffer(struct tu_device *device,
list_inithead(&cmd_buffer->renderpass_autotune_results);
tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->tile_load_cs, device, TU_CS_MODE_GROW, 2048);
tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
@@ -1507,11 +1539,14 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
list_del(&cmd_buffer->pool_link);
tu_cs_finish(&cmd_buffer->cs);
tu_cs_finish(&cmd_buffer->tile_load_cs);
tu_cs_finish(&cmd_buffer->draw_cs);
tu_cs_finish(&cmd_buffer->tile_store_cs);
tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
tu_cs_finish(&cmd_buffer->sub_cs);
vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear);
u_trace_fini(&cmd_buffer->trace);
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
@@ -1535,11 +1570,15 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->record_result = VK_SUCCESS;
tu_cs_reset(&cmd_buffer->cs);
tu_cs_reset(&cmd_buffer->tile_load_cs);
tu_cs_reset(&cmd_buffer->draw_cs);
tu_cs_reset(&cmd_buffer->tile_store_cs);
tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
tu_cs_reset(&cmd_buffer->sub_cs);
vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear);
cmd_buffer->state.attachment_cmd_clear = NULL;
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
@@ -1678,6 +1717,7 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
cmd_buffer->usage_flags = pBeginInfo->flags;
tu_cs_begin(&cmd_buffer->cs);
tu_cs_begin(&cmd_buffer->tile_load_cs);
tu_cs_begin(&cmd_buffer->draw_cs);
tu_cs_begin(&cmd_buffer->tile_store_cs);
tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
@@ -1710,6 +1750,14 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
cmd_buffer->state.subpass =
&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
/* vkCmdClearAttachments is allowed in a secondary cmdbuf, and we have
* to track it the same way as in a primary cmdbuf.
*/
cmd_buffer->state.attachment_cmd_clear =
vk_zalloc(&cmd_buffer->pool->vk.alloc,
cmd_buffer->state.pass->attachment_count *
sizeof(cmd_buffer->state.attachment_cmd_clear[0]),
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
} else {
/* When executing in the middle of another command buffer, the CCU
* state is unknown.
@@ -2245,6 +2293,7 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
}
tu_cs_end(&cmd_buffer->cs);
tu_cs_end(&cmd_buffer->tile_load_cs);
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->tile_store_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
@@ -3061,7 +3110,7 @@ vk2tu_src_stage(VkPipelineStageFlags vk_stages)
{
enum tu_stage stage = TU_STAGE_CP;
u_foreach_bit (bit, vk_stages) {
enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
stage = MAX2(stage, new_stage);
}
@@ -3073,7 +3122,7 @@ vk2tu_dst_stage(VkPipelineStageFlags vk_stages)
{
enum tu_stage stage = TU_STAGE_PS;
u_foreach_bit (bit, vk_stages) {
enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
stage = MIN2(stage, new_stage);
}
@@ -3130,6 +3179,14 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
cmd->state.has_subpass_predication = true;
if (secondary->state.disable_gmem)
cmd->state.disable_gmem = true;
cmd->state.draw_cs_writes_to_cond_pred |=
secondary->state.draw_cs_writes_to_cond_pred;
for (uint32_t i = 0; i < cmd->state.pass->attachment_count; i++) {
cmd->state.attachment_cmd_clear[i] |=
secondary->state.attachment_cmd_clear[i];
}
} else {
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@@ -3307,6 +3364,18 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
return;
}
cmd->state.attachment_cmd_clear =
vk_zalloc(&cmd->pool->vk.alloc, pass->attachment_count *
sizeof(cmd->state.attachment_cmd_clear[0]), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!cmd->state.attachment_cmd_clear) {
cmd->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
return;
}
cmd->state.draw_cs_writes_to_cond_pred = false;
for (unsigned i = 0; i < pass->attachment_count; i++) {
cmd->state.attachments[i] = pAttachmentInfo ?
tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
@@ -3400,7 +3469,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
tu_store_gmem_attachment(cmd, cs, a, gmem_a);
tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);
if (pass->attachments[a].gmem_offset < 0)
continue;
@@ -3410,7 +3479,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
* if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
*/
tu_finishme("missing GMEM->GMEM resolve path\n");
tu_load_gmem_attachment(cmd, cs, a, true);
tu_load_gmem_attachment(cmd, cs, a, false, true);
}
}
@@ -4627,8 +4696,15 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
/* GMEM loads are emitted after draw_cs into a separate cs because
* whether to allow their conditional execution depends on state that
* is known only at the end of the renderpass.
*/
tu6_emit_tile_load(cmd_buffer, &cmd_buffer->tile_load_cs);
tu6_emit_tile_store(cmd_buffer, &cmd_buffer->tile_store_cs);
tu_cs_end(&cmd_buffer->tile_load_cs);
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->tile_store_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
@@ -4649,6 +4725,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
/* discard draw_cs and draw_epilogue_cs entries now that the tiles are
rendered */
tu_cs_discard_entries(&cmd_buffer->tile_load_cs);
tu_cs_begin(&cmd_buffer->tile_load_cs);
tu_cs_discard_entries(&cmd_buffer->draw_cs);
tu_cs_begin(&cmd_buffer->draw_cs);
tu_cs_discard_entries(&cmd_buffer->tile_store_cs);
@@ -4661,6 +4739,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments);
vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear);
cmd_buffer->state.attachment_cmd_clear = NULL;
cmd_buffer->state.pass = NULL;
cmd_buffer->state.subpass = NULL;


@@ -800,6 +800,12 @@ tu_CreateRenderPass2(VkDevice _device,
for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
subpass->resolve_attachments[j].attachment =
desc->pResolveAttachments[j].attachment;
uint32_t src_a = desc->pColorAttachments[j].attachment;
if (src_a != VK_ATTACHMENT_UNUSED) {
pass->attachments[src_a].will_be_resolved =
desc->pResolveAttachments[j].attachment != VK_ATTACHMENT_UNUSED;
}
}
}
@@ -808,6 +814,11 @@ tu_CreateRenderPass2(VkDevice _device,
subpass->resolve_count++;
uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
subpass->resolve_attachments[subpass->resolve_count - 1].attachment = a;
uint32_t src_a = desc->pDepthStencilAttachment->attachment;
if (src_a != VK_ATTACHMENT_UNUSED) {
pass->attachments[src_a].will_be_resolved = a != VK_ATTACHMENT_UNUSED;
}
}
uint32_t a = desc->pDepthStencilAttachment ?
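
To illustrate will_be_resolved: a hypothetical subpass where MSAA color
attachment 0 resolves into attachment 1; the driver then marks
attachment 0 as will_be_resolved so its GMEM load is never skipped,
even for tiles without geometry (plain Vulkan setup, not part of this
diff):

    VkAttachmentReference2 color_ref = {
       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
       .attachment = 0, /* MSAA color: gets will_be_resolved = true */
       .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
    };
    VkAttachmentReference2 resolve_ref = {
       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
       .attachment = 1, /* single-sample resolve destination */
       .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
    };
    VkSubpassDescription2 subpass = {
       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
       .colorAttachmentCount = 1,
       .pColorAttachments = &color_ref,
       .pResolveAttachments = &resolve_ref,
    };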


@@ -1196,6 +1196,10 @@ struct tu_cmd_state
VkRect2D render_area;
const struct tu_image_view **attachments;
/* Tracks whether attachment was cleared by vkCmdClearAttachments */
bool *attachment_cmd_clear;
/* Tracks whether the COND_REG_EXEC conditional predicate is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
bool xfb_used;
bool has_tess;
@@ -1290,6 +1294,7 @@ struct tu_cmd_buffer
VkResult record_result;
struct tu_cs cs;
struct tu_cs tile_load_cs;
struct tu_cs draw_cs;
struct tu_cs tile_store_cs;
struct tu_cs draw_epilogue_cs;
@@ -1576,6 +1581,7 @@ void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
bool cond_exec_allowed,
bool force_load);
/* expose this function to be able to emit load without checking LOAD_OP */
@@ -1587,7 +1593,8 @@ void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
uint32_t gmem_a);
uint32_t gmem_a,
bool cond_exec_allowed);
enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
@@ -1857,6 +1864,7 @@ struct tu_render_pass_attachment
bool load;
bool store;
int32_t gmem_offset;
bool will_be_resolved;
/* for D32S8 separate stencil: */
bool load_stencil;
bool store_stencil;


@@ -874,6 +874,10 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
uint32_t last_pass = ~0;
if (cmdbuf->state.pass) {
cmdbuf->state.draw_cs_writes_to_cond_pred = true;
}
/* Querying perf counters happens in these steps:
*
* 0) There's a scratch reg to set a pass index for perf counters query.