tu: Overhaul LRZ, implement on-GPU dir tracking and LRZ fast-clear

On-GPU LRZ direction tracking allows LRZ to support secondary command
buffers, to reuse LRZ between renderpasses, and, in the future, to
support LRZ when VK_KHR_dynamic_rendering is used.

With on-GPU tracking we have to be careful to keep LRZ state in sync
with the underlying depth image, which means we should invalidate LRZ
when the underlying image is changed or when the image view differs
from the one used in the previous renderpass.
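
As an illustration (a hypothetical application-side sequence with
placeholder names, not taken from this patch), the pattern below is the
kind of thing that now forces an LRZ invalidation between renderpasses:

  /* Render to a depth attachment, populating its LRZ buffer. */
  vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE);
  /* ... draws ... */
  vkCmdEndRenderPass(cmd);

  /* The copy changes depth data behind LRZ's back, so the driver now
   * calls tu_disable_lrz() for the destination image (see
   * tu_CmdCopyImage2KHR below) and the next renderpass cannot reuse
   * the stale LRZ contents.
   */
  vkCmdCopyImage(cmd, staging_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                 depth_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                 1, &depth_copy_region);

  /* LRZ for depth_image stays invalid here until the depth attachment
   * is cleared again.
   */
  vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE);
  vkCmdEndRenderPass(cmd);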

All of this resulted in LRZ logic being thinly spread through the code,
making it hard to understand. So most of it was moved to tu_lrz.c.

For more details on past and new LRZ features see comment at the
top of tu_lrz.c.

Notes about the blob:
- The blob is much more eager to do LRZ_FLUSH: it flushes at the start
  of the renderpass, after binning, and at the end of the renderpass.
- The blob seems not to care about changes to the depth image made via
  vkCmdCopyImage.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6347

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16251>
Danylo Piliaiev 2022-05-16 13:41:02 +03:00 committed by Marge Bot
parent 70f1d70ddd
commit 4b5f0d98fd
12 changed files with 987 additions and 298 deletions


@ -133,6 +133,10 @@ struct fd_dev_info {
bool has_dp2acc;
bool has_dp4acc;
bool enable_lrz_fast_clear;
bool has_lrz_dir_tracking;
bool lrz_track_quirk;
struct {
uint32_t RB_UNKNOWN_8E04_blit;
uint32_t PC_POWER_CNTL;


@ -138,6 +138,10 @@ class A6xxGPUInfo(GPUInfo):
self.a6xx.has_cp_reg_write = True
self.a6xx.has_8bpp_ubwc = True
# All a6xx gens support lrz fast-clear, however the newer blob driver
# (v615) doesn't use it for gen1 and gen2.
self.a6xx.enable_lrz_fast_clear = True
for name, val in template.items():
if name == "magic": # handled above
continue
@ -245,6 +249,8 @@ a6xx_gen3 = dict(
has_ccu_flush_bug = True,
has_8bpp_ubwc = False,
has_dp2acc = True,
has_lrz_dir_tracking = True,
lrz_track_quirk = True,
magic = dict(
# this seems to be a chicken bit that fixes cubic filtering:
TPL1_DBG_ECO_CNTL = 0x1000000,
@ -271,6 +277,7 @@ a6xx_gen4 = dict(
has_getfiberid = True,
has_dp2acc = True,
has_dp4acc = True,
has_lrz_dir_tracking = True,
magic = dict(
TPL1_DBG_ECO_CNTL = 0x5008000,
),


@ -316,6 +316,15 @@ fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts,
A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_PITCH(ubwc_pitch) |
A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_ARRAY_PITCH(layout->ubwc_layer_size >> 2);
const struct util_format_description *format_desc =
util_format_description(args->format);
if (util_format_has_depth(format_desc)) {
view->GRAS_LRZ_DEPTH_VIEW =
A6XX_GRAS_LRZ_DEPTH_VIEW_BASE_LAYER(args->base_array_layer) |
A6XX_GRAS_LRZ_DEPTH_VIEW_LAYER_COUNT(args->layer_count) |
A6XX_GRAS_LRZ_DEPTH_VIEW_BASE_MIP_LEVEL(args->base_miplevel);
}
view->base_addr = base_addr;
view->ubwc_addr = ubwc_addr;
view->layer_size = layer_size;


@ -313,6 +313,8 @@ struct fdl6_view {
uint32_t RB_2D_DST_INFO;
uint32_t RB_BLIT_DST_INFO;
uint32_t GRAS_LRZ_DEPTH_VIEW;
};
void


@ -1785,6 +1785,11 @@ to upconvert to 32b float internally?
<bitfield name="LRZ_WRITE" pos="1" type="boolean"/>
<doc>update MAX instead of MIN value, ie. GL_GREATER/GL_GEQUAL</doc>
<bitfield name="GREATER" pos="2" type="boolean"/>
<doc>
Clears the LRZ block being touched to:
- 0.0 if GREATER
- 1.0 if LESS
</doc>
<bitfield name="FC_ENABLE" pos="3" type="boolean"/>
<!-- set when depth-test + depth-write enabled -->
<bitfield name="Z_TEST_ENABLE" pos="4" type="boolean"/>
@ -1857,6 +1862,14 @@ to upconvert to 32b float internally?
<reg32 offset="0x8109" name="GRAS_SAMPLE_CNTL">
<bitfield name="PER_SAMP_MODE" pos="0" type="boolean"/>
</reg32>
<!--
The LRZ buffer represents a single array layer + mip level, and there is
a single buffer per depth image. Thus, to reuse LRZ between renderpasses
it is necessary to track the depth view used in the previous renderpass,
which is what GRAS_LRZ_DEPTH_VIEW is for.
GRAS_LRZ_CNTL checks whether the current value of GRAS_LRZ_DEPTH_VIEW is
equal to the value stored in the LRZ buffer; if not, LRZ is disabled.
-->
<reg32 offset="0x810a" name="GRAS_LRZ_DEPTH_VIEW">
<bitfield name="BASE_LAYER" low="0" high="10" type="uint"/>
<bitfield name="LAYER_COUNT" low="16" high="26" type="uint"/>


@ -41,6 +41,7 @@ libtu_files = files(
'tu_descriptor_set.h',
'tu_formats.c',
'tu_image.c',
'tu_lrz.c',
'tu_nir_lower_multiview.c',
'tu_pass.c',
'tu_pipeline.c',


@ -1304,6 +1304,15 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd,
{
const struct blit_ops *ops = &r2d_ops;
/* It is assumed that the LRZ cache is invalidated at this point in
* order for the writes here to become visible to LRZ.
*
* LRZ writes go through the UCHE cache, so flush UCHE before changing
* LRZ via CCU. There is no need to invalidate CCU since we are presumably
* writing whole cache lines, which we assume to be 64 bytes.
*/
tu6_emit_event_write(cmd, &cmd->cs, CACHE_FLUSH_TS);
ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
VK_SAMPLE_COUNT_1_BIT);
ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value);
@ -1313,6 +1322,32 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd,
ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
ops->run(cmd, cs);
ops->teardown(cmd, cs);
/* Clearing writes via CCU color in the PS stage, and LRZ is read via
* UCHE in the earlier GRAS stage.
*/
cmd->state.cache.flush_bits |=
TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
TU_CMD_FLAG_WAIT_FOR_IDLE;
}
void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_image *image)
{
const struct blit_ops *ops = &r2d_ops;
VkClearValue clear = { .color = { .uint32[0] = 0xffffffff } };
/* The LRZ fast-clear buffer is always allocated with a size of 512 bytes. */
ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
VK_SAMPLE_COUNT_1_BIT);
ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &clear);
ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
image->iova + image->lrz_fc_offset, 512);
ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {128, 1});
ops->run(cmd, cs);
ops->teardown(cmd, cs);
}
static void
@ -1536,6 +1571,10 @@ tu_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
tu6_blit_image(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
pBlitImageInfo->filter);
}
if (dst_image->lrz_height) {
tu_disable_lrz(cmd, &cmd->cs, dst_image);
}
}
static void
@ -1640,6 +1679,10 @@ tu_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
tu_copy_buffer_to_image(cmd, src_buffer, dst_image,
pCopyBufferToImageInfo->pRegions + i);
if (dst_image->lrz_height) {
tu_disable_lrz(cmd, &cmd->cs, dst_image);
}
}
static void
@ -1954,6 +1997,10 @@ tu_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
tu_copy_image_to_image(cmd, src_image, dst_image,
pCopyImageInfo->pRegions + i);
}
if (dst_image->lrz_height) {
tu_disable_lrz(cmd, &cmd->cs, dst_image);
}
}
static void
@ -2284,6 +2331,8 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
}
tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges);
}
static void
@ -2643,8 +2692,8 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
for (uint32_t j = 0; j < attachmentCount; j++) {
if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
continue;
cmd->state.lrz.valid = false;
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
tu_lrz_disable_during_renderpass(cmd);
}
/* vkCmdClearAttachments is supposed to respect the predicate if active.


@ -218,11 +218,6 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd,
tu_cs_emit_regs(cs,
A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
tu_cs_emit_regs(cs,
A6XX_GRAS_LRZ_BUFFER_BASE(0),
A6XX_GRAS_LRZ_BUFFER_PITCH(0),
A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
return;
@ -247,10 +242,6 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
tu_cs_image_flag_ref(cs, &iview->view, 0);
tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.qword = iview->image->iova + iview->image->lrz_offset),
A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch),
A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());
if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
attachment->format == VK_FORMAT_S8_UINT) {
@ -1243,6 +1234,7 @@ tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *sub
tu_emit_input_attachments(cmd, subpass, false));
}
static void
tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
const VkRenderPassBeginInfo *info)
@ -1274,14 +1266,15 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
tu_lrz_sysmem_begin(cmd, cs);
assert(fb->width > 0 && fb->height > 0);
tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
tu6_emit_window_offset(cs, 0, 0);
tu6_emit_bin_size(cs, 0, 0,
A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM));
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM) |
A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
@ -1318,7 +1311,7 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x0);
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
tu_lrz_sysmem_end(cmd, cs);
tu_cs_sanity_check(cs);
}
@ -1329,7 +1322,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
tu_lrz_tiling_begin(cmd, cs);
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x0);
@ -1424,10 +1417,7 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
tu_cs_emit_regs(cs,
A6XX_GRAS_LRZ_CNTL(0));
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
tu_lrz_tiling_end(cmd, cs);
tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
@ -1770,9 +1760,13 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
}
if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
TU_FROM_HANDLE(tu_framebuffer, fb, pBeginInfo->pInheritanceInfo->framebuffer);
cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
cmd_buffer->state.subpass =
&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
tu_lrz_begin_secondary_cmdbuf(cmd_buffer, fb);
} else {
/* When executing in the middle of another command buffer, the CCU
* state is unknown.
@ -3359,6 +3353,11 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
cmd->state.draw_cs_writes_to_cond_pred |=
secondary->state.draw_cs_writes_to_cond_pred;
/* If LRZ was made invalid in secondary - we should disable
* LRZ retroactively for the whole renderpass.
*/
if (!secondary->state.lrz.valid)
cmd->state.lrz.valid = false;
} else {
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@ -3370,7 +3369,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
}
cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
if (cmd->state.pass) {
if (!cmd->state.lrz.gpu_dir_tracking && cmd->state.pass) {
/* After a secondary command buffer is executed, LRZ is not valid
* until it is cleared again.
*/
@ -3577,31 +3576,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
if (pass->subpasses[0].feedback_invalidate)
cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
/* Track LRZ valid state */
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED) {
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
struct tu_image *image = cmd->state.attachments[a]->image;
/* if image has lrz and it isn't a stencil-only clear: */
if (image->lrz_height &&
(att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) {
cmd->state.lrz.image = image;
cmd->state.lrz.valid = true;
cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);
/* Clearing writes via CCU color in the PS stage, and LRZ is read via
* UCHE in the earlier GRAS stage.
*/
cmd->state.cache.flush_bits |=
TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
TU_CMD_FLAG_WAIT_FOR_IDLE;
} else {
cmd->state.lrz.valid = false;
}
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
tu_lrz_begin_renderpass(cmd, pRenderPassBegin);
cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
@ -3755,253 +3730,6 @@ tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
}
/* update lrz state based on stencil-test func:
*
* Conceptually the order of the pipeline is:
*
*
* FS -> Alpha-Test -> Stencil-Test -> Depth-Test
* | |
* if wrmask != 0 if wrmask != 0
* | |
* v v
* Stencil-Write Depth-Write
*
* Because Stencil-Test can have side effects (Stencil-Write) prior
* to depth test, in this case we potentially need to disable early
* lrz-test. See:
*
* https://www.khronos.org/opengl/wiki/Per-Sample_Processing
*/
static void
tu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
VkCompareOp func,
bool stencil_write,
bool *invalidate_lrz)
{
switch (func) {
case VK_COMPARE_OP_ALWAYS:
/* nothing to do for LRZ, but for stencil test when stencil-
* write is enabled, we need to disable lrz-test, since
* conceptually stencil test and write happens before depth-test.
*/
if (stencil_write) {
gras_lrz_cntl->enable = false;
gras_lrz_cntl->z_test_enable = false;
*invalidate_lrz = true;
}
break;
case VK_COMPARE_OP_NEVER:
/* fragment never passes, disable lrz_write for this draw. */
gras_lrz_cntl->lrz_write = false;
break;
default:
/* whether the fragment passes or not depends on result
* of stencil test, which we cannot know when doing binning
* pass.
*/
gras_lrz_cntl->lrz_write = false;
/* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
* effects from stencil test we need to disable lrz-test.
*/
if (stencil_write) {
gras_lrz_cntl->enable = false;
gras_lrz_cntl->z_test_enable = false;
*invalidate_lrz = true;
}
break;
}
}
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
const uint32_t a)
{
struct tu_pipeline *pipeline = cmd->state.pipeline;
bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
/* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth
* or early fragment tests.
*/
if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) {
cmd->state.lrz.valid = false;
return gras_lrz_cntl;
}
/* If depth test is disabled we shouldn't touch LRZ.
* Same if there is no depth attachment.
*/
if (a == VK_ATTACHMENT_UNUSED || !z_test_enable ||
(cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ))
return gras_lrz_cntl;
if (!cmd->state.attachments) {
/* Secondary cmdbuf - there is nothing we could do. */
return gras_lrz_cntl;
}
gras_lrz_cntl.enable = z_test_enable;
gras_lrz_cntl.lrz_write =
z_write_enable &&
!(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE);
gras_lrz_cntl.z_test_enable = z_read_enable;
gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
/* See comment in tu_pipeline about disabling LRZ write for blending. */
if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)
gras_lrz_cntl.lrz_write = false;
if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) &&
cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts))
gras_lrz_cntl.lrz_write = false;
/* LRZ is disabled until it is cleared, which means that one "wrong"
* depth test or shader could disable LRZ until depth buffer is cleared.
*/
bool disable_lrz = false;
bool temporary_disable_lrz = false;
/* If Z is not written - it doesn't affect LRZ buffer state.
* Which means two things:
* - Don't lock direction until Z is written for the first time;
* - If Z isn't written and direction IS locked it's possible to just
* temporary disable LRZ instead of fully bailing out, when direction
* is changed.
*/
enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
switch (depth_compare_op) {
case VK_COMPARE_OP_ALWAYS:
case VK_COMPARE_OP_NOT_EQUAL:
/* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction,
* so if there is a depth write - LRZ must be disabled.
*/
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
disable_lrz = true;
} else {
perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
temporary_disable_lrz = true;
}
break;
case VK_COMPARE_OP_EQUAL:
case VK_COMPARE_OP_NEVER:
/* Blob disables LRZ for OP_EQUAL, and from our empirical
* evidence it is a right thing to do.
*
* Both OP_EQUAL and OP_NEVER don't change LRZ buffer so
* we could just temporary disable LRZ.
*/
temporary_disable_lrz = true;
break;
case VK_COMPARE_OP_GREATER:
case VK_COMPARE_OP_GREATER_OR_EQUAL:
lrz_direction = TU_LRZ_GREATER;
gras_lrz_cntl.greater = true;
break;
case VK_COMPARE_OP_LESS:
case VK_COMPARE_OP_LESS_OR_EQUAL:
lrz_direction = TU_LRZ_LESS;
gras_lrz_cntl.greater = false;
break;
default:
unreachable("bad VK_COMPARE_OP value or uninitialized");
break;
};
/* If depthfunc direction is changed, bail out on using LRZ. The
* LRZ buffer encodes a min/max depth value per block, but if
* we switch from GT/GE <-> LT/LE, those values cannot be
* interpreted properly.
*/
if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
lrz_direction != TU_LRZ_UNKNOWN &&
cmd->state.lrz.prev_direction != lrz_direction) {
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to direction change");
disable_lrz = true;
} else {
perf_debug(cmd->device, "Skipping LRZ due to direction change");
temporary_disable_lrz = true;
}
}
/* Consider the following sequence of depthfunc changes:
*
* - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
* LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
* during second VK_COMPARE_OP_GREATER.
*
* - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
* Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
* invalid during COMPARE_OP_LESS.
*
* This shows that we should keep last KNOWN direction.
*/
if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
cmd->state.lrz.prev_direction = lrz_direction;
/* Invalidate LRZ and disable write if stencil test is enabled */
bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
if (stencil_test_enable) {
bool stencil_front_writemask =
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
(cmd->state.dynamic_stencil_wrmask & 0xff) :
(pipeline->stencil_wrmask & 0xff);
bool stencil_back_writemask =
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
(pipeline->stencil_wrmask & 0xff00) >> 8;
VkCompareOp stencil_front_compare_op =
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;
VkCompareOp stencil_back_compare_op =
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;
tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op,
stencil_front_writemask, &disable_lrz);
tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op,
stencil_back_writemask, &disable_lrz);
}
if (disable_lrz)
cmd->state.lrz.valid = false;
if (temporary_disable_lrz)
gras_lrz_cntl.enable = false;
cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
if (!cmd->state.lrz.enabled)
memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
return gras_lrz_cntl;
}
static void
tu6_build_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(
.enable = gras_lrz_cntl.enable,
.greater = gras_lrz_cntl.greater,
.lrz_write = gras_lrz_cntl.lrz_write,
.z_test_enable = gras_lrz_cntl.z_test_enable,
.z_bounds_enable = gras_lrz_cntl.z_bounds_enable));
tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}
static bool
tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)
{
@ -4186,7 +3914,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
struct tu_cs cs;
cmd->state.lrz_and_depth_plane_state =
tu_cs_draw_state(&cmd->sub_cs, &cs, 8);
tu6_build_lrz(cmd, &cs);
tu6_emit_lrz(cmd, &cs);
tu6_build_depth_plane_z_mode(cmd, &cs);
}


@ -360,6 +360,7 @@ static const struct debug_control tu_debug_options[] = {
{ "noubwc", TU_DEBUG_NOUBWC },
{ "nomultipos", TU_DEBUG_NOMULTIPOS },
{ "nolrz", TU_DEBUG_NOLRZ },
{ "nolrzfc", TU_DEBUG_NOLRZFC },
{ "perf", TU_DEBUG_PERF },
{ "perfc", TU_DEBUG_PERFC },
{ "flushall", TU_DEBUG_FLUSHALL },


@ -568,6 +568,35 @@ tu_image_init(struct tu_device *device, struct tu_image *image,
image->lrz_offset = image->total_size;
unsigned lrz_size = lrz_pitch * lrz_height * 2;
image->total_size += lrz_size;
unsigned nblocksx = DIV_ROUND_UP(DIV_ROUND_UP(width, 8), 16);
unsigned nblocksy = DIV_ROUND_UP(DIV_ROUND_UP(height, 8), 4);
/* Fast-clear buffer is 1bit/block */
image->lrz_fc_size = DIV_ROUND_UP(nblocksx * nblocksy, 8);
/* Fast-clear buffer cannot be larger than 512 bytes (HW limitation) */
bool has_lrz_fc = image->lrz_fc_size <= 512 &&
device->physical_device->info->a6xx.enable_lrz_fast_clear &&
!unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_NOLRZFC);
if (has_lrz_fc || device->physical_device->info->a6xx.has_lrz_dir_tracking) {
image->lrz_fc_offset = image->total_size;
image->total_size += 512;
if (device->physical_device->info->a6xx.has_lrz_dir_tracking) {
/* Direction tracking uses 1 byte */
image->total_size += 1;
/* GRAS_LRZ_DEPTH_VIEW needs 5 bytes: 4 for view data and 1 for padding */
image->total_size += 5;
}
}
if (!has_lrz_fc) {
image->lrz_fc_size = 0;
}
} else {
image->lrz_height = 0;
}
return VK_SUCCESS;
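
To make the fast-clear buffer sizing above concrete, here is a worked
example with assumed attachment dimensions (the values are illustrative,
not from this patch):

  /* Assumed 1920x1080 depth attachment:
   *   nblocksx    = DIV_ROUND_UP(DIV_ROUND_UP(1920, 8), 16) = DIV_ROUND_UP(240, 16) = 15
   *   nblocksy    = DIV_ROUND_UP(DIV_ROUND_UP(1080, 8),  4) = DIV_ROUND_UP(135,  4) = 34
   *   lrz_fc_size = DIV_ROUND_UP(15 * 34, 8)                = DIV_ROUND_UP(510,  8) = 64 bytes
   *
   * 64 <= 512, so the image gets a fast-clear buffer. An assumed 8192x8192
   * attachment (nblocksx = 64, nblocksy = 256) would need
   * DIV_ROUND_UP(64 * 256, 8) = 2048 bytes, above the 512-byte HW limit,
   * so it would end up with has_lrz_fc = false while still reserving the
   * fast-clear/direction-tracking bytes on gen3+.
   */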


@ -0,0 +1,796 @@
/*
* Copyright © 2022 Igalia S.L.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "tu_private.h"
#include "tu_cs.h"
/* The low-resolution Z buffer is very similar to a depth prepass that helps
* the HW avoid executing the fragment shader on those fragments that will
* later be discarded by the depth test.
*
* The interesting part of this feature is that it allows applications
* to submit the vertices in any order.
*
* In the binning pass it is possible to store the depth value of each
* vertex into an internal low-resolution depth buffer and quickly test
* the primitives against it during the render pass.
*
* There are a number of limitations when LRZ cannot be used:
* - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
* - Writing to stencil buffer
* - Writing depth while:
* - Changing direction of depth test (e.g. from OP_GREATER to OP_LESS);
* - Using OP_ALWAYS or OP_NOT_EQUAL;
* - Clearing depth with vkCmdClearAttachments;
* - (pre-a650) Not clearing depth attachment with LOAD_OP_CLEAR;
* - (pre-a650) Using secondary command buffers;
* - Sysmem rendering (with a small caveat).
*
* Pre-a650 (before gen3)
* ======================
*
* The direction is fully tracked on the CPU. Within a renderpass LRZ starts
* with an unknown direction; the direction is set the first time a depth
* write occurs, and if it changes afterwards the direction becomes invalid
* and LRZ is disabled for the rest of the renderpass.
*
* Since the direction is not tracked by the GPU, it's impossible to know
* whether LRZ is enabled during construction of secondary command buffers.
*
* For the same reason it's impossible to reuse LRZ between renderpasses.
*
* A650+ (gen3+)
* =============
*
* Now the LRZ direction can be tracked on the GPU. There are two parts:
* - A direction byte which stores the current LRZ direction;
* - Parameters of the last used depth view.
*
* The idea is the same as when LRZ is tracked on the CPU: when GRAS_LRZ_CNTL
* is used, its direction is compared to the previously known direction,
* and the direction byte is set to disabled when the directions are
* incompatible.
*
* Additionally, to reuse LRZ between renderpasses, GRAS_LRZ_CNTL checks
* whether the current value of GRAS_LRZ_DEPTH_VIEW is equal to the value
* stored in the buffer; if not, LRZ is disabled. (This is necessary because
* the depth buffer may have several layers and mip levels, while the LRZ
* buffer represents only a single layer + mip level.)
*
* The LRZ direction is invalidated between renderpasses when the underlying
* depth buffer is changed; the following commands can change the depth image:
* - vkCmdBlitImage*
* - vkCmdCopyBufferToImage*
* - vkCmdCopyImage*
*
* LRZ Fast-Clear
* ==============
*
* The LRZ fast-clear buffer is initialized to zeroes and read/written
* when GRAS_LRZ_CNTL.FC_ENABLE (b3) is set. It appears to store 1b/block.
* '0' means block has original depth clear value, and '1' means that the
* corresponding block in LRZ has been modified.
*
* LRZ fast-clear conservatively clears the LRZ buffer: at the point where LRZ
* is written, the LRZ block which corresponds to a single fast-clear bit is cleared:
* - To 0.0 if depth comparison is GREATER;
* - To 1.0 if depth comparison is LESS;
*
* This way it's always valid to fast-clear. On the other hand, we disable
* fast-clear if the depth clear value is not 0.0 or 1.0 because it may be
* worse for perf if some primitives are expected to fail the depth test
* against the actual depth clear value.
*
* LRZ Precision
* =============
*
* LRZ always uses Z16_UNORM. The epsilon for it is 1.f / (1 << 16) which is
* not enough to represent all values of Z32_UNORM or Z32_FLOAT.
* This especially raises questions in the context of fast-clear: if fast-clear
* uses a value which cannot be precisely represented by LRZ, we wouldn't
* be able to round it in the correct direction since the direction is tracked
* on the GPU.
*
* However, it seems that depth comparisons with LRZ values have some "slack"
* and nothing special should be done for such depth clear values.
*
* How it was tested:
* - Clear Z32_FLOAT attachment to 1.f / (1 << 17)
* - LRZ buffer contains all zeroes
* - Do draws and check whether all samples are passing:
* - OP_GREATER with (1.f / (1 << 17) + float32_epsilon) - passing;
* - OP_GREATER with (1.f / (1 << 17) - float32_epsilon) - not passing;
* - OP_LESS with (1.f / (1 << 17) - float32_epsilon) - passing;
* - OP_LESS with (1.f / (1 << 17) + float32_epsilon) - not passing;
* - OP_LESS_OR_EQ with (1.f / (1 << 17) + float32_epsilon) - not passing;
* In all cases resulting LRZ buffer is all zeroes and LRZ direction is updated.
*
* LRZ Caches
* ==========
*
* ! The policy here is to flush the LRZ cache right after it is changed,
* so if LRZ data is needed afterwards there is no need to flush it
* before using LRZ.
*
* LRZ_FLUSH flushes and invalidates the LRZ caches; there are two of them:
* - A cache for the fast-clear buffer;
* - A cache for the direction byte + depth view params.
* They can be cleared by LRZ_CLEAR. For the clears to become visible in
* GPU memory, the caches should be flushed with LRZ_FLUSH afterwards.
*
* GRAS_LRZ_CNTL reads from these caches.
*/
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
if (!depth_image) {
tu_cs_emit_regs(cs,
A6XX_GRAS_LRZ_BUFFER_BASE(0),
A6XX_GRAS_LRZ_BUFFER_PITCH(0),
A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
return;
}
uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
if (!depth_image->lrz_fc_offset)
lrz_fc_iova = 0;
tu_cs_emit_regs(cs,
A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
}
static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_reg_value reg)
{
if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
tu_cs_emit(cs, reg.reg);
tu_cs_emit(cs, reg.value);
} else {
tu_cs_emit_pkt4(cs, reg.reg, 1);
tu_cs_emit(cs, reg.value);
}
}
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
/* Disable LRZ by writing an invalid depth view. */
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
.base_layer = 0b11111111111,
.layer_count = 0b11111111111,
.base_mip_level = 0b1111,
));
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.disable_on_wrong_dir = true,
));
tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
}
static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att,
const struct tu_image_view *view)
{
if (!view->image->lrz_height)
return;
bool clears_depth = att->clear_mask &
(VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
bool has_gpu_tracking =
cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;
if (!has_gpu_tracking && !clears_depth)
return;
if (!clears_depth && !att->load)
return;
cmd->state.lrz.image_view = view;
cmd->state.lrz.valid = true;
cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
/* Be optimistic and unconditionally enable fast-clear in
* secondary cmdbufs and when reusing previous LRZ state.
*/
cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0;
cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
cmd->state.lrz.reuse_previous_state = !clears_depth;
}
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkRenderPassBeginInfo *pRenderPassBegin)
{
const struct tu_render_pass *pass = cmd->state.pass;
int lrz_img_count = 0;
for (unsigned i = 0; i < pass->attachment_count; i++) {
if (cmd->state.attachments[i]->image->lrz_height)
lrz_img_count++;
}
if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
/* Theoretically we could switch between LRZ buffers during the binning
* and tiling passes, but it is untested and would add complexity for
* a presumably extremely rare case.
*/
perf_debug(cmd->device,
"Invalidating LRZ because there are several subpasses with "
"different depth attachments in a single renderpass");
for (unsigned i = 0; i < pass->attachment_count; i++) {
struct tu_image *image = cmd->state.attachments[i]->image;
tu_disable_lrz(cmd, &cmd->cs, image);
}
}
/* Track LRZ valid state */
cmd->state.lrz.valid = false;
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED) {
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
VkClearValue clear = pRenderPassBegin->pClearValues[a];
cmd->state.lrz.depth_clear_value = clear;
cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
(clear.depthStencil.depth == 0.f ||
clear.depthStencil.depth == 1.f);
}
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
if (!cmd->state.lrz.valid) {
memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
tu6_emit_lrz_buffer(&cmd->cs, NULL);
}
}
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd,
struct tu_framebuffer *fb)
{
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED &&
cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
struct tu_image_view *view = fb->attachments[a].attachment;
tu_lrz_init_state(cmd, att, view);
}
}
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
if (!cmd->state.lrz.image_view)
return;
struct tu_lrz_state *lrz = &cmd->state.lrz;
tu6_emit_lrz_buffer(cs, lrz->image_view->image);
if (lrz->reuse_previous_state) {
/* Reuse previous LRZ state, LRZ cache is assumed to be
* already invalidated by previous renderpass.
*/
assert(lrz->gpu_dir_tracking);
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
return;
}
bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
if (invalidate_lrz) {
/* Following the blob we elect to disable LRZ for the whole renderpass
* if it is known that LRZ is disabled somewhere in the renderpass.
*
* This is accomplished by making the later GRAS_LRZ_CNTL (in the binning
* pass) fail the comparison of depth views.
*/
tu6_disable_lrz_via_depth_view(cmd, cs);
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
} else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
if (lrz->gpu_dir_tracking) {
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
}
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = lrz->fast_clear,
.disable_on_wrong_dir = lrz->gpu_dir_tracking,
));
/* GRAS_LRZ_CNTL.fc_enable + LRZ_CLEAR - clears the fast-clear buffer;
* GRAS_LRZ_CNTL.disable_on_wrong_dir + LRZ_CLEAR - sets the direction to
* CUR_DIR_UNSET.
*/
tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
}
if (!lrz->fast_clear && !invalidate_lrz) {
tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
/* Even though we disable fast-clear we still have to dirty
* fast-clear buffer because both secondary cmdbufs and following
* renderpasses won't know that fast-clear is disabled.
*
* TODO: we could avoid this if we don't store depth and don't
* expect secondary cmdbufs.
*/
if (lrz->image_view->image->lrz_fc_size) {
tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image);
}
}
}
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image);
if (cmd->state.lrz.gpu_dir_tracking) {
tu6_write_lrz_reg(cmd, &cmd->cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
}
/* Enable flushing of LRZ fast-clear and of direction buffer */
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = cmd->state.lrz.fast_clear,
.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
));
} else {
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0));
}
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
/* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
* point additionally clears the direction buffer:
* GRAS_LRZ_DEPTH_VIEW(.dword = 0)
* GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
* A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
* LRZ_CLEAR
* LRZ_FLUSH
* Since this happens after all of the rendering is done, there is no known
* reason to do such a clear.
*/
}
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
if (!cmd->state.lrz.image_view)
return;
/* Actually, LRZ buffer could be filled in sysmem, in theory to
* be used in another renderpass, but the benefit is rather dubious.
*/
struct tu_lrz_state *lrz = &cmd->state.lrz;
if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
tu_disable_lrz(cmd, cs, lrz->image_view->image);
/* Make sure depth view comparison will fail. */
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
} else {
tu6_emit_lrz_buffer(cs, lrz->image_view->image);
/* Even though we disable LRZ writes in sysmem mode, there is still the
* LRZ test, so LRZ should be cleared.
*/
if (lrz->fast_clear) {
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = true,
));
tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
} else {
tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
}
}
}
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
}
/* Disable LRZ outside of renderpass. */
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_image *image)
{
if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
return;
if (!image->lrz_height)
return;
tu6_emit_lrz_buffer(cs, image);
tu6_disable_lrz_via_depth_view(cmd, cs);
}
/* Clear LRZ, used for out of renderpass depth clears. */
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
const VkClearDepthStencilValue *pDepthStencil,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges)
{
if (!rangeCount || !image->lrz_height ||
!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
return;
/* We cannot predict which depth subresource would be used later on,
* so we just pick the first one with depth cleared and clear the LRZ.
*/
const VkImageSubresourceRange *range = NULL;
for (unsigned i = 0; i < rangeCount; i++) {
if (pRanges[i].aspectMask &
(VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
range = &pRanges[i];
break;
}
}
if (!range)
return;
bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f ||
pDepthStencil->depth == 1.f);
tu6_emit_lrz_buffer(&cmd->cs, image);
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
.base_layer = range->baseArrayLayer,
.layer_count = tu_get_layerCount(image, range),
.base_mip_level = range->baseMipLevel,
));
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = fast_clear,
.disable_on_wrong_dir = true,
));
tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
if (!fast_clear) {
tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
}
}
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
assert(cmd->state.pass);
cmd->state.lrz.valid = false;
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
if (cmd->state.lrz.gpu_dir_tracking) {
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.dir = LRZ_DIR_INVALID,
.disable_on_wrong_dir = true,
));
}
}
/* update lrz state based on stencil-test func:
*
* Conceptually the order of the pipeline is:
*
*
* FS -> Alpha-Test -> Stencil-Test -> Depth-Test
*                           |               |
*                     if wrmask != 0   if wrmask != 0
*                           |               |
*                           v               v
*                     Stencil-Write     Depth-Write
*
* Because Stencil-Test can have side effects (Stencil-Write) prior
* to depth test, in this case we potentially need to disable early
* lrz-test. See:
*
* https://www.khronos.org/opengl/wiki/Per-Sample_Processing
*/
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
VkCompareOp func,
bool stencil_write)
{
switch (func) {
case VK_COMPARE_OP_ALWAYS:
/* nothing to do for LRZ, but for stencil test when stencil-
* write is enabled, we need to disable lrz-test, since
* conceptually stencil test and write happens before depth-test.
*/
if (stencil_write) {
return false;
}
break;
case VK_COMPARE_OP_NEVER:
/* fragment never passes, disable lrz_write for this draw. */
gras_lrz_cntl->lrz_write = false;
break;
default:
/* whether the fragment passes or not depends on result
* of stencil test, which we cannot know when doing binning
* pass.
*/
gras_lrz_cntl->lrz_write = false;
/* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
* effects from stencil test we need to disable lrz-test.
*/
if (stencil_write) {
return false;
}
break;
}
return true;
}
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
const uint32_t a)
{
struct tu_pipeline *pipeline = cmd->state.pipeline;
bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
if (!cmd->state.lrz.valid) {
return gras_lrz_cntl;
}
/* If depth test is disabled we shouldn't touch LRZ.
* Same if there is no depth attachment.
*/
if (a == VK_ATTACHMENT_UNUSED || !z_test_enable ||
(cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ))
return gras_lrz_cntl;
if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
/* Without on-gpu LRZ direction tracking - there is nothing we
* can do to enable LRZ in secondary command buffers.
*/
return gras_lrz_cntl;
}
gras_lrz_cntl.enable = true;
gras_lrz_cntl.lrz_write =
z_write_enable &&
!(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE);
gras_lrz_cntl.z_test_enable = z_read_enable && z_write_enable;
gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;
/* See comment in tu_pipeline about disabling LRZ write for blending. */
if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)
gras_lrz_cntl.lrz_write = false;
if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) &&
cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts))
gras_lrz_cntl.lrz_write = false;
/* LRZ is disabled until it is cleared, which means that one "wrong"
* depth test or shader could disable LRZ until depth buffer is cleared.
*/
bool disable_lrz = false;
bool temporary_disable_lrz = false;
/* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth
* or early fragment tests.
*/
if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) {
perf_debug(cmd->device, "Invalidating LRZ due to FS");
disable_lrz = true;
}
/* If Z is not written - it doesn't affect LRZ buffer state.
* Which means two things:
* - Don't lock direction until Z is written for the first time;
* - If Z isn't written and direction IS locked it's possible to just
* temporary disable LRZ instead of fully bailing out, when direction
* is changed.
*/
enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
switch (depth_compare_op) {
case VK_COMPARE_OP_ALWAYS:
case VK_COMPARE_OP_NOT_EQUAL:
/* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction,
* so if there is a depth write - LRZ must be disabled.
*/
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
disable_lrz = true;
gras_lrz_cntl.dir = LRZ_DIR_INVALID;
} else {
perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
temporary_disable_lrz = true;
}
break;
case VK_COMPARE_OP_EQUAL:
case VK_COMPARE_OP_NEVER:
/* Blob disables LRZ for OP_EQUAL, and from our empirical
* evidence it is the right thing to do.
*
* Both OP_EQUAL and OP_NEVER don't change LRZ buffer so
* we could just temporary disable LRZ.
*/
temporary_disable_lrz = true;
break;
case VK_COMPARE_OP_GREATER:
case VK_COMPARE_OP_GREATER_OR_EQUAL:
lrz_direction = TU_LRZ_GREATER;
gras_lrz_cntl.greater = true;
gras_lrz_cntl.dir = LRZ_DIR_GE;
break;
case VK_COMPARE_OP_LESS:
case VK_COMPARE_OP_LESS_OR_EQUAL:
lrz_direction = TU_LRZ_LESS;
gras_lrz_cntl.greater = false;
gras_lrz_cntl.dir = LRZ_DIR_LE;
break;
default:
unreachable("bad VK_COMPARE_OP value or uninitialized");
break;
};
/* If depthfunc direction is changed, bail out on using LRZ. The
* LRZ buffer encodes a min/max depth value per block, but if
* we switch from GT/GE <-> LT/LE, those values cannot be
* interpreted properly.
*/
if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
lrz_direction != TU_LRZ_UNKNOWN &&
cmd->state.lrz.prev_direction != lrz_direction) {
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to direction change");
disable_lrz = true;
} else {
perf_debug(cmd->device, "Skipping LRZ due to direction change");
temporary_disable_lrz = true;
}
}
/* Consider the following sequence of depthfunc changes:
*
* - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
* LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
* during second VK_COMPARE_OP_GREATER.
*
* - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
* Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
* invalid during COMPARE_OP_LESS.
*
* This shows that we should keep last KNOWN direction.
*/
if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
cmd->state.lrz.prev_direction = lrz_direction;
/* Invalidate LRZ and disable write if stencil test is enabled */
bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
if (!disable_lrz && stencil_test_enable) {
bool stencil_front_writemask =
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
(cmd->state.dynamic_stencil_wrmask & 0xff) :
(pipeline->stencil_wrmask & 0xff);
bool stencil_back_writemask =
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
(pipeline->stencil_wrmask & 0xff00) >> 8;
VkCompareOp stencil_front_compare_op =
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;
VkCompareOp stencil_back_compare_op =
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;
bool lrz_allowed = true;
lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
&gras_lrz_cntl, stencil_front_compare_op,
stencil_front_writemask);
lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
&gras_lrz_cntl, stencil_back_compare_op,
stencil_back_writemask);
/* Without a depth write we only need to make sure that the depth test
* is executed after the stencil test, so temporarily disabling LRZ is enough.
*/
if (!lrz_allowed) {
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to stencil write");
disable_lrz = true;
} else {
perf_debug(cmd->device, "Skipping LRZ due to stencil write");
temporary_disable_lrz = true;
}
}
}
if (disable_lrz)
cmd->state.lrz.valid = false;
if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
/* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
* for this it is not enough to emit an empty GRAS_LRZ_CNTL.
*/
gras_lrz_cntl.enable = true;
gras_lrz_cntl.dir = LRZ_DIR_INVALID;
return gras_lrz_cntl;
}
if (temporary_disable_lrz)
gras_lrz_cntl.enable = false;
cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
if (!cmd->state.lrz.enabled)
memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
return gras_lrz_cntl;
}
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl));
tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}


@ -265,6 +265,7 @@ enum tu_debug_flags
TU_DEBUG_LAYOUT = 1 << 16,
TU_DEBUG_LOG_SKIP_GMEM_OPS = 1 << 17,
TU_DEBUG_PERF = 1 << 18,
TU_DEBUG_NOLRZFC = 1 << 19,
};
struct tu_instance
@ -1126,11 +1127,16 @@ struct tu_lrz_pipeline
struct tu_lrz_state
{
/* Depth/Stencil image currently in use for LRZ */
struct tu_image *image;
const struct tu_image_view *image_view;
VkClearValue depth_clear_value;
/* If LRZ is in an invalid state we cannot use it until depth is cleared */
bool valid : 1;
/* Allows temporarily disabling LRZ */
bool enabled : 1;
bool fast_clear : 1;
bool gpu_dir_tracking : 1;
/* Continue using old LRZ state (LOAD_OP_LOAD of depth) */
bool reuse_previous_state : 1;
enum tu_lrz_direction prev_direction;
};
@ -1535,6 +1541,51 @@ struct tu_pipeline
struct util_dynarray executables;
};
struct tu_image;
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image);
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_image *image);
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
const VkClearDepthStencilValue *pDepthStencil,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges);
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkRenderPassBeginInfo *pRenderPassBegin);
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd,
struct tu_framebuffer *fb);
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd);
void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
bool z_negative_one_to_one);
@ -1542,9 +1593,6 @@ tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_vie
void
tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
void
tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
@ -1691,6 +1739,8 @@ struct tu_image
uint32_t lrz_height;
uint32_t lrz_pitch;
uint32_t lrz_offset;
uint32_t lrz_fc_offset;
uint32_t lrz_fc_size;
bool shareable;
};