mesa/src/freedreno/vulkan/tu_lrz.c

/*
* Copyright © 2022 Igalia S.L.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "tu_private.h"
#include "tu_cs.h"
/* The low-resolution Z buffer is very similar to a depth prepass: it helps
* the HW avoid executing the fragment shader for fragments that will
* subsequently be discarded by the depth test.
*
* The interesting part of this feature is that it allows applications
* to submit the vertices in any order.
*
* In the binning pass it is possible to store the depth value of each
* vertex into an internal low-resolution depth buffer and quickly test
* the primitives against it during the render pass.
*
* There are a number of cases in which LRZ cannot be used:
* - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
* - Writing to the stencil buffer;
* - Writing depth while:
*   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
*   - Using OP_ALWAYS or OP_NOT_EQUAL;
* - Clearing depth with vkCmdClearAttachments;
* - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
* - (pre-a650) Using secondary command buffers;
* - Sysmem rendering (with a small caveat).
*
* Pre-a650 (before gen3)
* ======================
*
* The direction is fully tracked on the CPU. Within a renderpass LRZ starts
* with an unknown direction; the direction is set the first time a depth
* write occurs, and if it changes afterwards the direction becomes invalid
* and LRZ is disabled for the rest of the renderpass.
*
* Since the direction is not tracked by the GPU, it's impossible to know
* whether LRZ is enabled while constructing secondary command buffers.
*
* For the same reason it's impossible to reuse LRZ between renderpasses.
*
* A650+ (gen3+)
* =============
*
* Now the LRZ direction can be tracked on the GPU. There are two parts:
* - Direction byte which stores current LRZ direction;
* - Parameters of the last used depth view.
*
* The idea is the same as when LRZ is tracked on the CPU: when GRAS_LRZ_CNTL
* is used, its direction is compared to the previously known direction,
* and the direction byte is set to disabled when the directions are incompatible.
*
* Additionally, to reuse LRZ between renderpasses, GRAS_LRZ_CNTL checks
* whether the current value of GRAS_LRZ_DEPTH_VIEW is equal to the value
* stored in the buffer; if not, LRZ is disabled. (This is necessary
* because a depth buffer may have several layers and mip levels, while the
* LRZ buffer represents only a single layer + mip level.)
*
* The LRZ direction is disabled between renderpasses when the underlying
* depth buffer is changed; the following commands could change the depth image:
* - vkCmdBlitImage*
* - vkCmdCopyBufferToImage*
* - vkCmdCopyImage*
*
* LRZ Fast-Clear
* ==============
*
* The LRZ fast-clear buffer is initialized to zeroes and read/written
* when GRAS_LRZ_CNTL.FC_ENABLE (b3) is set. It appears to store 1b/block.
* '0' means block has original depth clear value, and '1' means that the
* corresponding block in LRZ has been modified.
*
* LRZ fast-clear conservatively clears the LRZ buffer: at the point where
* LRZ is written, the LRZ block which corresponds to a single fast-clear bit
* is cleared:
* - To 0.0 if the depth comparison is GREATER;
* - To 1.0 if the depth comparison is LESS;
*
* This way it's always valid to fast-clear. On the other hand we disable
* fast-clear if the depth clear value is not 0.0 or 1.0, because it may hurt
* performance if some primitives are expected to fail the depth test
* against the actual depth clear value.
*
* LRZ Precision
* =============
*
* LRZ always uses Z16_UNORM. The epsilon for it is 1.f / (1 << 16) which is
* not enough to represent all values of Z32_UNORM or Z32_FLOAT.
* This especially raises questions in the context of fast-clear: if fast-clear
* uses a value which cannot be precisely represented by LRZ, we wouldn't
* be able to round it in the correct direction since the direction is tracked
* on the GPU.
*
* However, it seems that depth comparisons with LRZ values have some "slack"
* and nothing special should be done for such depth clear values.
*
* How it was tested:
* - Clear Z32_FLOAT attachment to 1.f / (1 << 17)
* - LRZ buffer contains all zeroes
* - Do draws and check whether all samples are passing:
* - OP_GREATER with (1.f / (1 << 17) + float32_epsilon) - passing;
* - OP_GREATER with (1.f / (1 << 17) - float32_epsilon) - not passing;
* - OP_LESS with (1.f / (1 << 17) - float32_epsilon) - samples are passing;
* - OP_LESS with (1.f / (1 << 17) + float32_epsilon) - not passing;
* - OP_LESS_OR_EQ with (1.f / (1 << 17) + float32_epsilon) - not passing;
* In all cases the resulting LRZ buffer is all zeroes and the LRZ direction is
* updated (see the quantization sketch right after this comment block).
*
* LRZ Caches
* ==========
*
* ! The policy here is to flush the LRZ cache right after it is changed,
* so if LRZ data is needed afterwards there is no need to flush it
* before using LRZ.
*
* LRZ_FLUSH flushes and invalidates the LRZ caches; there are two caches:
* - Cache for fast-clear buffer;
* - Cache for direction byte + depth view params.
* They could be cleared by LRZ_CLEAR. To become visible in GPU memory
* the caches should be flushed with LRZ_FLUSH afterwards.
*
* GRAS_LRZ_CNTL reads from these caches.
*/
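/* Illustration for the "LRZ Precision" section above. This is a minimal
* sketch, not driver or HW code: the helper name and the round-to-nearest
* conversion are assumptions, used only to show why the 1.f / (1 << 17)
* clear value from the experiment ends up as 0 in a Z16_UNORM LRZ buffer:
* it is just below half of a Z16_UNORM step (see the 1.f / (1 << 16)
* epsilon above), e.g.:
* illustrate_z16_unorm(1.f / (1 << 17)) == 0
* illustrate_z16_unorm(1.f / (1 << 16)) == 1
*/
static inline uint16_t
illustrate_z16_unorm(float depth)
{
   /* Round-to-nearest conversion of a [0, 1] depth value to Z16_UNORM. */
   return (uint16_t) (depth * 65535.0f + 0.5f);
}
/* Point the HW at the LRZ buffer (and the optional fast-clear buffer) of the
* given depth image, or write zero addresses to disable LRZ entirely.
*/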
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
if (!depth_image) {
tu_cs_emit_regs(cs,
A6XX_GRAS_LRZ_BUFFER_BASE(0),
A6XX_GRAS_LRZ_BUFFER_PITCH(0),
A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));
return;
}
uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
if (!depth_image->lrz_fc_offset)
lrz_fc_iova = 0;
tu_cs_emit_regs(cs,
A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));
}
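/* Write a single LRZ register. On GPUs with the lrz_track_quirk the write
* has to go through CP_REG_WRITE with the TRACK_LRZ tracker (presumably so
* the CP firmware can track LRZ state); otherwise a plain PKT4 write is used.
*/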
static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_reg_value reg)
{
if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
tu_cs_emit(cs, reg.reg);
tu_cs_emit(cs, reg.value);
} else {
tu_cs_emit_pkt4(cs, reg.reg, 1);
tu_cs_emit(cs, reg.value);
}
}
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
/* Disable direction by writing invalid depth view. */
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
.base_layer = 0b11111111111,
.layer_count = 0b11111111111,
.base_mip_level = 0b1111,
));
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.disable_on_wrong_dir = true,
));
tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
}
static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att,
const struct tu_image_view *view)
{
if (!view->image->lrz_height) {
assert((cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ) ||
!vk_format_has_depth(att->format));
return;
}
bool clears_depth = att->clear_mask &
(VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
bool has_gpu_tracking =
cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;
if (!has_gpu_tracking && !clears_depth)
return;
/* We need to always have an LRZ view just to disable it if there is a
* depth attachment, there are any secondaries, and GPU tracking is
* enabled, in order not to rely on loadOp state which doesn't exist with
* dynamic rendering in secondaries. Otherwise the secondary will have LRZ
* enabled and there will be a NULL/garbage LRZ buffer.
*/
cmd->state.lrz.image_view = view;
if (!clears_depth && !att->load)
return;
cmd->state.lrz.valid = true;
cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
/* Be optimistic and unconditionally enable fast-clear in
* secondary cmdbufs and when reusing previous LRZ state.
*/
cmd->state.lrz.fast_clear = view->image->lrz_fc_size > 0;
cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
cmd->state.lrz.reuse_previous_state = !clears_depth;
}
/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
* lrz.image_view, so that an LRZ buffer is present (even if LRZ is
* dynamically disabled).
*/
static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att)
{
bool has_gpu_tracking =
cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;
if (!has_gpu_tracking)
return;
if (cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ)
return;
if (!vk_format_has_depth(att->format))
return;
cmd->state.lrz.valid = true;
cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
/* We may not have the depth attachment when executing in a secondary
* inside a render pass. This means we have to be even more optimistic than
* the normal case and enable fast clear even if the depth image doesn't
* support it.
*/
cmd->state.lrz.fast_clear = true;
/* These are not used inside secondaries */
cmd->state.lrz.image_view = NULL;
cmd->state.lrz.reuse_previous_state = false;
}
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkRenderPassBeginInfo *pRenderPassBegin)
{
const struct tu_render_pass *pass = cmd->state.pass;
int lrz_img_count = 0;
for (unsigned i = 0; i < pass->attachment_count; i++) {
if (cmd->state.attachments[i]->image->lrz_height)
lrz_img_count++;
}
if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
/* Theoretically we could switch between LRZ buffers during the binning
* and tiling passes, but it is untested and would add complexity for a
* presumably extremely rare case.
*/
perf_debug(cmd->device,
"Invalidating LRZ because there are several subpasses with "
"different depth attachments in a single renderpass");
for (unsigned i = 0; i < pass->attachment_count; i++) {
struct tu_image *image = cmd->state.attachments[i]->image;
tu_disable_lrz(cmd, &cmd->cs, image);
}
}
/* Track LRZ valid state */
memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED) {
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
VkClearValue clear = pRenderPassBegin->pClearValues[a];
cmd->state.lrz.depth_clear_value = clear;
cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
(clear.depthStencil.depth == 0.f ||
clear.depthStencil.depth == 1.f);
}
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
if (!cmd->state.lrz.valid) {
tu6_emit_lrz_buffer(&cmd->cs, NULL);
}
}
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
if (a != VK_ATTACHMENT_UNUSED) {
const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
tu_lrz_init_secondary(cmd, att);
}
}
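/* Set up LRZ when starting GMEM (tiled) rendering: bind the LRZ buffer and,
* depending on the state gathered at renderpass begin, either reuse the
* previous LRZ state, invalidate LRZ via the depth-view trick, fast-clear
* it, or clear it with tu6_clear_lrz().
*/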
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
/* TODO: If lrz was never valid for the entire renderpass, we could exit
* early here. Sometimes we know this ahead of time and null out
* image_view, but with LOAD_OP_DONT_CARE this only happens if there were
* no secondaries.
*/
if (!cmd->state.lrz.image_view)
return;
struct tu_lrz_state *lrz = &cmd->state.lrz;
tu6_emit_lrz_buffer(cs, lrz->image_view->image);
if (lrz->reuse_previous_state) {
/* Reuse the previous LRZ state; the LRZ cache is assumed to have been
* invalidated by the previous renderpass already.
*/
assert(lrz->gpu_dir_tracking);
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
return;
}
bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
if (invalidate_lrz) {
/* Following the blob we elect to disable LRZ for the whole renderpass
* if it is known that LRZ is disabled somewhere in the renderpass.
*
* This is accomplished by making the later GRAS_LRZ_CNTL (in the binning
* pass) fail the comparison of depth views.
*/
tu6_disable_lrz_via_depth_view(cmd, cs);
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
} else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
if (lrz->gpu_dir_tracking) {
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
}
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = lrz->fast_clear,
.disable_on_wrong_dir = lrz->gpu_dir_tracking,
));
/* LRZ_CLEAR.fc_enable + LRZ_CLEAR - clears fast-clear buffer;
* LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
* CUR_DIR_UNSET.
*/
tu6_emit_event_write(cmd, cs, LRZ_CLEAR);
}
if (!lrz->fast_clear && !invalidate_lrz) {
tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
/* Even though we disable fast-clear we still have to dirty
* fast-clear buffer because both secondary cmdbufs and following
* renderpasses won't know that fast-clear is disabled.
*
* TODO: we could avoid this if we don't store depth and don't
* expect secondary cmdbufs.
*/
if (lrz->image_view->image->lrz_fc_size) {
tu6_dirty_lrz_fc(cmd, cs, lrz->image_view->image);
}
}
}
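/* Finish LRZ for GMEM rendering: emit the final GRAS_LRZ_CNTL and LRZ_FLUSH
* so the fast-clear buffer and direction state (when used) are flushed to
* memory and can be reused by a following renderpass.
*/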
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
tu6_emit_lrz_buffer(cs, cmd->state.lrz.image_view->image);
if (cmd->state.lrz.gpu_dir_tracking) {
tu6_write_lrz_reg(cmd, &cmd->cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
}
/* Enable flushing of LRZ fast-clear and of direction buffer */
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = cmd->state.lrz.fast_clear,
.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
));
} else {
tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(0));
}
tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
/* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
* point additionally clears the direction buffer:
* GRAS_LRZ_DEPTH_VIEW(.dword = 0)
* GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
* A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
* LRZ_CLEAR
* LRZ_FLUSH
* Since this happens after all of the rendering is done, there is no known
* reason to do such a clear.
*/
}
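/* Sysmem (direct rendering) counterpart of tu_lrz_tiling_begin. LRZ writes
* are disabled in sysmem mode but the LRZ test still runs, so LRZ is either
* fully disabled through GPU direction tracking or cleared.
*/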
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
if (!cmd->state.lrz.image_view)
return;
/* In theory the LRZ buffer could be filled in sysmem mode, to
* be reused in another renderpass, but the benefit is rather dubious.
*/
struct tu_lrz_state *lrz = &cmd->state.lrz;
if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
tu_disable_lrz(cmd, cs, lrz->image_view->image);
/* Make sure depth view comparison will fail. */
tu6_write_lrz_reg(cmd, cs,
A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
} else {
tu6_emit_lrz_buffer(cs, lrz->image_view->image);
/* Even though we disable LRZ writes in sysmem mode, the LRZ test still
* runs, so LRZ should be cleared.
*/
if (lrz->fast_clear) {
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = true,
));
tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
} else {
tu6_clear_lrz(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
}
}
}
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
}
/* Disable LRZ outside of renderpass. */
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_image *image)
{
if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
return;
if (!image->lrz_height)
return;
tu6_emit_lrz_buffer(cs, image);
tu6_disable_lrz_via_depth_view(cmd, cs);
}
/* Clear LRZ, used for out of renderpass depth clears. */
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
const VkClearDepthStencilValue *pDepthStencil,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges)
{
if (!rangeCount || !image->lrz_height ||
!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
return;
/* We cannot predict which depth subresource would be used later on,
* so we just pick the first one with depth cleared and clear the LRZ.
*/
const VkImageSubresourceRange *range = NULL;
for (unsigned i = 0; i < rangeCount; i++) {
if (pRanges[i].aspectMask &
(VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
range = &pRanges[i];
break;
}
}
if (!range)
return;
bool fast_clear = image->lrz_fc_size && (pDepthStencil->depth == 0.f ||
pDepthStencil->depth == 1.f);
tu6_emit_lrz_buffer(&cmd->cs, image);
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
.base_layer = range->baseArrayLayer,
.layer_count = tu_get_layerCount(image, range),
.base_mip_level = range->baseMipLevel,
));
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.fc_enable = fast_clear,
.disable_on_wrong_dir = true,
));
tu6_emit_event_write(cmd, &cmd->cs, LRZ_CLEAR);
tu6_emit_event_write(cmd, &cmd->cs, LRZ_FLUSH);
if (!fast_clear) {
tu6_clear_lrz(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
}
}
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
assert(cmd->state.pass);
cmd->state.lrz.valid = false;
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
if (cmd->state.lrz.gpu_dir_tracking) {
tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_CNTL(
.enable = true,
.dir = LRZ_DIR_INVALID,
.disable_on_wrong_dir = true,
));
}
}
/* update lrz state based on stencil-test func:
*
* Conceptually the order of the pipeline is:
*
*
*   FS -> Alpha-Test -> Stencil-Test -> Depth-Test
*                            |              |
*                    if wrmask != 0    if wrmask != 0
*                            |              |
*                            v              v
*                      Stencil-Write    Depth-Write
*
* Because Stencil-Test can have side effects (Stencil-Write) prior
* to depth test, in this case we potentially need to disable early
* lrz-test. See:
*
* https://www.khronos.org/opengl/wiki/Per-Sample_Processing
*/
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
VkCompareOp func,
bool stencil_write)
{
switch (func) {
case VK_COMPARE_OP_ALWAYS:
/* nothing to do for LRZ, but for stencil test when stencil-
* write is enabled, we need to disable lrz-test, since
* conceptually the stencil test and write happen before the depth test.
*/
if (stencil_write) {
return false;
}
break;
case VK_COMPARE_OP_NEVER:
/* fragment never passes, disable lrz_write for this draw. */
gras_lrz_cntl->lrz_write = false;
break;
default:
/* whether the fragment passes or not depends on result
* of stencil test, which we cannot know when doing binning
* pass.
*/
gras_lrz_cntl->lrz_write = false;
/* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
* effects from stencil test we need to disable lrz-test.
*/
if (stencil_write) {
return false;
}
break;
}
return true;
}
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
const uint32_t a)
{
struct tu_pipeline *pipeline = cmd->state.pipeline;
bool z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
bool z_write_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
bool z_read_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
bool z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
if (!cmd->state.lrz.valid) {
return gras_lrz_cntl;
}
/* If depth test is disabled we shouldn't touch LRZ.
* Same if there is no depth attachment.
*/
if (a == VK_ATTACHMENT_UNUSED || !z_test_enable ||
(cmd->device->instance->debug_flags & TU_DEBUG_NOLRZ))
return gras_lrz_cntl;
if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
/* Without on-gpu LRZ direction tracking - there is nothing we
* can do to enable LRZ in secondary command buffers.
*/
return gras_lrz_cntl;
}
gras_lrz_cntl.enable = true;
gras_lrz_cntl.lrz_write =
z_write_enable &&
!(pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE);
gras_lrz_cntl.z_test_enable = z_read_enable && z_write_enable;
gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;
/* See comment in tu_pipeline about disabling LRZ write for blending. */
if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)
gras_lrz_cntl.lrz_write = false;
if ((cmd->state.pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) &&
cmd->state.color_write_enable != MASK(cmd->state.pipeline->num_rts))
gras_lrz_cntl.lrz_write = false;
/* LRZ is disabled until it is cleared, which means that one "wrong"
* depth test or shader could disable LRZ until depth buffer is cleared.
*/
bool disable_lrz = false;
bool temporary_disable_lrz = false;
/* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth
* or early fragment tests.
*/
if (pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ) {
perf_debug(cmd->device, "Invalidating LRZ due to FS");
disable_lrz = true;
}
/* If Z is not written - it doesn't affect LRZ buffer state.
* Which means two things:
* - Don't lock direction until Z is written for the first time;
* - If Z isn't written and the direction IS locked, it's possible to just
* temporarily disable LRZ instead of fully bailing out when the direction
* is changed.
*/
enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
switch (depth_compare_op) {
case VK_COMPARE_OP_ALWAYS:
case VK_COMPARE_OP_NOT_EQUAL:
/* OP_ALWAYS and OP_NOT_EQUAL could have depth value of any direction,
* so if there is a depth write - LRZ must be disabled.
*/
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to ALWAYS/NOT_EQUAL");
disable_lrz = true;
gras_lrz_cntl.dir = LRZ_DIR_INVALID;
} else {
perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
temporary_disable_lrz = true;
}
break;
case VK_COMPARE_OP_EQUAL:
case VK_COMPARE_OP_NEVER:
/* The blob disables LRZ for OP_EQUAL, and from our empirical
* evidence it is the right thing to do.
*
* Neither OP_EQUAL nor OP_NEVER changes the LRZ buffer, so
* we can just temporarily disable LRZ.
*/
temporary_disable_lrz = true;
break;
case VK_COMPARE_OP_GREATER:
case VK_COMPARE_OP_GREATER_OR_EQUAL:
lrz_direction = TU_LRZ_GREATER;
gras_lrz_cntl.greater = true;
gras_lrz_cntl.dir = LRZ_DIR_GE;
break;
case VK_COMPARE_OP_LESS:
case VK_COMPARE_OP_LESS_OR_EQUAL:
lrz_direction = TU_LRZ_LESS;
gras_lrz_cntl.greater = false;
gras_lrz_cntl.dir = LRZ_DIR_LE;
break;
default:
unreachable("bad VK_COMPARE_OP value or uninitialized");
break;
};
/* If depthfunc direction is changed, bail out on using LRZ. The
* LRZ buffer encodes a min/max depth value per block, but if
* we switch from GT/GE <-> LT/LE, those values cannot be
* interpreted properly.
*/
if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
lrz_direction != TU_LRZ_UNKNOWN &&
cmd->state.lrz.prev_direction != lrz_direction) {
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to direction change");
disable_lrz = true;
} else {
perf_debug(cmd->device, "Skipping LRZ due to direction change");
temporary_disable_lrz = true;
}
}
/* Consider the following sequence of depthfunc changes:
*
* - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
* LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
* during second VK_COMPARE_OP_GREATER.
*
* - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
* Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
* invalid during COMPARE_OP_LESS.
*
* This shows that we should keep last KNOWN direction.
*/
if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
cmd->state.lrz.prev_direction = lrz_direction;
/* Invalidate LRZ and disable write if stencil test is enabled */
bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
if (!disable_lrz && stencil_test_enable) {
bool stencil_front_writemask =
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
(cmd->state.dynamic_stencil_wrmask & 0xff) :
(pipeline->stencil_wrmask & 0xff);
bool stencil_back_writemask =
(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
(pipeline->stencil_wrmask & 0xff00) >> 8;
VkCompareOp stencil_front_compare_op =
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;
VkCompareOp stencil_back_compare_op =
(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;
bool lrz_allowed = true;
lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
&gras_lrz_cntl, stencil_front_compare_op,
stencil_front_writemask);
lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
&gras_lrz_cntl, stencil_back_compare_op,
stencil_back_writemask);
/* Without a depth write it's enough to make sure that the depth test
* is executed after the stencil test, so temporarily disabling LRZ is enough.
*/
if (!lrz_allowed) {
if (z_write_enable) {
perf_debug(cmd->device, "Invalidating LRZ due to stencil write");
disable_lrz = true;
} else {
perf_debug(cmd->device, "Skipping LRZ due to stencil write");
temporary_disable_lrz = true;
}
}
}
if (disable_lrz)
cmd->state.lrz.valid = false;
if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
/* The direction byte on the GPU should be set to CUR_DIR_DISABLED;
* for this it's not enough to emit an empty GRAS_LRZ_CNTL.
*/
gras_lrz_cntl.enable = true;
gras_lrz_cntl.dir = LRZ_DIR_INVALID;
return gras_lrz_cntl;
}
if (temporary_disable_lrz)
gras_lrz_cntl.enable = false;
cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
if (!cmd->state.lrz.enabled)
memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
return gras_lrz_cntl;
}
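/* Emit the per-draw LRZ state: GRAS_LRZ_CNTL computed from the current
* pipeline and dynamic state, plus the matching RB_LRZ_CNTL enable.
*/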
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
tu6_write_lrz_reg(cmd, cs, pack_A6XX_GRAS_LRZ_CNTL(gras_lrz_cntl));
tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}