turnip: Split the tiling config into separate layouts based on CCU usage.

We now choose between two (identical as of this commit) layouts based on
whether the renderpass's stores will use the CCU space, and assert that we
always know the chosen layout whenever we use the gmem offsets.

This required making vkCmdClearAttachments in a secondary command buffer take
the 3D path instead of gmem blits, since secondaries only have to be
compatible with the primary's renderpass rather than equal to it.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16921>
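
As a rough illustration of the flow this sets up, here is a minimal sketch in
plain C, using simplified stand-ins for the helpers added in the diff below
(tu_choose_gmem_layout(), tu_attachment_gmem_offset()); the types and bodies
are reduced to the essentials and are not the driver's exact code:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

enum tu_gmem_layout {
   TU_GMEM_LAYOUT_FULL,      /* attachments may use all of GMEM */
   TU_GMEM_LAYOUT_AVOID_CCU, /* keep the CCU region of GMEM free */
   TU_GMEM_LAYOUT_COUNT,     /* "unknown", e.g. in a secondary cmdbuf */
};

struct attachment {
   bool store;
   bool store_needs_ccu;                      /* e.g. an unaligned store */
   int32_t gmem_offset[TU_GMEM_LAYOUT_COUNT]; /* one offset per layout */
};

struct cmd_state {
   enum tu_gmem_layout gmem_layout;           /* starts out at _COUNT */
   struct attachment *attachments;
   unsigned attachment_count;
};

/* Run at vkCmdBeginRenderPass() time, once the image views are known. */
void choose_gmem_layout(struct cmd_state *s)
{
   s->gmem_layout = TU_GMEM_LAYOUT_FULL;
   for (unsigned i = 0; i < s->attachment_count; i++) {
      if (s->attachments[i].store && s->attachments[i].store_needs_ccu)
         s->gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
   }
}

/* Every GMEM offset lookup asserts that a layout was actually chosen; a
 * secondary command buffer leaves gmem_layout at _COUNT, so any such lookup
 * there would trip the assert.
 */
int32_t attachment_gmem_offset(const struct cmd_state *s,
                               const struct attachment *att)
{
   assert(s->gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[s->gmem_layout];
}

Because a secondary never chooses a layout (gmem_layout stays at
TU_GMEM_LAYOUT_COUNT), its vkCmdClearAttachments is routed to the 3D/sysmem
clear path rather than going through the gmem offsets.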
Emma Anholt 2022-07-21 14:58:04 -07:00 committed by Marge Bot
parent a1db4fcab7
commit b8a334b547
6 changed files with 368 additions and 252 deletions


@ -1039,7 +1039,7 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
desc[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
desc[3] = 0;
desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
@ -2717,13 +2717,14 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
enum pipe_format format = tu_vk_format_to_pipe_format(att->format);
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, att->gmem_offset, value);
clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, tu_attachment_gmem_offset(cmd, att), value);
if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att), value);
return;
}
clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), att->gmem_offset, value);
clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask),
tu_attachment_gmem_offset(cmd, att), value);
trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
}
@ -2789,12 +2790,15 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
tu_lrz_disable_during_renderpass(cmd);
}
/* vkCmdClearAttachments is supposed to respect the predicate if active.
* The easiest way to do this is to always use the 3d path, which always
* works even with GMEM because it's just a simple draw using the existing
/* vkCmdClearAttachments is supposed to respect the predicate if active. The
* easiest way to do this is to always use the 3d path, which always works
* even with GMEM because it's just a simple draw using the existing
* attachment state.
*
* Similarly, we also use the 3D path when in a secondary command buffer that
* doesn't know the GMEM layout that will be chosen by the primary.
*/
if (cmd->state.predication_active) {
if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
return;
}
@ -2981,10 +2985,10 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment)));
} else {
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment)));
}
tu6_emit_event_write(cmd, cs, BLIT);
@ -3156,7 +3160,7 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
/* note: src size does not matter when not scaling */
A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp));
/* sync GMEM writes with CACHE. */
tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
@ -3243,6 +3247,61 @@ store_3d_blit(struct tu_cmd_buffer *cmd,
CP_SCRATCH_TO_REG_0_CNT(1 - 1));
}
static bool
tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_image_view *iview = cmd->state.attachments[a];
const VkRect2D *render_area = &cmd->state.render_area;
/* Unaligned store is incredibly rare in CTS, so we have to force it in order to test it. */
if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE))
return true;
uint32_t x1 = render_area->offset.x;
uint32_t y1 = render_area->offset.y;
uint32_t x2 = x1 + render_area->extent.width;
uint32_t y2 = y1 + render_area->extent.height;
/* x2/y2 can be unaligned if equal to the size of the image, since it will
* write into padding space. The one exception is linear levels which don't
* have the required y padding in the layout (except for the last level)
*/
bool need_y2_align =
y2 != iview->view.height || iview->view.need_y2_align;
return (x1 % phys_dev->info->gmem_align_w ||
(x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
y1 % phys_dev->info->gmem_align_h ||
(y2 % phys_dev->info->gmem_align_h && need_y2_align));
}
/* Choose the GMEM layout (use the CCU space or not) based on whether the
* current attachments will need it. This has to happen at vkBeginRenderPass()
* time because tu_attachment_store_unaligned() looks at the image views, which
* are only available at that point. This should match the logic for the
* !unaligned case in tu_store_gmem_attachment().
*/
void
tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
{
cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
if (!cmd->state.attachments[i])
continue;
struct tu_render_pass_attachment *att =
&cmd->state.pass->attachments[i];
if ((att->store || att->store_stencil) &&
tu_attachment_store_unaligned(cmd, i))
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
if (att->will_be_resolved && !blit_can_resolve(att->format))
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
}
cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
}
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
@ -3250,7 +3309,6 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
uint32_t gmem_a,
bool cond_exec_allowed)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const VkRect2D *render_area = &cmd->state.render_area;
struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
const struct tu_image_view *iview = cmd->state.attachments[a];
@ -3267,26 +3325,7 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
tu_begin_load_store_cond_exec(cmd, cs, false);
}
uint32_t x1 = render_area->offset.x;
uint32_t y1 = render_area->offset.y;
uint32_t x2 = x1 + render_area->extent.width;
uint32_t y2 = y1 + render_area->extent.height;
/* x2/y2 can be unaligned if equal to the size of the image,
* since it will write into padding space
* the one exception is linear levels which don't have the
* required y padding in the layout (except for the last level)
*/
bool need_y2_align =
y2 != iview->view.height || iview->view.need_y2_align;
bool unaligned =
x1 % phys_dev->info->gmem_align_w ||
(x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align);
/* Unaligned store is incredibly rare in CTS, so we have to force it in order to test it. */
if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE))
unaligned = true;
bool unaligned = tu_attachment_store_unaligned(cmd, a);
/* D32_SFLOAT_S8_UINT is quite special format: it has two planes,
* one for depth and other for stencil. When resolving a MSAA
@ -3324,6 +3363,8 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
return;
}
assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);
enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format);
if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
src_format = PIPE_FORMAT_Z32_FLOAT;
@ -3345,23 +3386,23 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
if (store_common) {
store_3d_blit(cmd, cs, iview, dst->samples, false, src_format,
dst_format, render_area, src->gmem_offset, src->cpp);
dst_format, render_area, tu_attachment_gmem_offset(cmd, src), src->cpp);
}
if (store_separate_stencil) {
store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
PIPE_FORMAT_S8_UINT, render_area,
src->gmem_offset_stencil, src->samples);
tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
}
} else {
r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
if (store_common) {
store_cp_blit(cmd, cs, iview, src->samples, false, src_format,
dst_format, src->gmem_offset, src->cpp);
dst_format, tu_attachment_gmem_offset(cmd, src), src->cpp);
}
if (store_separate_stencil) {
store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
PIPE_FORMAT_S8_UINT, src->gmem_offset_stencil, src->samples);
PIPE_FORMAT_S8_UINT, tu_attachment_gmem_offset_stencil(cmd, src), src->samples);
}
}


@ -235,7 +235,7 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd,
tu_cs_image_depth_ref(cs, iview, 0);
else
tu_cs_image_ref(cs, &iview->view, 0);
tu_cs_emit(cs, attachment->gmem_offset);
tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment));
tu_cs_emit_regs(cs,
A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
@ -250,10 +250,10 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd,
tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
tu_cs_image_stencil_ref(cs, iview, 0);
tu_cs_emit(cs, attachment->gmem_offset_stencil);
tu_cs_emit(cs, tu_attachment_gmem_offset_stencil(cmd, attachment));
} else {
tu_cs_image_ref(cs, &iview->view, 0);
tu_cs_emit(cs, attachment->gmem_offset);
tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment));
}
} else {
tu_cs_emit_regs(cs,
@ -294,7 +294,7 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
tu_cs_emit(cs, iview->view.RB_MRT_BUF_INFO);
tu_cs_image_ref(cs, &iview->view, 0);
tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);
tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, &cmd->state.pass->attachments[a]));
tu_cs_emit_regs(cs,
A6XX_SP_FS_MRT_REG(i, .dword = iview->view.SP_FS_MRT_REG));
@ -565,6 +565,7 @@ static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
/* XFB commands are emitted for BINNING || SYSMEM, which makes it
* incompatible with non-hw binning GMEM rendering. this is required because
@ -573,7 +574,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
* XFB was used.
*/
if (cmd->state.rp.xfb_used) {
assert(fb->binning_possible);
assert(tiling->binning_possible);
return true;
}
@ -584,11 +585,11 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
*/
if (cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) {
assert(fb->binning_possible);
assert(tiling->binning_possible);
return true;
}
return fb->binning;
return tiling->binning;
}
static bool
@ -599,7 +600,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
return true;
/* can't fit attachments into gmem */
if (!cmd->state.pass->gmem_pixels)
if (!cmd->state.pass->gmem_pixels[cmd->state.gmem_layout])
return true;
if (cmd->state.framebuffer->layers > 1)
@ -617,7 +618,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
return true;
/* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible)
if (cmd->state.rp.xfb_used && !cmd->state.tiling->binning_possible)
return true;
/* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
@ -625,7 +626,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
*/
if ((cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) &&
!cmd->state.framebuffer->binning_possible)
!cmd->state.tiling->binning_possible)
return true;
if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_GMEM))
@ -649,7 +650,7 @@ static void
tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
uint32_t pipe, uint32_t slot, bool wfm)
{
if (cmd->state.framebuffer->binning_possible) {
if (cmd->state.tiling->binning_possible) {
tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) |
A6XX_CP_REG_TEST_0_BIT(slot) |
@ -664,15 +665,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
const uint32_t x1 = fb->tile0.width * tx;
const uint32_t y1 = fb->tile0.height * ty;
const uint32_t x2 = MIN2(x1 + fb->tile0.width - 1, MAX_VIEWPORT_SIZE - 1);
const uint32_t y2 = MIN2(y1 + fb->tile0.height - 1, MAX_VIEWPORT_SIZE - 1);
const uint32_t x1 = tiling->tile0.width * tx;
const uint32_t y1 = tiling->tile0.height * ty;
const uint32_t x2 = MIN2(x1 + tiling->tile0.width - 1, MAX_VIEWPORT_SIZE - 1);
const uint32_t y2 = MIN2(y1 + tiling->tile0.height - 1, MAX_VIEWPORT_SIZE - 1);
tu6_emit_window_scissor(cs, x1, y1, x2, y2);
tu6_emit_window_offset(cs, x1, y1);
@ -685,7 +686,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu_cs_emit(cs, 0x0);
tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
tu_cs_emit(cs, fb->pipe_sizes[pipe] |
tu_cs_emit(cs, tiling->pipe_sizes[pipe] |
CP_SET_BIN_DATA5_0_VSC_N(slot));
tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
tu_cs_emit(cs, pipe * 4);
@ -769,7 +770,7 @@ tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_emit_blit_scissor(cmd, cs, true);
for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
tu_load_gmem_attachment(cmd, cs, i, cmd->state.framebuffer->binning, false);
tu_load_gmem_attachment(cmd, cs, i, cmd->state.tiling->binning, false);
}
static void
@ -787,8 +788,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_emit_blit_scissor(cmd, cs, true);
for (uint32_t a = 0; a < pass->attachment_count; ++a) {
if (pass->attachments[a].gmem_offset >= 0)
tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.framebuffer->binning_possible);
if (pass->attachments[a].gmem)
tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.tiling->binning_possible);
}
if (subpass->resolve_attachments) {
@ -965,18 +966,18 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
static void
update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
tu_cs_emit_regs(cs,
A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
.height = fb->tile0.height));
A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width,
.height = tiling->tile0.height));
tu_cs_emit_regs(cs,
A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
.ny = fb->tile_count.height));
A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
.ny = tiling->tile_count.height));
tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
tu_cs_emit_array(cs, fb->pipe_config, 32);
tu_cs_emit_array(cs, tiling->pipe_config, 32);
tu_cs_emit_regs(cs,
A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
@ -990,9 +991,9 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
static void
emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
const uint32_t used_pipe_count =
fb->pipe_count.width * fb->pipe_count.height;
tiling->pipe_count.width * tiling->pipe_count.height;
for (int i = 0; i < used_pipe_count; i++) {
tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
@ -1110,6 +1111,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
const struct tu_subpass *subpass,
bool gmem)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
/* note: we can probably emit input attachments just once for the whole
* renderpass, this would avoid emitting both sysmem/gmem versions
*
@ -1140,7 +1143,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att =
&cmd->state.pass->attachments[a];
uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
uint32_t gmem_offset = att->gmem_offset;
uint32_t gmem_offset = tu_attachment_gmem_offset(cmd, att);
uint32_t cpp = att->cpp;
memcpy(dst, iview->view.descriptor, A6XX_TEX_CONST_DWORDS * 4);
@ -1198,7 +1201,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;
cpp = att->samples;
gmem_offset = att->gmem_offset_stencil;
gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout];
}
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
@ -1209,7 +1212,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
dst[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
dst[3] = 0;
dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
@ -1336,7 +1339,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_renderpass_result *autotune_result)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
tu_lrz_tiling_begin(cmd, cs);
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
@ -1344,9 +1347,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
const struct tu_framebuffer *fb = cmd->state.framebuffer;
if (use_hw_binning(cmd)) {
tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height,
A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) |
A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
@ -1354,7 +1356,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu6_emit_binning_pass(cmd, cs);
tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height,
A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS |
A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
@ -1370,14 +1372,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
tu_cs_emit(cs, 0x1);
} else {
tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height,
A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
if (fb->binning_possible) {
if (tiling->binning_possible) {
/* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since
* the actual binner didn't run.
*/
int pipe_count = fb->pipe_count.width * fb->pipe_count.height;
int pipe_count = tiling->pipe_count.width * tiling->pipe_count.height;
tu_cs_emit_pkt4(cs, REG_A6XX_VSC_STATE_REG(0), pipe_count);
for (int i = 0; i < pipe_count; i++)
tu_cs_emit(cs, ~0);
@ -1453,6 +1455,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
struct tu_renderpass_result *autotune_result)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
/* Create gmem stores now (at EndRenderPass time) because they needed to
* know whether to allow their conditional execution, which was tied to a
@ -1468,19 +1471,19 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
/* Note: we reverse the order of walking the pipes and tiles on every
* other row, to improve texture cache locality compared to raster order.
*/
for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
uint32_t pipe_row = py * fb->pipe_count.width;
for (uint32_t pipe_row_i = 0; pipe_row_i < fb->pipe_count.width; pipe_row_i++) {
for (uint32_t py = 0; py < tiling->pipe_count.height; py++) {
uint32_t pipe_row = py * tiling->pipe_count.width;
for (uint32_t pipe_row_i = 0; pipe_row_i < tiling->pipe_count.width; pipe_row_i++) {
uint32_t px;
if (py & 1)
px = fb->pipe_count.width - 1 - pipe_row_i;
px = tiling->pipe_count.width - 1 - pipe_row_i;
else
px = pipe_row_i;
uint32_t pipe = pipe_row + px;
uint32_t tx1 = px * fb->pipe0.width;
uint32_t ty1 = py * fb->pipe0.height;
uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width);
uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height);
uint32_t tx1 = px * tiling->pipe0.width;
uint32_t ty1 = py * tiling->pipe0.height;
uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width);
uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height);
uint32_t tile_row_stride = tx2 - tx1;
uint32_t slot_row = 0;
for (uint32_t ty = ty1; ty < ty2; ty++) {
@ -1500,7 +1503,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
tu6_tile_render_end(cmd, &cmd->cs, autotune_result);
trace_end_render_pass(&cmd->trace, &cmd->cs, fb);
trace_end_render_pass(&cmd->trace, &cmd->cs, fb, tiling);
if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
u_trace_disable_event_range(cmd->trace_renderpass_start,
@ -1526,7 +1529,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result);
trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer);
trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer, cmd->state.tiling);
}
void
@ -1562,6 +1565,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->state.subpass = NULL;
cmd_buffer->state.framebuffer = NULL;
cmd_buffer->state.attachments = NULL;
cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
/* LRZ is not valid next time we use it */
@ -1798,6 +1802,7 @@ tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
cmd_buffer->state.index_size = 0xff; /* dirty restart index */
cmd_buffer->state.line_mode = RECTANGULAR;
cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* dirty value */
tu_cache_init(&cmd_buffer->state.cache);
tu_cache_init(&cmd_buffer->state.renderpass_cache);
@ -1867,6 +1872,12 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
cmd_buffer->state.subpass =
&cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
}
/* We can't set the gmem layout here, because the state.pass only has
* to be compatible (same formats/sample counts) with the primary's
* renderpass, rather than exactly equal.
*/
tu_lrz_begin_secondary_cmdbuf(cmd_buffer);
} else {
/* When executing in the middle of another command buffer, the CCU
@ -3424,6 +3435,8 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer;
cmd->state.attachments = suspended->state.suspended_pass.attachments;
cmd->state.render_area = suspended->state.suspended_pass.render_area;
cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
cmd->state.lrz = suspended->state.suspended_pass.lrz;
}
@ -3866,6 +3879,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
cmd->state.framebuffer->attachments[i].attachment;
}
tu_choose_gmem_layout(cmd);
trace_start_render_pass(&cmd->trace, &cmd->cs);
@ -3970,6 +3984,8 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
}
}
tu_choose_gmem_layout(cmd);
cmd->state.renderpass_cache.pending_flush_bits =
cmd->state.cache.pending_flush_bits;
cmd->state.renderpass_cache.flush_bits = 0;
@ -3999,6 +4015,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer;
cmd->state.suspended_pass.render_area = cmd->state.render_area;
cmd->state.suspended_pass.attachments = cmd->state.attachments;
cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
}
if (!resuming) {
@ -4078,7 +4095,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);
if (pass->attachments[a].gmem_offset < 0)
if (!pass->attachments[a].gmem)
continue;
/* check if the resolved attachment is needed by later subpasses,


@ -113,7 +113,8 @@ tu_render_pass_add_subpass_dep(struct tu_render_pass *pass,
if (dep_invalid_for_gmem(dep, src_stage_mask, dst_stage_mask)) {
perf_debug((struct tu_device *)pass->base.device, "Disabling gmem rendering due to invalid subpass dependency");
pass->gmem_pixels = 0;
for (int i = 0; i < ARRAY_SIZE(pass->gmem_pixels); i++)
pass->gmem_pixels[i] = 0;
}
struct tu_subpass_barrier *dst_barrier;
@ -540,103 +541,112 @@ static void
tu_render_pass_gmem_config(struct tu_render_pass *pass,
const struct tu_physical_device *phys_dev)
{
/* From the VK_KHR_multiview spec:
*
* Multiview is all-or-nothing for a render pass - that is, either all
* subpasses must have a non-zero view mask (though some subpasses may
* have only one view) or all must be zero.
*
* This means we only have to check one of the view masks.
*/
if (pass->subpasses[0].multiview_mask) {
/* It seems multiview must use sysmem rendering. */
pass->gmem_pixels = 0;
return;
}
for (enum tu_gmem_layout layout = 0; layout < TU_GMEM_LAYOUT_COUNT;
layout++) {
/* From the VK_KHR_multiview spec:
*
* Multiview is all-or-nothing for a render pass - that is, either all
* subpasses must have a non-zero view mask (though some subpasses may
* have only one view) or all must be zero.
*
* This means we only have to check one of the view masks.
*/
if (pass->subpasses[0].multiview_mask) {
/* It seems multiview must use sysmem rendering. */
pass->gmem_pixels[layout] = 0;
continue;
}
uint32_t block_align_shift = 3; /* log2(gmem_align/(tile_align_w*tile_align_h)) */
uint32_t tile_align_w = phys_dev->info->tile_align_w;
uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * phys_dev->info->tile_align_h;
/* log2(gmem_align/(tile_align_w*tile_align_h)) */
uint32_t block_align_shift = 3;
uint32_t tile_align_w = phys_dev->info->tile_align_w;
uint32_t gmem_align = (1 << block_align_shift) * tile_align_w *
phys_dev->info->tile_align_h;
/* calculate total bytes per pixel */
uint32_t cpp_total = 0;
for (uint32_t i = 0; i < pass->attachment_count; i++) {
struct tu_render_pass_attachment *att = &pass->attachments[i];
bool cpp1 = (att->cpp == 1);
if (att->gmem_offset >= 0) {
cpp_total += att->cpp;
/* calculate total bytes per pixel */
uint32_t cpp_total = 0;
for (uint32_t i = 0; i < pass->attachment_count; i++) {
struct tu_render_pass_attachment *att = &pass->attachments[i];
bool cpp1 = (att->cpp == 1);
if (att->gmem) {
cpp_total += att->cpp;
/* take into account the separate stencil: */
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
cpp1 = (att->samples == 1);
cpp_total += att->samples;
}
/* take into account the separate stencil: */
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
cpp1 = (att->samples == 1);
cpp_total += att->samples;
}
/* texture pitch must be aligned to 64, use a tile_align_w that is
* a multiple of 64 for cpp==1 attachment to work as input attachment
*/
if (cpp1 && tile_align_w % 64 != 0) {
tile_align_w *= 2;
block_align_shift -= 1;
/* texture pitch must be aligned to 64, use a tile_align_w that is
* a multiple of 64 for cpp==1 attachment to work as input
* attachment
*/
if (cpp1 && tile_align_w % 64 != 0) {
tile_align_w *= 2;
block_align_shift -= 1;
}
}
}
}
pass->tile_align_w = tile_align_w;
pass->tile_align_w = tile_align_w;
/* no gmem attachments */
if (cpp_total == 0) {
/* any non-zero value so tiling config works with no attachments */
pass->gmem_pixels = 1024*1024;
return;
}
/* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path
* doesn't break things. maybe there is a better solution?
* TODO: this algorithm isn't optimal
* for example, two attachments with cpp = {1, 4}
* result: nblocks = {12, 52}, pixels = 196608
* optimal: nblocks = {13, 51}, pixels = 208896
*/
uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align;
uint32_t offset = 0, pixels = ~0u, i;
for (i = 0; i < pass->attachment_count; i++) {
struct tu_render_pass_attachment *att = &pass->attachments[i];
if (att->gmem_offset < 0)
/* no gmem attachments */
if (cpp_total == 0) {
/* any non-zero value so tiling config works with no
* attachments
*/
pass->gmem_pixels[layout] = 1024 * 1024;
continue;
}
att->gmem_offset = offset;
/* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path
* doesn't break things. maybe there is a better solution?
* TODO: this algorithm isn't optimal
* for example, two attachments with cpp = {1, 4}
* result: nblocks = {12, 52}, pixels = 196608
* optimal: nblocks = {13, 51}, pixels = 208896
*/
uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align;
uint32_t offset = 0, pixels = ~0u, i;
for (i = 0; i < pass->attachment_count; i++) {
struct tu_render_pass_attachment *att = &pass->attachments[i];
if (!att->gmem)
continue;
uint32_t align = MAX2(1, att->cpp >> block_align_shift);
uint32_t nblocks = MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align);
att->gmem_offset[layout] = offset;
if (nblocks > gmem_blocks)
break;
uint32_t align = MAX2(1, att->cpp >> block_align_shift);
uint32_t nblocks =
MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align);
gmem_blocks -= nblocks;
cpp_total -= att->cpp;
offset += nblocks * gmem_align;
pixels = MIN2(pixels, nblocks * gmem_align / att->cpp);
/* repeat the same for separate stencil */
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
att->gmem_offset_stencil = offset;
/* note: for s8_uint, block align is always 1 */
uint32_t nblocks = gmem_blocks * att->samples / cpp_total;
if (nblocks > gmem_blocks)
break;
gmem_blocks -= nblocks;
cpp_total -= att->samples;
cpp_total -= att->cpp;
offset += nblocks * gmem_align;
pixels = MIN2(pixels, nblocks * gmem_align / att->samples);
}
}
pixels = MIN2(pixels, nblocks * gmem_align / att->cpp);
/* if the loop didn't complete then the gmem config is impossible */
if (i == pass->attachment_count)
pass->gmem_pixels = pixels;
/* repeat the same for separate stencil */
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
att->gmem_offset_stencil[layout] = offset;
/* note: for s8_uint, block align is always 1 */
uint32_t nblocks = gmem_blocks * att->samples / cpp_total;
if (nblocks > gmem_blocks)
break;
gmem_blocks -= nblocks;
cpp_total -= att->samples;
offset += nblocks * gmem_align;
pixels = MIN2(pixels, nblocks * gmem_align / att->samples);
}
}
/* if the loop didn't complete then the gmem config is impossible */
if (i == pass->attachment_count)
pass->gmem_pixels[layout] = pixels;
}
}
static void
@ -737,7 +747,7 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const
{
struct tu_subpass *subpass = &pass->subpasses[i];
pass->attachments[a].gmem_offset = 0;
pass->attachments[a].gmem = true;
update_samples(subpass, pCreateInfo->pAttachments[a].samples);
pass->attachments[a].clear_views |= subpass->multiview_mask;
}
@ -786,7 +796,8 @@ tu_CreateRenderPass2(VkDevice _device,
att->cpp = 4 * att->samples;
else
att->cpp = vk_format_get_blocksize(att->format) * att->samples;
att->gmem_offset = -1;
/* Initially not allocated into gmem, tu_subpass_use_attachment() will move it there. */
att->gmem = false;
VkAttachmentLoadOp loadOp = pCreateInfo->pAttachments[i].loadOp;
VkAttachmentLoadOp stencilLoadOp = pCreateInfo->pAttachments[i].stencilLoadOp;
@ -916,7 +927,7 @@ tu_CreateRenderPass2(VkDevice _device,
/* disable unused attachments */
for (uint32_t i = 0; i < pass->attachment_count; i++) {
struct tu_render_pass_attachment *att = &pass->attachments[i];
if (att->gmem_offset < 0) {
if (!att->gmem) {
att->clear_mask = 0;
att->load = false;
}
@ -1009,7 +1020,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
TU_FROM_HANDLE(tu_image_view, view, att_info->imageView);
tu_setup_dynamic_attachment(att, view);
att->gmem_offset = 0;
att->gmem = true;
att->clear_views = info->viewMask;
attachment_set_ops(device, att, att_info->loadOp, 0,
att_info->storeOp, 0);
@ -1024,7 +1035,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
struct tu_render_pass_attachment *resolve_att = &pass->attachments[a];
TU_FROM_HANDLE(tu_image_view, resolve_view, att_info->resolveImageView);
tu_setup_dynamic_attachment(resolve_att, resolve_view);
resolve_att->gmem_offset = -1;
resolve_att->gmem = false;
attachment_set_ops(device, resolve_att,
VK_ATTACHMENT_LOAD_OP_DONT_CARE, 0,
VK_ATTACHMENT_STORE_OP_STORE, 0);
@ -1048,7 +1059,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
struct tu_render_pass_attachment *att = &pass->attachments[a];
tu_setup_dynamic_attachment(att, view);
att->gmem_offset = 0;
att->gmem = true;
att->clear_views = info->viewMask;
subpass->depth_stencil_attachment.attachment = a++;
@ -1066,7 +1077,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
TU_FROM_HANDLE(tu_image_view, resolve_view,
common_info->resolveImageView);
tu_setup_dynamic_attachment(resolve_att, resolve_view);
resolve_att->gmem_offset = -1;
resolve_att->gmem = false;
attachment_set_ops(device, resolve_att,
VK_ATTACHMENT_LOAD_OP_DONT_CARE,
VK_ATTACHMENT_LOAD_OP_DONT_CARE,


@ -942,14 +942,7 @@ struct tu_attachment_info
struct tu_image_view *attachment;
};
struct tu_framebuffer
{
struct vk_object_base base;
uint32_t width;
uint32_t height;
uint32_t layers;
struct tu_tiling_config {
/* size of the first tile */
VkExtent2D tile0;
/* number of tiles */
@ -969,6 +962,27 @@ struct tu_framebuffer
/* pipe register values */
uint32_t pipe_config[MAX_VSC_PIPES];
uint32_t pipe_sizes[MAX_VSC_PIPES];
};
enum tu_gmem_layout
{
/* Use all of GMEM for attachments */
TU_GMEM_LAYOUT_FULL,
/* Avoid using the region of GMEM that the CCU needs */
TU_GMEM_LAYOUT_AVOID_CCU,
/* Number of layouts we have, also the value set when we don't know the layout in a secondary. */
TU_GMEM_LAYOUT_COUNT,
};
struct tu_framebuffer
{
struct vk_object_base base;
uint32_t width;
uint32_t height;
uint32_t layers;
struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];
uint32_t attachment_count;
struct tu_attachment_info attachments[0];
@ -1031,7 +1045,8 @@ struct tu_render_pass_attachment
uint32_t clear_views;
bool load;
bool store;
int32_t gmem_offset;
bool gmem;
int32_t gmem_offset[TU_GMEM_LAYOUT_COUNT];
bool will_be_resolved;
/* for D32S8 separate stencil: */
bool load_stencil;
@ -1040,7 +1055,7 @@ struct tu_render_pass_attachment
bool cond_load_allowed;
bool cond_store_allowed;
int32_t gmem_offset_stencil;
int32_t gmem_offset_stencil[TU_GMEM_LAYOUT_COUNT];
};
struct tu_render_pass
@ -1049,7 +1064,7 @@ struct tu_render_pass
uint32_t attachment_count;
uint32_t subpass_count;
uint32_t gmem_pixels;
uint32_t gmem_pixels[TU_GMEM_LAYOUT_COUNT];
uint32_t tile_align_w;
/* memory bandwidth costs (in bytes) for gmem / sysmem rendering */
@ -1425,9 +1440,15 @@ struct tu_cmd_state
enum tu_cmd_ccu_state ccu_state;
/* Decides which GMEM layout to use from the tu_pass, based on whether the CCU
* might get used by tu_store_gmem_attachment().
*/
enum tu_gmem_layout gmem_layout;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
const struct tu_tiling_config *tiling;
VkRect2D render_area;
const struct tu_image_view **attachments;
@ -1442,6 +1463,7 @@ struct tu_cmd_state
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
enum tu_gmem_layout gmem_layout;
const struct tu_image_view **attachments;
@ -1645,6 +1667,22 @@ struct tu_cmd_buffer
uint32_t vsc_prim_strm_pitch;
};
static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset[cmd->state.gmem_layout];
}
static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
const struct tu_render_pass_attachment *att)
{
assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
return att->gmem_offset_stencil[cmd->state.gmem_layout];
}
/* Temporary struct for tracking a register state to be written, used by
* a6xx-pack.h and tu_cs_emit_regs()
*/
@ -2054,6 +2092,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
uint32_t gmem_a,
bool cond_exec_allowed);
void
tu_choose_gmem_layout(struct tu_cmd_buffer *cmd);
enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
struct tu_native_format


@ -76,14 +76,15 @@ def begin_end_tp(name, args=[], tp_struct=None, tp_print=None,
begin_end_tp('render_pass',
args=[ArgStruct(type='const struct tu_framebuffer *', var='fb')],
args=[ArgStruct(type='const struct tu_framebuffer *', var='fb'),
ArgStruct(type='const struct tu_tiling_config *', var='tiling')],
tp_struct=[Arg(type='uint16_t', name='width', var='fb->width', c_format='%u'),
Arg(type='uint16_t', name='height', var='fb->height', c_format='%u'),
Arg(type='uint8_t', name='MRTs', var='fb->attachment_count', c_format='%u'),
# Arg(type='uint8_t', name='samples', var='fb->samples', c_format='%u'),
Arg(type='uint16_t', name='numberOfBins', var='fb->tile_count.width * fb->tile_count.height', c_format='%u'),
Arg(type='uint16_t', name='binWidth', var='fb->tile0.width', c_format='%u'),
Arg(type='uint16_t', name='binHeight', var='fb->tile0.height', c_format='%u')])
Arg(type='uint16_t', name='numberOfBins', var='tiling->tile_count.width * tiling->tile_count.height', c_format='%u'),
Arg(type='uint16_t', name='binWidth', var='tiling->tile0.width', c_format='%u'),
Arg(type='uint16_t', name='binHeight', var='tiling->tile0.height', c_format='%u')])
begin_end_tp('binning_ib')
begin_end_tp('draw_ib_sysmem')


@ -82,19 +82,21 @@ __vk_startup_errorf(struct tu_instance *instance,
static void
tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
const struct tu_device *dev,
const struct tu_render_pass *pass)
const struct tu_render_pass *pass,
enum tu_gmem_layout gmem_layout)
{
const uint32_t tile_align_w = pass->tile_align_w;
const uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
const uint32_t max_tile_width = dev->physical_device->info->tile_max_w;
const uint32_t max_tile_height = dev->physical_device->info->tile_max_h;
struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
/* start from 1 tile */
fb->tile_count = (VkExtent2D) {
tiling->tile_count = (VkExtent2D) {
.width = 1,
.height = 1,
};
fb->tile0 = (VkExtent2D) {
tiling->tile0 = (VkExtent2D) {
.width = util_align_npot(fb->width, tile_align_w),
.height = align(fb->height, tile_align_h),
};
@ -102,138 +104,138 @@ tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
/* will force to sysmem, don't bother trying to have a valid tile config
* TODO: just skip all GMEM stuff when sysmem is forced?
*/
if (!pass->gmem_pixels)
if (!pass->gmem_pixels[gmem_layout])
return;
if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
/* start with 2x2 tiles */
fb->tile_count.width = 2;
fb->tile_count.height = 2;
fb->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w);
fb->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h);
tiling->tile_count.width = 2;
tiling->tile_count.height = 2;
tiling->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w);
tiling->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h);
}
/* do not exceed max tile width */
while (fb->tile0.width > max_tile_width) {
fb->tile_count.width++;
fb->tile0.width =
util_align_npot(DIV_ROUND_UP(fb->width, fb->tile_count.width), tile_align_w);
while (tiling->tile0.width > max_tile_width) {
tiling->tile_count.width++;
tiling->tile0.width =
util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w);
}
/* do not exceed max tile height */
while (fb->tile0.height > max_tile_height) {
fb->tile_count.height++;
fb->tile0.height =
util_align_npot(DIV_ROUND_UP(fb->height, fb->tile_count.height), tile_align_h);
while (tiling->tile0.height > max_tile_height) {
tiling->tile_count.height++;
tiling->tile0.height =
util_align_npot(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h);
}
/* do not exceed gmem size */
while (fb->tile0.width * fb->tile0.height > pass->gmem_pixels) {
if (fb->tile0.width > MAX2(tile_align_w, fb->tile0.height)) {
fb->tile_count.width++;
fb->tile0.width =
util_align_npot(DIV_ROUND_UP(fb->width, fb->tile_count.width), tile_align_w);
while (tiling->tile0.width * tiling->tile0.height > pass->gmem_pixels[gmem_layout]) {
if (tiling->tile0.width > MAX2(tile_align_w, tiling->tile0.height)) {
tiling->tile_count.width++;
tiling->tile0.width =
util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w);
} else {
/* if this assert fails then layout is impossible. */
assert(fb->tile0.height > tile_align_h);
fb->tile_count.height++;
fb->tile0.height =
align(DIV_ROUND_UP(fb->height, fb->tile_count.height), tile_align_h);
assert(tiling->tile0.height > tile_align_h);
tiling->tile_count.height++;
tiling->tile0.height =
align(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h);
}
}
}
static void
tu_tiling_config_update_pipe_layout(struct tu_framebuffer *fb,
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
const struct tu_device *dev)
{
const uint32_t max_pipe_count = 32; /* A6xx */
/* start from 1 tile per pipe */
fb->pipe0 = (VkExtent2D) {
tiling->pipe0 = (VkExtent2D) {
.width = 1,
.height = 1,
};
fb->pipe_count = fb->tile_count;
tiling->pipe_count = tiling->tile_count;
while (fb->pipe_count.width * fb->pipe_count.height > max_pipe_count) {
if (fb->pipe0.width < fb->pipe0.height) {
fb->pipe0.width += 1;
fb->pipe_count.width =
DIV_ROUND_UP(fb->tile_count.width, fb->pipe0.width);
while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
if (tiling->pipe0.width < tiling->pipe0.height) {
tiling->pipe0.width += 1;
tiling->pipe_count.width =
DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
} else {
fb->pipe0.height += 1;
fb->pipe_count.height =
DIV_ROUND_UP(fb->tile_count.height, fb->pipe0.height);
tiling->pipe0.height += 1;
tiling->pipe_count.height =
DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
}
}
}
static void
tu_tiling_config_update_pipes(struct tu_framebuffer *fb,
tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
const struct tu_device *dev)
{
const uint32_t max_pipe_count = 32; /* A6xx */
const uint32_t used_pipe_count =
fb->pipe_count.width * fb->pipe_count.height;
tiling->pipe_count.width * tiling->pipe_count.height;
const VkExtent2D last_pipe = {
.width = (fb->tile_count.width - 1) % fb->pipe0.width + 1,
.height = (fb->tile_count.height - 1) % fb->pipe0.height + 1,
.width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
.height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
};
assert(used_pipe_count <= max_pipe_count);
assert(max_pipe_count <= ARRAY_SIZE(fb->pipe_config));
assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
for (uint32_t y = 0; y < fb->pipe_count.height; y++) {
for (uint32_t x = 0; x < fb->pipe_count.width; x++) {
const uint32_t pipe_x = fb->pipe0.width * x;
const uint32_t pipe_y = fb->pipe0.height * y;
const uint32_t pipe_w = (x == fb->pipe_count.width - 1)
for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
const uint32_t pipe_x = tiling->pipe0.width * x;
const uint32_t pipe_y = tiling->pipe0.height * y;
const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
? last_pipe.width
: fb->pipe0.width;
const uint32_t pipe_h = (y == fb->pipe_count.height - 1)
: tiling->pipe0.width;
const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
? last_pipe.height
: fb->pipe0.height;
const uint32_t n = fb->pipe_count.width * y + x;
: tiling->pipe0.height;
const uint32_t n = tiling->pipe_count.width * y + x;
fb->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
fb->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
}
}
memset(fb->pipe_config + used_pipe_count, 0,
memset(tiling->pipe_config + used_pipe_count, 0,
sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
}
static bool
is_hw_binning_possible(const struct tu_framebuffer *fb)
is_hw_binning_possible(const struct tu_tiling_config *tiling)
{
/* Similar to older gens, # of tiles per pipe cannot be more than 32.
* But there are no hangs with 16 or more tiles per pipe in either
* X or Y direction, so that limit does not seem to apply.
*/
uint32_t tiles_per_pipe = fb->pipe0.width * fb->pipe0.height;
uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
return tiles_per_pipe <= 32;
}
static void
tu_tiling_config_update_binning(struct tu_framebuffer *fb, const struct tu_device *device)
tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device)
{
fb->binning_possible = is_hw_binning_possible(fb);
tiling->binning_possible = is_hw_binning_possible(tiling);
if (fb->binning_possible) {
fb->binning = (fb->tile_count.width * fb->tile_count.height) > 2;
if (tiling->binning_possible) {
tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2;
if (unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
fb->binning = true;
tiling->binning = true;
if (unlikely(device->physical_device->instance->debug_flags &
TU_DEBUG_NOBIN))
fb->binning = false;
tiling->binning = false;
} else {
fb->binning = false;
tiling->binning = false;
}
}
@ -242,10 +244,13 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass)
{
tu_tiling_config_update_tile_layout(fb, device, pass);
tu_tiling_config_update_pipe_layout(fb, device);
tu_tiling_config_update_pipes(fb, device);
tu_tiling_config_update_binning(fb, device);
for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
tu_tiling_config_update_tile_layout(fb, device, pass, gmem_layout);
tu_tiling_config_update_pipe_layout(tiling, device);
tu_tiling_config_update_pipes(tiling, device);
tu_tiling_config_update_binning(tiling, device);
}
}
void