diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index 57288379cd8e7..3272f9c5bcebb 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -1039,7 +1039,7 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd, desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); desc[2] = A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | - A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp); + A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp); desc[3] = 0; desc[4] = cmd->device->physical_device->gmem_base + gmem_offset; desc[5] = A6XX_TEX_CONST_5_DEPTH(1); @@ -2717,13 +2717,14 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, enum pipe_format format = tu_vk_format_to_pipe_format(att->format); if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) - clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, att->gmem_offset, value); + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, tu_attachment_gmem_offset(cmd, att), value); if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) - clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value); + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att), value); return; } - clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), att->gmem_offset, value); + clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), + tu_attachment_gmem_offset(cmd, att), value); trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples); } @@ -2789,12 +2790,15 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, tu_lrz_disable_during_renderpass(cmd); } - /* vkCmdClearAttachments is supposed to respect the predicate if active. - * The easiest way to do this is to always use the 3d path, which always - * works even with GMEM because it's just a simple draw using the existing + /* vkCmdClearAttachments is supposed to respect the predicate if active. The + * easiest way to do this is to always use the 3d path, which always works + * even with GMEM because it's just a simple draw using the existing * attachment state. + * + * Similarly, we also use the 3D path when in a secondary command buffer that + * doesn't know the GMEM layout that will be chosen by the primary. */ - if (cmd->state.predication_active) { + if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) { tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); return; } @@ -2981,10 +2985,10 @@ tu_emit_blit(struct tu_cmd_buffer *cmd, if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) { tu_cs_emit_regs(cs, - A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil)); + A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment))); } else { tu_cs_emit_regs(cs, - A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset)); + A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment))); } tu6_emit_event_write(cmd, cs, BLIT); @@ -3156,7 +3160,7 @@ store_cp_blit(struct tu_cmd_buffer *cmd, /* note: src size does not matter when not scaling */ A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff), A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset), - A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp)); + A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp)); /* sync GMEM writes with CACHE. 
*/ tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); @@ -3243,6 +3247,61 @@ store_3d_blit(struct tu_cmd_buffer *cmd, CP_SCRATCH_TO_REG_0_CNT(1 - 1)); } +static bool +tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a) +{ + struct tu_physical_device *phys_dev = cmd->device->physical_device; + const struct tu_image_view *iview = cmd->state.attachments[a]; + const VkRect2D *render_area = &cmd->state.render_area; + + /* Unaligned stores are incredibly rare in CTS, so we have to force them in order to test this path. */ + if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE)) + return true; + + uint32_t x1 = render_area->offset.x; + uint32_t y1 = render_area->offset.y; + uint32_t x2 = x1 + render_area->extent.width; + uint32_t y2 = y1 + render_area->extent.height; + /* x2/y2 can be unaligned if equal to the size of the image, since it will + * write into padding space. The one exception is linear levels, which don't + * have the required y padding in the layout (except for the last level). + */ + bool need_y2_align = + y2 != iview->view.height || iview->view.need_y2_align; + + return (x1 % phys_dev->info->gmem_align_w || + (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) || + y1 % phys_dev->info->gmem_align_h || + (y2 % phys_dev->info->gmem_align_h && need_y2_align)); +} + +/* Choose the GMEM layout (use the CCU space or not) based on whether the + * current attachments will need to use the CCU. This has to happen at vkBeginRenderPass() + * time because tu_attachment_store_unaligned() looks at the image views, which + * are only available at that point. This should match the logic for the + * !unaligned case in tu_store_gmem_attachment(). + */ +void +tu_choose_gmem_layout(struct tu_cmd_buffer *cmd) +{ + cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL; + + for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) { + if (!cmd->state.attachments[i]) + continue; + + struct tu_render_pass_attachment *att = + &cmd->state.pass->attachments[i]; + if ((att->store || att->store_stencil) && + tu_attachment_store_unaligned(cmd, i)) + cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; + if (att->will_be_resolved && !blit_can_resolve(att->format)) + cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; + } + + cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; +} + void tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3250,7 +3309,6 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t gmem_a, bool cond_exec_allowed) { - struct tu_physical_device *phys_dev = cmd->device->physical_device; const VkRect2D *render_area = &cmd->state.render_area; struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a]; const struct tu_image_view *iview = cmd->state.attachments[a]; @@ -3267,26 +3325,7 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, tu_begin_load_store_cond_exec(cmd, cs, false); } - uint32_t x1 = render_area->offset.x; - uint32_t y1 = render_area->offset.y; - uint32_t x2 = x1 + render_area->extent.width; - uint32_t y2 = y1 + render_area->extent.height; - /* x2/y2 can be unaligned if equal to the size of the image, - * since it will write into padding space - * the one exception is linear levels which don't have the - * required y padding in the layout (except for the last level) - */ - bool need_y2_align = - y2 != iview->view.height || iview->view.need_y2_align; - - bool unaligned = - x1 % phys_dev->info->gmem_align_w || - (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) || - y1 % 
phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align); - - /* Unaligned store is incredibly rare in CTS, we have to force it to test. */ - if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE)) - unaligned = true; + bool unaligned = tu_attachment_store_unaligned(cmd, a); /* D32_SFLOAT_S8_UINT is quite special format: it has two planes, * one for depth and other for stencil. When resolving a MSAA @@ -3324,6 +3363,8 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, return; } + assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU); + enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format); if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) src_format = PIPE_FORMAT_Z32_FLOAT; @@ -3345,23 +3386,23 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, if (store_common) { store_3d_blit(cmd, cs, iview, dst->samples, false, src_format, - dst_format, render_area, src->gmem_offset, src->cpp); + dst_format, render_area, tu_attachment_gmem_offset(cmd, src), src->cpp); } if (store_separate_stencil) { store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT, PIPE_FORMAT_S8_UINT, render_area, - src->gmem_offset_stencil, src->samples); + tu_attachment_gmem_offset_stencil(cmd, src), src->samples); } } else { r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); if (store_common) { store_cp_blit(cmd, cs, iview, src->samples, false, src_format, - dst_format, src->gmem_offset, src->cpp); + dst_format, tu_attachment_gmem_offset(cmd, src), src->cpp); } if (store_separate_stencil) { store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT, - PIPE_FORMAT_S8_UINT, src->gmem_offset_stencil, src->samples); + PIPE_FORMAT_S8_UINT, tu_attachment_gmem_offset_stencil(cmd, src), src->samples); } } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index d6e2608ce1b84..b89e03abd1ab8 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -235,7 +235,7 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, tu_cs_image_depth_ref(cs, iview, 0); else tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, attachment->gmem_offset); + tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment)); tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); @@ -250,10 +250,10 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value); if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { tu_cs_image_stencil_ref(cs, iview, 0); - tu_cs_emit(cs, attachment->gmem_offset_stencil); + tu_cs_emit(cs, tu_attachment_gmem_offset_stencil(cmd, attachment)); } else { tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, attachment->gmem_offset); + tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment)); } } else { tu_cs_emit_regs(cs, @@ -294,7 +294,7 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6); tu_cs_emit(cs, iview->view.RB_MRT_BUF_INFO); tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset); + tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, &cmd->state.pass->attachments[a])); tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(i, .dword = iview->view.SP_FS_MRT_REG)); @@ -565,6 +565,7 @@ static bool use_hw_binning(struct tu_cmd_buffer *cmd) { const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout]; /* XFB 
commands are emitted for BINNING || SYSMEM, which makes it * incompatible with non-hw binning GMEM rendering. this is required because @@ -573,7 +574,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd) * XFB was used. */ if (cmd->state.rp.xfb_used) { - assert(fb->binning_possible); + assert(tiling->binning_possible); return true; } @@ -584,11 +585,11 @@ use_hw_binning(struct tu_cmd_buffer *cmd) */ if (cmd->state.rp.has_prim_generated_query_in_rp || cmd->state.prim_generated_query_running_before_rp) { - assert(fb->binning_possible); + assert(tiling->binning_possible); return true; } - return fb->binning; + return tiling->binning; } static bool @@ -599,7 +600,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return true; /* can't fit attachments into gmem */ - if (!cmd->state.pass->gmem_pixels) + if (!cmd->state.pass->gmem_pixels[cmd->state.gmem_layout]) return true; if (cmd->state.framebuffer->layers > 1) @@ -617,7 +618,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return true; /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */ - if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible) + if (cmd->state.rp.xfb_used && !cmd->state.tiling->binning_possible) return true; /* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning @@ -625,7 +626,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, */ if ((cmd->state.rp.has_prim_generated_query_in_rp || cmd->state.prim_generated_query_running_before_rp) && - !cmd->state.framebuffer->binning_possible) + !cmd->state.tiling->binning_possible) return true; if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_GMEM)) @@ -649,7 +650,7 @@ static void tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t pipe, uint32_t slot, bool wfm) { - if (cmd->state.framebuffer->binning_possible) { + if (cmd->state.tiling->binning_possible) { tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) | A6XX_CP_REG_TEST_0_BIT(slot) | @@ -664,15 +665,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM)); - const uint32_t x1 = fb->tile0.width * tx; - const uint32_t y1 = fb->tile0.height * ty; - const uint32_t x2 = MIN2(x1 + fb->tile0.width - 1, MAX_VIEWPORT_SIZE - 1); - const uint32_t y2 = MIN2(y1 + fb->tile0.height - 1, MAX_VIEWPORT_SIZE - 1); + const uint32_t x1 = tiling->tile0.width * tx; + const uint32_t y1 = tiling->tile0.height * ty; + const uint32_t x2 = MIN2(x1 + tiling->tile0.width - 1, MAX_VIEWPORT_SIZE - 1); + const uint32_t y2 = MIN2(y1 + tiling->tile0.height - 1, MAX_VIEWPORT_SIZE - 1); tu6_emit_window_scissor(cs, x1, y1, x2, y2); tu6_emit_window_offset(cs, x1, y1); @@ -685,7 +686,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, 0x0); tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4); - tu_cs_emit(cs, fb->pipe_sizes[pipe] | + tu_cs_emit(cs, tiling->pipe_sizes[pipe] | CP_SET_BIN_DATA5_0_VSC_N(slot)); tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch); tu_cs_emit(cs, pipe * 4); @@ -769,7 +770,7 @@ tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu6_emit_blit_scissor(cmd, cs, true); for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu_load_gmem_attachment(cmd, cs, i, 
cmd->state.framebuffer->binning, false); + tu_load_gmem_attachment(cmd, cs, i, cmd->state.tiling->binning, false); } static void @@ -787,8 +788,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu6_emit_blit_scissor(cmd, cs, true); for (uint32_t a = 0; a < pass->attachment_count; ++a) { - if (pass->attachments[a].gmem_offset >= 0) - tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.framebuffer->binning_possible); + if (pass->attachments[a].gmem) + tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.tiling->binning_possible); } if (subpass->resolve_attachments) { @@ -965,18 +966,18 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) static void update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; tu_cs_emit_regs(cs, - A6XX_VSC_BIN_SIZE(.width = fb->tile0.width, - .height = fb->tile0.height)); + A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width, + .height = tiling->tile0.height)); tu_cs_emit_regs(cs, - A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width, - .ny = fb->tile_count.height)); + A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width, + .ny = tiling->tile_count.height)); tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); - tu_cs_emit_array(cs, fb->pipe_config, 32); + tu_cs_emit_array(cs, tiling->pipe_config, 32); tu_cs_emit_regs(cs, A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch), @@ -990,9 +991,9 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) static void emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; const uint32_t used_pipe_count = - fb->pipe_count.width * fb->pipe_count.height; + tiling->pipe_count.width * tiling->pipe_count.height; for (int i = 0; i < used_pipe_count; i++) { tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); @@ -1110,6 +1111,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass, bool gmem) { + const struct tu_tiling_config *tiling = cmd->state.tiling; + /* note: we can probably emit input attachments just once for the whole * renderpass, this would avoid emitting both sysmem/gmem versions * @@ -1140,7 +1143,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i]; - uint32_t gmem_offset = att->gmem_offset; + uint32_t gmem_offset = tu_attachment_gmem_offset(cmd, att); uint32_t cpp = att->cpp; memcpy(dst, iview->view.descriptor, A6XX_TEX_CONST_DWORDS * 4); @@ -1198,7 +1201,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32; cpp = att->samples; - gmem_offset = att->gmem_offset_stencil; + gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout]; } if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem) @@ -1209,7 +1212,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); dst[2] = A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | - A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp); + A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp); dst[3] = 0; dst[4] = cmd->device->physical_device->gmem_base + gmem_offset; dst[5] = A6XX_TEX_CONST_5_DEPTH(1); @@ -1336,7 +1339,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_renderpass_result 
*autotune_result) { struct tu_physical_device *phys_dev = cmd->device->physical_device; - + const struct tu_tiling_config *tiling = cmd->state.tiling; tu_lrz_tiling_begin(cmd, cs); tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); @@ -1344,9 +1347,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); - const struct tu_framebuffer *fb = cmd->state.framebuffer; if (use_hw_binning(cmd)) { - tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) | A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); @@ -1354,7 +1356,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu6_emit_binning_pass(cmd, cs); - tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS | A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); @@ -1370,14 +1372,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x1); } else { - tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); - if (fb->binning_possible) { + if (tiling->binning_possible) { /* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since * the actual binner didn't run. */ - int pipe_count = fb->pipe_count.width * fb->pipe_count.height; + int pipe_count = tiling->pipe_count.width * tiling->pipe_count.height; tu_cs_emit_pkt4(cs, REG_A6XX_VSC_STATE_REG(0), pipe_count); for (int i = 0; i < pipe_count; i++) tu_cs_emit(cs, ~0); @@ -1453,6 +1455,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, struct tu_renderpass_result *autotune_result) { const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; /* Create gmem stores now (at EndRenderPass time)) because they needed to * know whether to allow their conditional execution, which was tied to a @@ -1468,19 +1471,19 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, /* Note: we reverse the order of walking the pipes and tiles on every * other row, to improve texture cache locality compared to raster order. 
*/ - for (uint32_t py = 0; py < fb->pipe_count.height; py++) { - uint32_t pipe_row = py * fb->pipe_count.width; - for (uint32_t pipe_row_i = 0; pipe_row_i < fb->pipe_count.width; pipe_row_i++) { + for (uint32_t py = 0; py < tiling->pipe_count.height; py++) { + uint32_t pipe_row = py * tiling->pipe_count.width; + for (uint32_t pipe_row_i = 0; pipe_row_i < tiling->pipe_count.width; pipe_row_i++) { uint32_t px; if (py & 1) - px = fb->pipe_count.width - 1 - pipe_row_i; + px = tiling->pipe_count.width - 1 - pipe_row_i; else px = pipe_row_i; uint32_t pipe = pipe_row + px; - uint32_t tx1 = px * fb->pipe0.width; - uint32_t ty1 = py * fb->pipe0.height; - uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width); - uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height); + uint32_t tx1 = px * tiling->pipe0.width; + uint32_t ty1 = py * tiling->pipe0.height; + uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width); + uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height); uint32_t tile_row_stride = tx2 - tx1; uint32_t slot_row = 0; for (uint32_t ty = ty1; ty < ty2; ty++) { @@ -1500,7 +1503,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu6_tile_render_end(cmd, &cmd->cs, autotune_result); - trace_end_render_pass(&cmd->trace, &cmd->cs, fb); + trace_end_render_pass(&cmd->trace, &cmd->cs, fb, tiling); if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) u_trace_disable_event_range(cmd->trace_renderpass_start, @@ -1526,7 +1529,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); - trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer); + trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer, cmd->state.tiling); } void @@ -1562,6 +1565,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->state.subpass = NULL; cmd_buffer->state.framebuffer = NULL; cmd_buffer->state.attachments = NULL; + cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */ memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); /* LRZ is not valid next time we use it */ @@ -1798,6 +1802,7 @@ tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer, memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); cmd_buffer->state.index_size = 0xff; /* dirty restart index */ cmd_buffer->state.line_mode = RECTANGULAR; + cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* dirty value */ tu_cache_init(&cmd_buffer->state.cache); tu_cache_init(&cmd_buffer->state.renderpass_cache); @@ -1867,6 +1872,12 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; } + + /* We can't set the gmem layout here, because the state.pass only has + * to be compatible (same formats/sample counts) with the primary's + * renderpass, rather than exactly equal. 
+ */ + tu_lrz_begin_secondary_cmdbuf(cmd_buffer); } else { /* When executing in the middle of another command buffer, the CCU @@ -3424,6 +3435,8 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer; cmd->state.attachments = suspended->state.suspended_pass.attachments; cmd->state.render_area = suspended->state.suspended_pass.render_area; + cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout; + cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; cmd->state.lrz = suspended->state.suspended_pass.lrz; } @@ -3866,6 +3879,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) : cmd->state.framebuffer->attachments[i].attachment; } + tu_choose_gmem_layout(cmd); trace_start_render_pass(&cmd->trace, &cmd->cs); @@ -3970,6 +3984,8 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, } } + tu_choose_gmem_layout(cmd); + cmd->state.renderpass_cache.pending_flush_bits = cmd->state.cache.pending_flush_bits; cmd->state.renderpass_cache.flush_bits = 0; @@ -3999,6 +4015,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer; cmd->state.suspended_pass.render_area = cmd->state.render_area; cmd->state.suspended_pass.attachments = cmd->state.attachments; + cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout; } if (!resuming) { @@ -4078,7 +4095,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, tu_store_gmem_attachment(cmd, cs, a, gmem_a, false); - if (pass->attachments[a].gmem_offset < 0) + if (!pass->attachments[a].gmem) continue; /* check if the resolved attachment is needed by later subpasses, diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c index 2ecbb47527baa..27af02469ff02 100644 --- a/src/freedreno/vulkan/tu_pass.c +++ b/src/freedreno/vulkan/tu_pass.c @@ -113,7 +113,8 @@ tu_render_pass_add_subpass_dep(struct tu_render_pass *pass, if (dep_invalid_for_gmem(dep, src_stage_mask, dst_stage_mask)) { perf_debug((struct tu_device *)pass->base.device, "Disabling gmem rendering due to invalid subpass dependency"); - pass->gmem_pixels = 0; + for (int i = 0; i < ARRAY_SIZE(pass->gmem_pixels); i++) + pass->gmem_pixels[i] = 0; } struct tu_subpass_barrier *dst_barrier; @@ -540,103 +541,112 @@ static void tu_render_pass_gmem_config(struct tu_render_pass *pass, const struct tu_physical_device *phys_dev) { - /* From the VK_KHR_multiview spec: - * - * Multiview is all-or-nothing for a render pass - that is, either all - * subpasses must have a non-zero view mask (though some subpasses may - * have only one view) or all must be zero. - * - * This means we only have to check one of the view masks. - */ - if (pass->subpasses[0].multiview_mask) { - /* It seems multiview must use sysmem rendering. */ - pass->gmem_pixels = 0; - return; - } + for (enum tu_gmem_layout layout = 0; layout < TU_GMEM_LAYOUT_COUNT; + layout++) { + /* From the VK_KHR_multiview spec: + * + * Multiview is all-or-nothing for a render pass - that is, either all + * subpasses must have a non-zero view mask (though some subpasses may + * have only one view) or all must be zero. + * + * This means we only have to check one of the view masks. + */ + if (pass->subpasses[0].multiview_mask) { + /* It seems multiview must use sysmem rendering. 
*/ + pass->gmem_pixels[layout] = 0; + continue; + } - uint32_t block_align_shift = 3; /* log2(gmem_align/(tile_align_w*tile_align_h)) */ - uint32_t tile_align_w = phys_dev->info->tile_align_w; - uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * phys_dev->info->tile_align_h; + /* log2(gmem_align/(tile_align_w*tile_align_h)) */ + uint32_t block_align_shift = 3; + uint32_t tile_align_w = phys_dev->info->tile_align_w; + uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * + phys_dev->info->tile_align_h; - /* calculate total bytes per pixel */ - uint32_t cpp_total = 0; - for (uint32_t i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - bool cpp1 = (att->cpp == 1); - if (att->gmem_offset >= 0) { - cpp_total += att->cpp; + /* calculate total bytes per pixel */ + uint32_t cpp_total = 0; + for (uint32_t i = 0; i < pass->attachment_count; i++) { + struct tu_render_pass_attachment *att = &pass->attachments[i]; + bool cpp1 = (att->cpp == 1); + if (att->gmem) { + cpp_total += att->cpp; - /* take into account the separate stencil: */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - cpp1 = (att->samples == 1); - cpp_total += att->samples; - } + /* take into account the separate stencil: */ + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + cpp1 = (att->samples == 1); + cpp_total += att->samples; + } - /* texture pitch must be aligned to 64, use a tile_align_w that is - * a multiple of 64 for cpp==1 attachment to work as input attachment - */ - if (cpp1 && tile_align_w % 64 != 0) { - tile_align_w *= 2; - block_align_shift -= 1; + /* texture pitch must be aligned to 64, use a tile_align_w that is + * a multiple of 64 for cpp==1 attachment to work as input + * attachment + */ + if (cpp1 && tile_align_w % 64 != 0) { + tile_align_w *= 2; + block_align_shift -= 1; + } } } - } - pass->tile_align_w = tile_align_w; + pass->tile_align_w = tile_align_w; - /* no gmem attachments */ - if (cpp_total == 0) { - /* any value non-zero value so tiling config works with no attachments */ - pass->gmem_pixels = 1024*1024; - return; - } - - /* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path - * doesn't break things. maybe there is a better solution? - * TODO: this algorithm isn't optimal - * for example, two attachments with cpp = {1, 4} - * result: nblocks = {12, 52}, pixels = 196608 - * optimal: nblocks = {13, 51}, pixels = 208896 - */ - uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align; - uint32_t offset = 0, pixels = ~0u, i; - for (i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - if (att->gmem_offset < 0) + /* no gmem attachments */ + if (cpp_total == 0) { + /* any value non-zero value so tiling config works with no + * attachments + */ + pass->gmem_pixels[layout] = 1024 * 1024; continue; + } - att->gmem_offset = offset; + /* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path + * doesn't break things. maybe there is a better solution? 
+ * TODO: this algorithm isn't optimal + * for example, two attachments with cpp = {1, 4} + * result: nblocks = {12, 52}, pixels = 196608 + * optimal: nblocks = {13, 51}, pixels = 208896 + */ + uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align; + uint32_t offset = 0, pixels = ~0u, i; + for (i = 0; i < pass->attachment_count; i++) { + struct tu_render_pass_attachment *att = &pass->attachments[i]; + if (!att->gmem) + continue; - uint32_t align = MAX2(1, att->cpp >> block_align_shift); - uint32_t nblocks = MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align); + att->gmem_offset[layout] = offset; - if (nblocks > gmem_blocks) - break; + uint32_t align = MAX2(1, att->cpp >> block_align_shift); + uint32_t nblocks = + MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align); - gmem_blocks -= nblocks; - cpp_total -= att->cpp; - offset += nblocks * gmem_align; - pixels = MIN2(pixels, nblocks * gmem_align / att->cpp); - - /* repeat the same for separate stencil */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - att->gmem_offset_stencil = offset; - - /* note: for s8_uint, block align is always 1 */ - uint32_t nblocks = gmem_blocks * att->samples / cpp_total; if (nblocks > gmem_blocks) break; gmem_blocks -= nblocks; - cpp_total -= att->samples; + cpp_total -= att->cpp; offset += nblocks * gmem_align; - pixels = MIN2(pixels, nblocks * gmem_align / att->samples); - } - } + pixels = MIN2(pixels, nblocks * gmem_align / att->cpp); - /* if the loop didn't complete then the gmem config is impossible */ - if (i == pass->attachment_count) - pass->gmem_pixels = pixels; + /* repeat the same for separate stencil */ + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + att->gmem_offset_stencil[layout] = offset; + + /* note: for s8_uint, block align is always 1 */ + uint32_t nblocks = gmem_blocks * att->samples / cpp_total; + if (nblocks > gmem_blocks) + break; + + gmem_blocks -= nblocks; + cpp_total -= att->samples; + offset += nblocks * gmem_align; + pixels = MIN2(pixels, nblocks * gmem_align / att->samples); + } + } + + /* if the loop didn't complete then the gmem config is impossible */ + if (i == pass->attachment_count) + pass->gmem_pixels[layout] = pixels; + } } static void @@ -737,7 +747,7 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const { struct tu_subpass *subpass = &pass->subpasses[i]; - pass->attachments[a].gmem_offset = 0; + pass->attachments[a].gmem = true; update_samples(subpass, pCreateInfo->pAttachments[a].samples); pass->attachments[a].clear_views |= subpass->multiview_mask; } @@ -786,7 +796,8 @@ tu_CreateRenderPass2(VkDevice _device, att->cpp = 4 * att->samples; else att->cpp = vk_format_get_blocksize(att->format) * att->samples; - att->gmem_offset = -1; + /* Initially not allocated into gmem, tu_subpass_use_attachment() will move it there. 
*/ + att->gmem = false; VkAttachmentLoadOp loadOp = pCreateInfo->pAttachments[i].loadOp; VkAttachmentLoadOp stencilLoadOp = pCreateInfo->pAttachments[i].stencilLoadOp; @@ -916,7 +927,7 @@ tu_CreateRenderPass2(VkDevice _device, /* disable unused attachments */ for (uint32_t i = 0; i < pass->attachment_count; i++) { struct tu_render_pass_attachment *att = &pass->attachments[i]; - if (att->gmem_offset < 0) { + if (!att->gmem) { att->clear_mask = 0; att->load = false; } @@ -1009,7 +1020,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, TU_FROM_HANDLE(tu_image_view, view, att_info->imageView); tu_setup_dynamic_attachment(att, view); - att->gmem_offset = 0; + att->gmem = true; att->clear_views = info->viewMask; attachment_set_ops(device, att, att_info->loadOp, 0, att_info->storeOp, 0); @@ -1024,7 +1035,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, struct tu_render_pass_attachment *resolve_att = &pass->attachments[a]; TU_FROM_HANDLE(tu_image_view, resolve_view, att_info->resolveImageView); tu_setup_dynamic_attachment(resolve_att, resolve_view); - resolve_att->gmem_offset = -1; + resolve_att->gmem = false; attachment_set_ops(device, resolve_att, VK_ATTACHMENT_LOAD_OP_DONT_CARE, 0, VK_ATTACHMENT_STORE_OP_STORE, 0); @@ -1048,7 +1059,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, struct tu_render_pass_attachment *att = &pass->attachments[a]; tu_setup_dynamic_attachment(att, view); - att->gmem_offset = 0; + att->gmem = true; att->clear_views = info->viewMask; subpass->depth_stencil_attachment.attachment = a++; @@ -1066,7 +1077,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, TU_FROM_HANDLE(tu_image_view, resolve_view, common_info->resolveImageView); tu_setup_dynamic_attachment(resolve_att, resolve_view); - resolve_att->gmem_offset = -1; + resolve_att->gmem = false; attachment_set_ops(device, resolve_att, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index a68b712f12d04..ee075b6a963af 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -942,14 +942,7 @@ struct tu_attachment_info struct tu_image_view *attachment; }; -struct tu_framebuffer -{ - struct vk_object_base base; - - uint32_t width; - uint32_t height; - uint32_t layers; - +struct tu_tiling_config { /* size of the first tile */ VkExtent2D tile0; /* number of tiles */ @@ -969,6 +962,27 @@ struct tu_framebuffer /* pipe register values */ uint32_t pipe_config[MAX_VSC_PIPES]; uint32_t pipe_sizes[MAX_VSC_PIPES]; +}; + +enum tu_gmem_layout +{ + /* Use all of GMEM for attachments */ + TU_GMEM_LAYOUT_FULL, + /* Avoid using the region of GMEM that the CCU needs */ + TU_GMEM_LAYOUT_AVOID_CCU, + /* Number of layouts we have, also the value set when we don't know the layout in a secondary. 
*/ + TU_GMEM_LAYOUT_COUNT, +}; + +struct tu_framebuffer +{ + struct vk_object_base base; + + uint32_t width; + uint32_t height; + uint32_t layers; + + struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT]; uint32_t attachment_count; struct tu_attachment_info attachments[0]; @@ -1031,7 +1045,8 @@ struct tu_render_pass_attachment uint32_t clear_views; bool load; bool store; - int32_t gmem_offset; + bool gmem; + int32_t gmem_offset[TU_GMEM_LAYOUT_COUNT]; bool will_be_resolved; /* for D32S8 separate stencil: */ bool load_stencil; @@ -1040,7 +1055,7 @@ struct tu_render_pass_attachment bool cond_load_allowed; bool cond_store_allowed; - int32_t gmem_offset_stencil; + int32_t gmem_offset_stencil[TU_GMEM_LAYOUT_COUNT]; }; struct tu_render_pass @@ -1049,7 +1064,7 @@ struct tu_render_pass uint32_t attachment_count; uint32_t subpass_count; - uint32_t gmem_pixels; + uint32_t gmem_pixels[TU_GMEM_LAYOUT_COUNT]; uint32_t tile_align_w; /* memory bandwidth costs (in bytes) for gmem / sysmem rendering */ @@ -1425,9 +1440,15 @@ struct tu_cmd_state enum tu_cmd_ccu_state ccu_state; + /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU + * might get used by tu_store_gmem_attachment(). + */ + enum tu_gmem_layout gmem_layout; + const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; + const struct tu_tiling_config *tiling; VkRect2D render_area; const struct tu_image_view **attachments; @@ -1442,6 +1463,7 @@ struct tu_cmd_state const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; VkRect2D render_area; + enum tu_gmem_layout gmem_layout; const struct tu_image_view **attachments; @@ -1645,6 +1667,22 @@ struct tu_cmd_buffer uint32_t vsc_prim_strm_pitch; }; +static inline uint32_t +tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd, + const struct tu_render_pass_attachment *att) +{ + assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT); + return att->gmem_offset[cmd->state.gmem_layout]; +} + +static inline uint32_t +tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd, + const struct tu_render_pass_attachment *att) +{ + assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT); + return att->gmem_offset_stencil[cmd->state.gmem_layout]; +} + /* Temporary struct for tracking a register state to be written, used by * a6xx-pack.h and tu_cs_emit_regs() */ @@ -2054,6 +2092,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t gmem_a, bool cond_exec_allowed); +void +tu_choose_gmem_layout(struct tu_cmd_buffer *cmd); + enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); struct tu_native_format diff --git a/src/freedreno/vulkan/tu_tracepoints.py b/src/freedreno/vulkan/tu_tracepoints.py index ba65a6ae64543..9b3049285f6fa 100644 --- a/src/freedreno/vulkan/tu_tracepoints.py +++ b/src/freedreno/vulkan/tu_tracepoints.py @@ -76,14 +76,15 @@ def begin_end_tp(name, args=[], tp_struct=None, tp_print=None, begin_end_tp('render_pass', - args=[ArgStruct(type='const struct tu_framebuffer *', var='fb')], + args=[ArgStruct(type='const struct tu_framebuffer *', var='fb'), + ArgStruct(type='const struct tu_tiling_config *', var='tiling')], tp_struct=[Arg(type='uint16_t', name='width', var='fb->width', c_format='%u'), Arg(type='uint16_t', name='height', var='fb->height', c_format='%u'), Arg(type='uint8_t', name='MRTs', var='fb->attachment_count', c_format='%u'), # Arg(type='uint8_t', name='samples', var='fb->samples', c_format='%u'), - Arg(type='uint16_t', name='numberOfBins', var='fb->tile_count.width * 
fb->tile_count.height', c_format='%u'), - Arg(type='uint16_t', name='binWidth', var='fb->tile0.width', c_format='%u'), - Arg(type='uint16_t', name='binHeight', var='fb->tile0.height', c_format='%u')]) + Arg(type='uint16_t', name='numberOfBins', var='tiling->tile_count.width * tiling->tile_count.height', c_format='%u'), + Arg(type='uint16_t', name='binWidth', var='tiling->tile0.width', c_format='%u'), + Arg(type='uint16_t', name='binHeight', var='tiling->tile0.height', c_format='%u')]) begin_end_tp('binning_ib') begin_end_tp('draw_ib_sysmem') diff --git a/src/freedreno/vulkan/tu_util.c b/src/freedreno/vulkan/tu_util.c index 5be9100702b19..970c8a4fe676b 100644 --- a/src/freedreno/vulkan/tu_util.c +++ b/src/freedreno/vulkan/tu_util.c @@ -82,19 +82,21 @@ __vk_startup_errorf(struct tu_instance *instance, static void tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb, const struct tu_device *dev, - const struct tu_render_pass *pass) + const struct tu_render_pass *pass, + enum tu_gmem_layout gmem_layout) { const uint32_t tile_align_w = pass->tile_align_w; const uint32_t tile_align_h = dev->physical_device->info->tile_align_h; const uint32_t max_tile_width = dev->physical_device->info->tile_max_w; const uint32_t max_tile_height = dev->physical_device->info->tile_max_h; + struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; /* start from 1 tile */ - fb->tile_count = (VkExtent2D) { + tiling->tile_count = (VkExtent2D) { .width = 1, .height = 1, }; - fb->tile0 = (VkExtent2D) { + tiling->tile0 = (VkExtent2D) { .width = util_align_npot(fb->width, tile_align_w), .height = align(fb->height, tile_align_h), }; @@ -102,138 +104,138 @@ tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb, /* will force to sysmem, don't bother trying to have a valid tile config * TODO: just skip all GMEM stuff when sysmem is forced? 
*/ - if (!pass->gmem_pixels) + if (!pass->gmem_pixels[gmem_layout]) return; if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) { /* start with 2x2 tiles */ - fb->tile_count.width = 2; - fb->tile_count.height = 2; - fb->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w); - fb->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h); + tiling->tile_count.width = 2; + tiling->tile_count.height = 2; + tiling->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w); + tiling->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h); } /* do not exceed max tile width */ - while (fb->tile0.width > max_tile_width) { - fb->tile_count.width++; - fb->tile0.width = - util_align_npot(DIV_ROUND_UP(fb->width, fb->tile_count.width), tile_align_w); + while (tiling->tile0.width > max_tile_width) { + tiling->tile_count.width++; + tiling->tile0.width = + util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w); } /* do not exceed max tile height */ - while (fb->tile0.height > max_tile_height) { - fb->tile_count.height++; - fb->tile0.height = - util_align_npot(DIV_ROUND_UP(fb->height, fb->tile_count.height), tile_align_h); + while (tiling->tile0.height > max_tile_height) { + tiling->tile_count.height++; + tiling->tile0.height = + util_align_npot(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h); } /* do not exceed gmem size */ - while (fb->tile0.width * fb->tile0.height > pass->gmem_pixels) { - if (fb->tile0.width > MAX2(tile_align_w, fb->tile0.height)) { - fb->tile_count.width++; - fb->tile0.width = - util_align_npot(DIV_ROUND_UP(fb->width, fb->tile_count.width), tile_align_w); + while (tiling->tile0.width * tiling->tile0.height > pass->gmem_pixels[gmem_layout]) { + if (tiling->tile0.width > MAX2(tile_align_w, tiling->tile0.height)) { + tiling->tile_count.width++; + tiling->tile0.width = + util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w); } else { /* if this assert fails then layout is impossible.. 
*/ - assert(fb->tile0.height > tile_align_h); - fb->tile_count.height++; - fb->tile0.height = - align(DIV_ROUND_UP(fb->height, fb->tile_count.height), tile_align_h); + assert(tiling->tile0.height > tile_align_h); + tiling->tile_count.height++; + tiling->tile0.height = + align(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h); } } } static void -tu_tiling_config_update_pipe_layout(struct tu_framebuffer *fb, +tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling, const struct tu_device *dev) { const uint32_t max_pipe_count = 32; /* A6xx */ /* start from 1 tile per pipe */ - fb->pipe0 = (VkExtent2D) { + tiling->pipe0 = (VkExtent2D) { .width = 1, .height = 1, }; - fb->pipe_count = fb->tile_count; + tiling->pipe_count = tiling->tile_count; - while (fb->pipe_count.width * fb->pipe_count.height > max_pipe_count) { - if (fb->pipe0.width < fb->pipe0.height) { - fb->pipe0.width += 1; - fb->pipe_count.width = - DIV_ROUND_UP(fb->tile_count.width, fb->pipe0.width); + while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) { + if (tiling->pipe0.width < tiling->pipe0.height) { + tiling->pipe0.width += 1; + tiling->pipe_count.width = + DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width); } else { - fb->pipe0.height += 1; - fb->pipe_count.height = - DIV_ROUND_UP(fb->tile_count.height, fb->pipe0.height); + tiling->pipe0.height += 1; + tiling->pipe_count.height = + DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height); } } } static void -tu_tiling_config_update_pipes(struct tu_framebuffer *fb, +tu_tiling_config_update_pipes(struct tu_tiling_config *tiling, const struct tu_device *dev) { const uint32_t max_pipe_count = 32; /* A6xx */ const uint32_t used_pipe_count = - fb->pipe_count.width * fb->pipe_count.height; + tiling->pipe_count.width * tiling->pipe_count.height; const VkExtent2D last_pipe = { - .width = (fb->tile_count.width - 1) % fb->pipe0.width + 1, - .height = (fb->tile_count.height - 1) % fb->pipe0.height + 1, + .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1, + .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1, }; assert(used_pipe_count <= max_pipe_count); - assert(max_pipe_count <= ARRAY_SIZE(fb->pipe_config)); + assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config)); - for (uint32_t y = 0; y < fb->pipe_count.height; y++) { - for (uint32_t x = 0; x < fb->pipe_count.width; x++) { - const uint32_t pipe_x = fb->pipe0.width * x; - const uint32_t pipe_y = fb->pipe0.height * y; - const uint32_t pipe_w = (x == fb->pipe_count.width - 1) + for (uint32_t y = 0; y < tiling->pipe_count.height; y++) { + for (uint32_t x = 0; x < tiling->pipe_count.width; x++) { + const uint32_t pipe_x = tiling->pipe0.width * x; + const uint32_t pipe_y = tiling->pipe0.height * y; + const uint32_t pipe_w = (x == tiling->pipe_count.width - 1) ? last_pipe.width - : fb->pipe0.width; - const uint32_t pipe_h = (y == fb->pipe_count.height - 1) + : tiling->pipe0.width; + const uint32_t pipe_h = (y == tiling->pipe_count.height - 1) ? 
last_pipe.height - : fb->pipe0.height; - const uint32_t n = fb->pipe_count.width * y + x; + : tiling->pipe0.height; + const uint32_t n = tiling->pipe_count.width * y + x; - fb->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) | + tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) | A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) | A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) | A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h); - fb->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h); + tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h); } } - memset(fb->pipe_config + used_pipe_count, 0, + memset(tiling->pipe_config + used_pipe_count, 0, sizeof(uint32_t) * (max_pipe_count - used_pipe_count)); } static bool -is_hw_binning_possible(const struct tu_framebuffer *fb) +is_hw_binning_possible(const struct tu_tiling_config *tiling) { /* Similar to older gens, # of tiles per pipe cannot be more than 32. * But there are no hangs with 16 or more tiles per pipe in either * X or Y direction, so that limit does not seem to apply. */ - uint32_t tiles_per_pipe = fb->pipe0.width * fb->pipe0.height; + uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height; return tiles_per_pipe <= 32; } static void -tu_tiling_config_update_binning(struct tu_framebuffer *fb, const struct tu_device *device) +tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device) { - fb->binning_possible = is_hw_binning_possible(fb); + tiling->binning_possible = is_hw_binning_possible(tiling); - if (fb->binning_possible) { - fb->binning = (fb->tile_count.width * fb->tile_count.height) > 2; + if (tiling->binning_possible) { + tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2; if (unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) - fb->binning = true; + tiling->binning = true; if (unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN)) - fb->binning = false; + tiling->binning = false; } else { - fb->binning = false; + tiling->binning = false; } } @@ -242,10 +244,13 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb, const struct tu_device *device, const struct tu_render_pass *pass) { - tu_tiling_config_update_tile_layout(fb, device, pass); - tu_tiling_config_update_pipe_layout(fb, device); - tu_tiling_config_update_pipes(fb, device); - tu_tiling_config_update_binning(fb, device); + for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) { + struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; + tu_tiling_config_update_tile_layout(fb, device, pass, gmem_layout); + tu_tiling_config_update_pipe_layout(tiling, device); + tu_tiling_config_update_pipes(tiling, device); + tu_tiling_config_update_binning(tiling, device); + } } void
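
Note: the TODO in tu_render_pass_gmem_config() about the non-optimal block split can be checked with a small standalone program. The sketch below only illustrates the greedy proportional split used there; the constants (64 blocks of a 16384-byte gmem_align, block_align_shift = 3) are assumptions picked so that the cpp = {1, 4} case reproduces the numbers quoted in the comment, not values read from a real device.

#include <stdint.h>
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
   /* Assumed for illustration: 1 MiB of attachment GMEM split into 64
    * blocks of gmem_align = 16384 bytes, with block_align_shift = 3.
    */
   const uint32_t gmem_align = 16384;
   const uint32_t block_align_shift = 3;
   uint32_t gmem_blocks = 64;

   const uint32_t cpp[] = { 1, 4 };   /* the attachments from the TODO */
   uint32_t cpp_total = 1 + 4;
   uint32_t pixels = ~0u;

   for (int i = 0; i < 2; i++) {
      /* same greedy proportional split as the loop in
       * tu_render_pass_gmem_config()
       */
      uint32_t align = MAX2(1, cpp[i] >> block_align_shift);
      uint32_t nblocks =
         MAX2((gmem_blocks * cpp[i] / cpp_total) & ~(align - 1), align);

      gmem_blocks -= nblocks;
      cpp_total -= cpp[i];
      pixels = MIN2(pixels, nblocks * gmem_align / cpp[i]);

      printf("attachment %d: nblocks = %u\n", i, nblocks);
   }

   printf("pixels = %u\n", pixels);
   return 0;
}

With these inputs the greedy split yields nblocks = {12, 52} and 196608 usable pixels, while hand-picking {13, 51} would give 208896, matching the figures in the TODO.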
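
Likewise, the tile-shrinking loop in tu_tiling_config_update_tile_layout() is easier to follow with concrete numbers. The sketch below mirrors its structure under assumed values (a 1920x1080 framebuffer, 64x16 tile alignment, a 1024x1008 maximum tile size, and the 196608-pixel budget from the previous example); the real driver takes these from the framebuffer, the device info, and pass->gmem_pixels[layout], and uses align()/util_align_npot() rather than the simplified macro here.

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_NPOT(v, a)   (DIV_ROUND_UP(v, a) * (a))
#define MAX2(a, b)         ((a) > (b) ? (a) : (b))

int
main(void)
{
   /* Assumed framebuffer size, alignment and limits for illustration */
   const uint32_t fb_w = 1920, fb_h = 1080;
   const uint32_t tile_align_w = 64, tile_align_h = 16;
   const uint32_t max_tile_w = 1024, max_tile_h = 1008;
   const uint32_t gmem_pixels = 196608;

   /* start from a single tile covering the aligned framebuffer */
   uint32_t count_w = 1, count_h = 1;
   uint32_t tile_w = ALIGN_NPOT(fb_w, tile_align_w);
   uint32_t tile_h = ALIGN_NPOT(fb_h, tile_align_h);

   /* do not exceed the maximum hardware tile size */
   while (tile_w > max_tile_w) {
      count_w++;
      tile_w = ALIGN_NPOT(DIV_ROUND_UP(fb_w, count_w), tile_align_w);
   }
   while (tile_h > max_tile_h) {
      count_h++;
      tile_h = ALIGN_NPOT(DIV_ROUND_UP(fb_h, count_h), tile_align_h);
   }

   /* do not exceed the per-pixel GMEM budget, preferring to shrink the
    * wider dimension
    */
   while (tile_w * tile_h > gmem_pixels) {
      if (tile_w > MAX2(tile_align_w, tile_h)) {
         count_w++;
         tile_w = ALIGN_NPOT(DIV_ROUND_UP(fb_w, count_w), tile_align_w);
      } else {
         count_h++;
         tile_h = ALIGN_NPOT(DIV_ROUND_UP(fb_h, count_h), tile_align_h);
      }
   }

   printf("%ux%u tiles of %ux%u pixels\n", count_w, count_h, tile_w, tile_h);
   return 0;
}

For these inputs the loop settles on a 4x3 grid of 512x368 tiles, at which point the tile area (188416 pixels) fits within the gmem_pixels budget.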