diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index 57288379cd8e7..3272f9c5bcebb 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -1039,7 +1039,7 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd, desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); desc[2] = A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | - A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp); + A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp); desc[3] = 0; desc[4] = cmd->device->physical_device->gmem_base + gmem_offset; desc[5] = A6XX_TEX_CONST_5_DEPTH(1); @@ -2717,13 +2717,14 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, enum pipe_format format = tu_vk_format_to_pipe_format(att->format); if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) - clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, att->gmem_offset, value); + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, tu_attachment_gmem_offset(cmd, att), value); if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) - clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value); + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, tu_attachment_gmem_offset_stencil(cmd, att), value); return; } - clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), att->gmem_offset, value); + clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), + tu_attachment_gmem_offset(cmd, att), value); trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples); } @@ -2789,12 +2790,15 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, tu_lrz_disable_during_renderpass(cmd); } - /* vkCmdClearAttachments is supposed to respect the predicate if active. - * The easiest way to do this is to always use the 3d path, which always - * works even with GMEM because it's just a simple draw using the existing + /* vkCmdClearAttachments is supposed to respect the predicate if active. The + * easiest way to do this is to always use the 3d path, which always works + * even with GMEM because it's just a simple draw using the existing * attachment state. + * + * Similarly, we also use the 3D path when in a secondary command buffer that + * doesn't know the GMEM layout that will be chosen by the primary. */ - if (cmd->state.predication_active) { + if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) { tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); return; } @@ -2981,10 +2985,10 @@ tu_emit_blit(struct tu_cmd_buffer *cmd, if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) { tu_cs_emit_regs(cs, - A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil)); + A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment))); } else { tu_cs_emit_regs(cs, - A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset)); + A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment))); } tu6_emit_event_write(cmd, cs, BLIT); @@ -3156,7 +3160,7 @@ store_cp_blit(struct tu_cmd_buffer *cmd, /* note: src size does not matter when not scaling */ A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff), A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset), - A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp)); + A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.tiling->tile0.width * cpp)); /* sync GMEM writes with CACHE. 
*/ tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); @@ -3243,6 +3247,61 @@ store_3d_blit(struct tu_cmd_buffer *cmd, CP_SCRATCH_TO_REG_0_CNT(1 - 1)); } +static bool +tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a) +{ + struct tu_physical_device *phys_dev = cmd->device->physical_device; + const struct tu_image_view *iview = cmd->state.attachments[a]; + const VkRect2D *render_area = &cmd->state.render_area; + + /* Unaligned stores are incredibly rare in CTS, so we have to force them in order to test this path. */ + if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE)) + return true; + + uint32_t x1 = render_area->offset.x; + uint32_t y1 = render_area->offset.y; + uint32_t x2 = x1 + render_area->extent.width; + uint32_t y2 = y1 + render_area->extent.height; + /* x2/y2 can be unaligned if equal to the size of the image, since it will + * write into padding space. The one exception is linear levels, which don't + * have the required y padding in the layout (except for the last level). + */ + bool need_y2_align = + y2 != iview->view.height || iview->view.need_y2_align; + + return (x1 % phys_dev->info->gmem_align_w || + (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) || + y1 % phys_dev->info->gmem_align_h || + (y2 % phys_dev->info->gmem_align_h && need_y2_align)); +} + +/* Choose the GMEM layout (use the CCU space or not) based on whether the + * current attachments will need to use the CCU. This has to happen at vkBeginRenderPass() + * time because tu_attachment_store_unaligned() looks at the image views, which + * are only available at that point. This should match the logic for the + * !unaligned case in tu_store_gmem_attachment(). + */ +void +tu_choose_gmem_layout(struct tu_cmd_buffer *cmd) +{ + cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL; + + for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) { + if (!cmd->state.attachments[i]) + continue; + + struct tu_render_pass_attachment *att = + &cmd->state.pass->attachments[i]; + if ((att->store || att->store_stencil) && + tu_attachment_store_unaligned(cmd, i)) + cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; + if (att->will_be_resolved && !blit_can_resolve(att->format)) + cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; + } + + cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; +} + void tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3250,7 +3309,6 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t gmem_a, bool cond_exec_allowed) { - struct tu_physical_device *phys_dev = cmd->device->physical_device; const VkRect2D *render_area = &cmd->state.render_area; struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a]; const struct tu_image_view *iview = cmd->state.attachments[a]; @@ -3267,26 +3325,7 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, tu_begin_load_store_cond_exec(cmd, cs, false); } - uint32_t x1 = render_area->offset.x; - uint32_t y1 = render_area->offset.y; - uint32_t x2 = x1 + render_area->extent.width; - uint32_t y2 = y1 + render_area->extent.height; - /* x2/y2 can be unaligned if equal to the size of the image, - * since it will write into padding space - * the one exception is linear levels which don't have the - * required y padding in the layout (except for the last level) - */ - bool need_y2_align = - y2 != iview->view.height || iview->view.need_y2_align; - - bool unaligned = - x1 % phys_dev->info->gmem_align_w || - (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) || - y1 % 
phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align); - - /* Unaligned store is incredibly rare in CTS, we have to force it to test. */ - if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_UNALIGNED_STORE)) - unaligned = true; + bool unaligned = tu_attachment_store_unaligned(cmd, a); /* D32_SFLOAT_S8_UINT is quite special format: it has two planes, * one for depth and other for stencil. When resolving a MSAA @@ -3324,6 +3363,8 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, return; } + assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU); + enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format); if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) src_format = PIPE_FORMAT_Z32_FLOAT; @@ -3345,23 +3386,23 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, if (store_common) { store_3d_blit(cmd, cs, iview, dst->samples, false, src_format, - dst_format, render_area, src->gmem_offset, src->cpp); + dst_format, render_area, tu_attachment_gmem_offset(cmd, src), src->cpp); } if (store_separate_stencil) { store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT, PIPE_FORMAT_S8_UINT, render_area, - src->gmem_offset_stencil, src->samples); + tu_attachment_gmem_offset_stencil(cmd, src), src->samples); } } else { r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); if (store_common) { store_cp_blit(cmd, cs, iview, src->samples, false, src_format, - dst_format, src->gmem_offset, src->cpp); + dst_format, tu_attachment_gmem_offset(cmd, src), src->cpp); } if (store_separate_stencil) { store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT, - PIPE_FORMAT_S8_UINT, src->gmem_offset_stencil, src->samples); + PIPE_FORMAT_S8_UINT, tu_attachment_gmem_offset_stencil(cmd, src), src->samples); } } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index d6e2608ce1b84..b89e03abd1ab8 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -235,7 +235,7 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, tu_cs_image_depth_ref(cs, iview, 0); else tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, attachment->gmem_offset); + tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment)); tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); @@ -250,10 +250,10 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value); if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { tu_cs_image_stencil_ref(cs, iview, 0); - tu_cs_emit(cs, attachment->gmem_offset_stencil); + tu_cs_emit(cs, tu_attachment_gmem_offset_stencil(cmd, attachment)); } else { tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, attachment->gmem_offset); + tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment)); } } else { tu_cs_emit_regs(cs, @@ -294,7 +294,7 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6); tu_cs_emit(cs, iview->view.RB_MRT_BUF_INFO); tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset); + tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, &cmd->state.pass->attachments[a])); tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(i, .dword = iview->view.SP_FS_MRT_REG)); @@ -565,6 +565,7 @@ static bool use_hw_binning(struct tu_cmd_buffer *cmd) { const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout]; /* XFB 
commands are emitted for BINNING || SYSMEM, which makes it * incompatible with non-hw binning GMEM rendering. this is required because @@ -573,7 +574,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd) * XFB was used. */ if (cmd->state.rp.xfb_used) { - assert(fb->binning_possible); + assert(tiling->binning_possible); return true; } @@ -584,11 +585,11 @@ use_hw_binning(struct tu_cmd_buffer *cmd) */ if (cmd->state.rp.has_prim_generated_query_in_rp || cmd->state.prim_generated_query_running_before_rp) { - assert(fb->binning_possible); + assert(tiling->binning_possible); return true; } - return fb->binning; + return tiling->binning; } static bool @@ -599,7 +600,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return true; /* can't fit attachments into gmem */ - if (!cmd->state.pass->gmem_pixels) + if (!cmd->state.pass->gmem_pixels[cmd->state.gmem_layout]) return true; if (cmd->state.framebuffer->layers > 1) @@ -617,7 +618,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return true; /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */ - if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible) + if (cmd->state.rp.xfb_used && !cmd->state.tiling->binning_possible) return true; /* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning @@ -625,7 +626,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, */ if ((cmd->state.rp.has_prim_generated_query_in_rp || cmd->state.prim_generated_query_running_before_rp) && - !cmd->state.framebuffer->binning_possible) + !cmd->state.tiling->binning_possible) return true; if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_GMEM)) @@ -649,7 +650,7 @@ static void tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t pipe, uint32_t slot, bool wfm) { - if (cmd->state.framebuffer->binning_possible) { + if (cmd->state.tiling->binning_possible) { tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) | A6XX_CP_REG_TEST_0_BIT(slot) | @@ -664,15 +665,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM)); - const uint32_t x1 = fb->tile0.width * tx; - const uint32_t y1 = fb->tile0.height * ty; - const uint32_t x2 = MIN2(x1 + fb->tile0.width - 1, MAX_VIEWPORT_SIZE - 1); - const uint32_t y2 = MIN2(y1 + fb->tile0.height - 1, MAX_VIEWPORT_SIZE - 1); + const uint32_t x1 = tiling->tile0.width * tx; + const uint32_t y1 = tiling->tile0.height * ty; + const uint32_t x2 = MIN2(x1 + tiling->tile0.width - 1, MAX_VIEWPORT_SIZE - 1); + const uint32_t y2 = MIN2(y1 + tiling->tile0.height - 1, MAX_VIEWPORT_SIZE - 1); tu6_emit_window_scissor(cs, x1, y1, x2, y2); tu6_emit_window_offset(cs, x1, y1); @@ -685,7 +686,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, 0x0); tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4); - tu_cs_emit(cs, fb->pipe_sizes[pipe] | + tu_cs_emit(cs, tiling->pipe_sizes[pipe] | CP_SET_BIN_DATA5_0_VSC_N(slot)); tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch); tu_cs_emit(cs, pipe * 4); @@ -769,7 +770,7 @@ tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu6_emit_blit_scissor(cmd, cs, true); for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu_load_gmem_attachment(cmd, cs, i, 
cmd->state.framebuffer->binning, false); + tu_load_gmem_attachment(cmd, cs, i, cmd->state.tiling->binning, false); } static void @@ -787,8 +788,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu6_emit_blit_scissor(cmd, cs, true); for (uint32_t a = 0; a < pass->attachment_count; ++a) { - if (pass->attachments[a].gmem_offset >= 0) - tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.framebuffer->binning_possible); + if (pass->attachments[a].gmem) + tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.tiling->binning_possible); } if (subpass->resolve_attachments) { @@ -965,18 +966,18 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) static void update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; tu_cs_emit_regs(cs, - A6XX_VSC_BIN_SIZE(.width = fb->tile0.width, - .height = fb->tile0.height)); + A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width, + .height = tiling->tile0.height)); tu_cs_emit_regs(cs, - A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width, - .ny = fb->tile_count.height)); + A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width, + .ny = tiling->tile_count.height)); tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); - tu_cs_emit_array(cs, fb->pipe_config, 32); + tu_cs_emit_array(cs, tiling->pipe_config, 32); tu_cs_emit_regs(cs, A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch), @@ -990,9 +991,9 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) static void emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; const uint32_t used_pipe_count = - fb->pipe_count.width * fb->pipe_count.height; + tiling->pipe_count.width * tiling->pipe_count.height; for (int i = 0; i < used_pipe_count; i++) { tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); @@ -1110,6 +1111,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass, bool gmem) { + const struct tu_tiling_config *tiling = cmd->state.tiling; + /* note: we can probably emit input attachments just once for the whole * renderpass, this would avoid emitting both sysmem/gmem versions * @@ -1140,7 +1143,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i]; - uint32_t gmem_offset = att->gmem_offset; + uint32_t gmem_offset = tu_attachment_gmem_offset(cmd, att); uint32_t cpp = att->cpp; memcpy(dst, iview->view.descriptor, A6XX_TEX_CONST_DWORDS * 4); @@ -1198,7 +1201,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32; cpp = att->samples; - gmem_offset = att->gmem_offset_stencil; + gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout]; } if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem) @@ -1209,7 +1212,7 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); dst[2] = A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | - A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp); + A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp); dst[3] = 0; dst[4] = cmd->device->physical_device->gmem_base + gmem_offset; dst[5] = A6XX_TEX_CONST_5_DEPTH(1); @@ -1336,7 +1339,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_renderpass_result 
*autotune_result) { struct tu_physical_device *phys_dev = cmd->device->physical_device; - + const struct tu_tiling_config *tiling = cmd->state.tiling; tu_lrz_tiling_begin(cmd, cs); tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); @@ -1344,9 +1347,8 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); - const struct tu_framebuffer *fb = cmd->state.framebuffer; if (use_hw_binning(cmd)) { - tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) | A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); @@ -1354,7 +1356,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu6_emit_binning_pass(cmd, cs); - tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS | A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); @@ -1370,14 +1372,14 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x1); } else { - tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); - if (fb->binning_possible) { + if (tiling->binning_possible) { /* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since * the actual binner didn't run. */ - int pipe_count = fb->pipe_count.width * fb->pipe_count.height; + int pipe_count = tiling->pipe_count.width * tiling->pipe_count.height; tu_cs_emit_pkt4(cs, REG_A6XX_VSC_STATE_REG(0), pipe_count); for (int i = 0; i < pipe_count; i++) tu_cs_emit(cs, ~0); @@ -1453,6 +1455,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, struct tu_renderpass_result *autotune_result) { const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; /* Create gmem stores now (at EndRenderPass time)) because they needed to * know whether to allow their conditional execution, which was tied to a @@ -1468,19 +1471,19 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, /* Note: we reverse the order of walking the pipes and tiles on every * other row, to improve texture cache locality compared to raster order. 
*/ - for (uint32_t py = 0; py < fb->pipe_count.height; py++) { - uint32_t pipe_row = py * fb->pipe_count.width; - for (uint32_t pipe_row_i = 0; pipe_row_i < fb->pipe_count.width; pipe_row_i++) { + for (uint32_t py = 0; py < tiling->pipe_count.height; py++) { + uint32_t pipe_row = py * tiling->pipe_count.width; + for (uint32_t pipe_row_i = 0; pipe_row_i < tiling->pipe_count.width; pipe_row_i++) { uint32_t px; if (py & 1) - px = fb->pipe_count.width - 1 - pipe_row_i; + px = tiling->pipe_count.width - 1 - pipe_row_i; else px = pipe_row_i; uint32_t pipe = pipe_row + px; - uint32_t tx1 = px * fb->pipe0.width; - uint32_t ty1 = py * fb->pipe0.height; - uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width); - uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height); + uint32_t tx1 = px * tiling->pipe0.width; + uint32_t ty1 = py * tiling->pipe0.height; + uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width); + uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height); uint32_t tile_row_stride = tx2 - tx1; uint32_t slot_row = 0; for (uint32_t ty = ty1; ty < ty2; ty++) { @@ -1500,7 +1503,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu6_tile_render_end(cmd, &cmd->cs, autotune_result); - trace_end_render_pass(&cmd->trace, &cmd->cs, fb); + trace_end_render_pass(&cmd->trace, &cmd->cs, fb, tiling); if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) u_trace_disable_event_range(cmd->trace_renderpass_start, @@ -1526,7 +1529,7 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); - trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer); + trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer, cmd->state.tiling); } void @@ -1562,6 +1565,7 @@ static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->state.subpass = NULL; cmd_buffer->state.framebuffer = NULL; cmd_buffer->state.attachments = NULL; + cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */ memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); /* LRZ is not valid next time we use it */ @@ -1798,6 +1802,7 @@ tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer, memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); cmd_buffer->state.index_size = 0xff; /* dirty restart index */ cmd_buffer->state.line_mode = RECTANGULAR; + cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* dirty value */ tu_cache_init(&cmd_buffer->state.cache); tu_cache_init(&cmd_buffer->state.renderpass_cache); @@ -1867,6 +1872,12 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; } + + /* We can't set the gmem layout here, because the state.pass only has + * to be compatible (same formats/sample counts) with the primary's + * renderpass, rather than exactly equal. 
+ */ + tu_lrz_begin_secondary_cmdbuf(cmd_buffer); } else { /* When executing in the middle of another command buffer, the CCU @@ -3424,6 +3435,8 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer; cmd->state.attachments = suspended->state.suspended_pass.attachments; cmd->state.render_area = suspended->state.suspended_pass.render_area; + cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout; + cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; cmd->state.lrz = suspended->state.suspended_pass.lrz; } @@ -3866,6 +3879,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) : cmd->state.framebuffer->attachments[i].attachment; } + tu_choose_gmem_layout(cmd); trace_start_render_pass(&cmd->trace, &cmd->cs); @@ -3970,6 +3984,8 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, } } + tu_choose_gmem_layout(cmd); + cmd->state.renderpass_cache.pending_flush_bits = cmd->state.cache.pending_flush_bits; cmd->state.renderpass_cache.flush_bits = 0; @@ -3999,6 +4015,7 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer; cmd->state.suspended_pass.render_area = cmd->state.render_area; cmd->state.suspended_pass.attachments = cmd->state.attachments; + cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout; } if (!resuming) { @@ -4078,7 +4095,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, tu_store_gmem_attachment(cmd, cs, a, gmem_a, false); - if (pass->attachments[a].gmem_offset < 0) + if (!pass->attachments[a].gmem) continue; /* check if the resolved attachment is needed by later subpasses, diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c index 2ecbb47527baa..27af02469ff02 100644 --- a/src/freedreno/vulkan/tu_pass.c +++ b/src/freedreno/vulkan/tu_pass.c @@ -113,7 +113,8 @@ tu_render_pass_add_subpass_dep(struct tu_render_pass *pass, if (dep_invalid_for_gmem(dep, src_stage_mask, dst_stage_mask)) { perf_debug((struct tu_device *)pass->base.device, "Disabling gmem rendering due to invalid subpass dependency"); - pass->gmem_pixels = 0; + for (int i = 0; i < ARRAY_SIZE(pass->gmem_pixels); i++) + pass->gmem_pixels[i] = 0; } struct tu_subpass_barrier *dst_barrier; @@ -540,103 +541,112 @@ static void tu_render_pass_gmem_config(struct tu_render_pass *pass, const struct tu_physical_device *phys_dev) { - /* From the VK_KHR_multiview spec: - * - * Multiview is all-or-nothing for a render pass - that is, either all - * subpasses must have a non-zero view mask (though some subpasses may - * have only one view) or all must be zero. - * - * This means we only have to check one of the view masks. - */ - if (pass->subpasses[0].multiview_mask) { - /* It seems multiview must use sysmem rendering. */ - pass->gmem_pixels = 0; - return; - } + for (enum tu_gmem_layout layout = 0; layout < TU_GMEM_LAYOUT_COUNT; + layout++) { + /* From the VK_KHR_multiview spec: + * + * Multiview is all-or-nothing for a render pass - that is, either all + * subpasses must have a non-zero view mask (though some subpasses may + * have only one view) or all must be zero. + * + * This means we only have to check one of the view masks. + */ + if (pass->subpasses[0].multiview_mask) { + /* It seems multiview must use sysmem rendering. 
*/ + pass->gmem_pixels[layout] = 0; + continue; + } - uint32_t block_align_shift = 3; /* log2(gmem_align/(tile_align_w*tile_align_h)) */ - uint32_t tile_align_w = phys_dev->info->tile_align_w; - uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * phys_dev->info->tile_align_h; + /* log2(gmem_align/(tile_align_w*tile_align_h)) */ + uint32_t block_align_shift = 3; + uint32_t tile_align_w = phys_dev->info->tile_align_w; + uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * + phys_dev->info->tile_align_h; - /* calculate total bytes per pixel */ - uint32_t cpp_total = 0; - for (uint32_t i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - bool cpp1 = (att->cpp == 1); - if (att->gmem_offset >= 0) { - cpp_total += att->cpp; + /* calculate total bytes per pixel */ + uint32_t cpp_total = 0; + for (uint32_t i = 0; i < pass->attachment_count; i++) { + struct tu_render_pass_attachment *att = &pass->attachments[i]; + bool cpp1 = (att->cpp == 1); + if (att->gmem) { + cpp_total += att->cpp; - /* take into account the separate stencil: */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - cpp1 = (att->samples == 1); - cpp_total += att->samples; - } + /* take into account the separate stencil: */ + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + cpp1 = (att->samples == 1); + cpp_total += att->samples; + } - /* texture pitch must be aligned to 64, use a tile_align_w that is - * a multiple of 64 for cpp==1 attachment to work as input attachment - */ - if (cpp1 && tile_align_w % 64 != 0) { - tile_align_w *= 2; - block_align_shift -= 1; + /* texture pitch must be aligned to 64, use a tile_align_w that is + * a multiple of 64 for cpp==1 attachment to work as input + * attachment + */ + if (cpp1 && tile_align_w % 64 != 0) { + tile_align_w *= 2; + block_align_shift -= 1; + } } } - } - pass->tile_align_w = tile_align_w; + pass->tile_align_w = tile_align_w; - /* no gmem attachments */ - if (cpp_total == 0) { - /* any value non-zero value so tiling config works with no attachments */ - pass->gmem_pixels = 1024*1024; - return; - } - - /* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path - * doesn't break things. maybe there is a better solution? - * TODO: this algorithm isn't optimal - * for example, two attachments with cpp = {1, 4} - * result: nblocks = {12, 52}, pixels = 196608 - * optimal: nblocks = {13, 51}, pixels = 208896 - */ - uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align; - uint32_t offset = 0, pixels = ~0u, i; - for (i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - if (att->gmem_offset < 0) + /* no gmem attachments */ + if (cpp_total == 0) { + /* any value non-zero value so tiling config works with no + * attachments + */ + pass->gmem_pixels[layout] = 1024 * 1024; continue; + } - att->gmem_offset = offset; + /* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path + * doesn't break things. maybe there is a better solution? 
+ * TODO: this algorithm isn't optimal + * for example, two attachments with cpp = {1, 4} + * result: nblocks = {12, 52}, pixels = 196608 + * optimal: nblocks = {13, 51}, pixels = 208896 + */ + uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align; + uint32_t offset = 0, pixels = ~0u, i; + for (i = 0; i < pass->attachment_count; i++) { + struct tu_render_pass_attachment *att = &pass->attachments[i]; + if (!att->gmem) + continue; - uint32_t align = MAX2(1, att->cpp >> block_align_shift); - uint32_t nblocks = MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align); + att->gmem_offset[layout] = offset; - if (nblocks > gmem_blocks) - break; + uint32_t align = MAX2(1, att->cpp >> block_align_shift); + uint32_t nblocks = + MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align); - gmem_blocks -= nblocks; - cpp_total -= att->cpp; - offset += nblocks * gmem_align; - pixels = MIN2(pixels, nblocks * gmem_align / att->cpp); - - /* repeat the same for separate stencil */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - att->gmem_offset_stencil = offset; - - /* note: for s8_uint, block align is always 1 */ - uint32_t nblocks = gmem_blocks * att->samples / cpp_total; if (nblocks > gmem_blocks) break; gmem_blocks -= nblocks; - cpp_total -= att->samples; + cpp_total -= att->cpp; offset += nblocks * gmem_align; - pixels = MIN2(pixels, nblocks * gmem_align / att->samples); - } - } + pixels = MIN2(pixels, nblocks * gmem_align / att->cpp); - /* if the loop didn't complete then the gmem config is impossible */ - if (i == pass->attachment_count) - pass->gmem_pixels = pixels; + /* repeat the same for separate stencil */ + if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + att->gmem_offset_stencil[layout] = offset; + + /* note: for s8_uint, block align is always 1 */ + uint32_t nblocks = gmem_blocks * att->samples / cpp_total; + if (nblocks > gmem_blocks) + break; + + gmem_blocks -= nblocks; + cpp_total -= att->samples; + offset += nblocks * gmem_align; + pixels = MIN2(pixels, nblocks * gmem_align / att->samples); + } + } + + /* if the loop didn't complete then the gmem config is impossible */ + if (i == pass->attachment_count) + pass->gmem_pixels[layout] = pixels; + } } static void @@ -737,7 +747,7 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const { struct tu_subpass *subpass = &pass->subpasses[i]; - pass->attachments[a].gmem_offset = 0; + pass->attachments[a].gmem = true; update_samples(subpass, pCreateInfo->pAttachments[a].samples); pass->attachments[a].clear_views |= subpass->multiview_mask; } @@ -786,7 +796,8 @@ tu_CreateRenderPass2(VkDevice _device, att->cpp = 4 * att->samples; else att->cpp = vk_format_get_blocksize(att->format) * att->samples; - att->gmem_offset = -1; + /* Initially not allocated into gmem, tu_subpass_use_attachment() will move it there. 
*/ + att->gmem = false; VkAttachmentLoadOp loadOp = pCreateInfo->pAttachments[i].loadOp; VkAttachmentLoadOp stencilLoadOp = pCreateInfo->pAttachments[i].stencilLoadOp; @@ -916,7 +927,7 @@ tu_CreateRenderPass2(VkDevice _device, /* disable unused attachments */ for (uint32_t i = 0; i < pass->attachment_count; i++) { struct tu_render_pass_attachment *att = &pass->attachments[i]; - if (att->gmem_offset < 0) { + if (!att->gmem) { att->clear_mask = 0; att->load = false; } @@ -1009,7 +1020,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, TU_FROM_HANDLE(tu_image_view, view, att_info->imageView); tu_setup_dynamic_attachment(att, view); - att->gmem_offset = 0; + att->gmem = true; att->clear_views = info->viewMask; attachment_set_ops(device, att, att_info->loadOp, 0, att_info->storeOp, 0); @@ -1024,7 +1035,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, struct tu_render_pass_attachment *resolve_att = &pass->attachments[a]; TU_FROM_HANDLE(tu_image_view, resolve_view, att_info->resolveImageView); tu_setup_dynamic_attachment(resolve_att, resolve_view); - resolve_att->gmem_offset = -1; + resolve_att->gmem = false; attachment_set_ops(device, resolve_att, VK_ATTACHMENT_LOAD_OP_DONT_CARE, 0, VK_ATTACHMENT_STORE_OP_STORE, 0); @@ -1048,7 +1059,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, struct tu_render_pass_attachment *att = &pass->attachments[a]; tu_setup_dynamic_attachment(att, view); - att->gmem_offset = 0; + att->gmem = true; att->clear_views = info->viewMask; subpass->depth_stencil_attachment.attachment = a++; @@ -1066,7 +1077,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, TU_FROM_HANDLE(tu_image_view, resolve_view, common_info->resolveImageView); tu_setup_dynamic_attachment(resolve_att, resolve_view); - resolve_att->gmem_offset = -1; + resolve_att->gmem = false; attachment_set_ops(device, resolve_att, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index a68b712f12d04..ee075b6a963af 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -942,14 +942,7 @@ struct tu_attachment_info struct tu_image_view *attachment; }; -struct tu_framebuffer -{ - struct vk_object_base base; - - uint32_t width; - uint32_t height; - uint32_t layers; - +struct tu_tiling_config { /* size of the first tile */ VkExtent2D tile0; /* number of tiles */ @@ -969,6 +962,27 @@ struct tu_framebuffer /* pipe register values */ uint32_t pipe_config[MAX_VSC_PIPES]; uint32_t pipe_sizes[MAX_VSC_PIPES]; +}; + +enum tu_gmem_layout +{ + /* Use all of GMEM for attachments */ + TU_GMEM_LAYOUT_FULL, + /* Avoid using the region of GMEM that the CCU needs */ + TU_GMEM_LAYOUT_AVOID_CCU, + /* Number of layouts we have, also the value set when we don't know the layout in a secondary. 
*/ + TU_GMEM_LAYOUT_COUNT, +}; + +struct tu_framebuffer +{ + struct vk_object_base base; + + uint32_t width; + uint32_t height; + uint32_t layers; + + struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT]; uint32_t attachment_count; struct tu_attachment_info attachments[0]; @@ -1031,7 +1045,8 @@ struct tu_render_pass_attachment uint32_t clear_views; bool load; bool store; - int32_t gmem_offset; + bool gmem; + int32_t gmem_offset[TU_GMEM_LAYOUT_COUNT]; bool will_be_resolved; /* for D32S8 separate stencil: */ bool load_stencil; @@ -1040,7 +1055,7 @@ struct tu_render_pass_attachment bool cond_load_allowed; bool cond_store_allowed; - int32_t gmem_offset_stencil; + int32_t gmem_offset_stencil[TU_GMEM_LAYOUT_COUNT]; }; struct tu_render_pass @@ -1049,7 +1064,7 @@ struct tu_render_pass uint32_t attachment_count; uint32_t subpass_count; - uint32_t gmem_pixels; + uint32_t gmem_pixels[TU_GMEM_LAYOUT_COUNT]; uint32_t tile_align_w; /* memory bandwidth costs (in bytes) for gmem / sysmem rendering */ @@ -1425,9 +1440,15 @@ struct tu_cmd_state enum tu_cmd_ccu_state ccu_state; + /* Decides which GMEM layout to use from the tu_pass, based on whether the CCU + * might get used by tu_store_gmem_attachment(). + */ + enum tu_gmem_layout gmem_layout; + const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; + const struct tu_tiling_config *tiling; VkRect2D render_area; const struct tu_image_view **attachments; @@ -1442,6 +1463,7 @@ struct tu_cmd_state const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; VkRect2D render_area; + enum tu_gmem_layout gmem_layout; const struct tu_image_view **attachments; @@ -1645,6 +1667,22 @@ struct tu_cmd_buffer uint32_t vsc_prim_strm_pitch; }; +static inline uint32_t +tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd, + const struct tu_render_pass_attachment *att) +{ + assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT); + return att->gmem_offset[cmd->state.gmem_layout]; +} + +static inline uint32_t +tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd, + const struct tu_render_pass_attachment *att) +{ + assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT); + return att->gmem_offset_stencil[cmd->state.gmem_layout]; +} + /* Temporary struct for tracking a register state to be written, used by * a6xx-pack.h and tu_cs_emit_regs() */ @@ -2054,6 +2092,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t gmem_a, bool cond_exec_allowed); +void +tu_choose_gmem_layout(struct tu_cmd_buffer *cmd); + enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); struct tu_native_format diff --git a/src/freedreno/vulkan/tu_tracepoints.py b/src/freedreno/vulkan/tu_tracepoints.py index ba65a6ae64543..9b3049285f6fa 100644 --- a/src/freedreno/vulkan/tu_tracepoints.py +++ b/src/freedreno/vulkan/tu_tracepoints.py @@ -76,14 +76,15 @@ def begin_end_tp(name, args=[], tp_struct=None, tp_print=None, begin_end_tp('render_pass', - args=[ArgStruct(type='const struct tu_framebuffer *', var='fb')], + args=[ArgStruct(type='const struct tu_framebuffer *', var='fb'), + ArgStruct(type='const struct tu_tiling_config *', var='tiling')], tp_struct=[Arg(type='uint16_t', name='width', var='fb->width', c_format='%u'), Arg(type='uint16_t', name='height', var='fb->height', c_format='%u'), Arg(type='uint8_t', name='MRTs', var='fb->attachment_count', c_format='%u'), # Arg(type='uint8_t', name='samples', var='fb->samples', c_format='%u'), - Arg(type='uint16_t', name='numberOfBins', var='fb->tile_count.width * 
fb->tile_count.height', c_format='%u'), - Arg(type='uint16_t', name='binWidth', var='fb->tile0.width', c_format='%u'), - Arg(type='uint16_t', name='binHeight', var='fb->tile0.height', c_format='%u')]) + Arg(type='uint16_t', name='numberOfBins', var='tiling->tile_count.width * tiling->tile_count.height', c_format='%u'), + Arg(type='uint16_t', name='binWidth', var='tiling->tile0.width', c_format='%u'), + Arg(type='uint16_t', name='binHeight', var='tiling->tile0.height', c_format='%u')]) begin_end_tp('binning_ib') begin_end_tp('draw_ib_sysmem') diff --git a/src/freedreno/vulkan/tu_util.c b/src/freedreno/vulkan/tu_util.c index 5be9100702b19..970c8a4fe676b 100644 --- a/src/freedreno/vulkan/tu_util.c +++ b/src/freedreno/vulkan/tu_util.c @@ -82,19 +82,21 @@ __vk_startup_errorf(struct tu_instance *instance, static void tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb, const struct tu_device *dev, - const struct tu_render_pass *pass) + const struct tu_render_pass *pass, + enum tu_gmem_layout gmem_layout) { const uint32_t tile_align_w = pass->tile_align_w; const uint32_t tile_align_h = dev->physical_device->info->tile_align_h; const uint32_t max_tile_width = dev->physical_device->info->tile_max_w; const uint32_t max_tile_height = dev->physical_device->info->tile_max_h; + struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; /* start from 1 tile */ - fb->tile_count = (VkExtent2D) { + tiling->tile_count = (VkExtent2D) { .width = 1, .height = 1, }; - fb->tile0 = (VkExtent2D) { + tiling->tile0 = (VkExtent2D) { .width = util_align_npot(fb->width, tile_align_w), .height = align(fb->height, tile_align_h), }; @@ -102,138 +104,138 @@ tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb, /* will force to sysmem, don't bother trying to have a valid tile config * TODO: just skip all GMEM stuff when sysmem is forced? 
*/ - if (!pass->gmem_pixels) + if (!pass->gmem_pixels[gmem_layout]) return; if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) { /* start with 2x2 tiles */ - fb->tile_count.width = 2; - fb->tile_count.height = 2; - fb->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w); - fb->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h); + tiling->tile_count.width = 2; + tiling->tile_count.height = 2; + tiling->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w); + tiling->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h); } /* do not exceed max tile width */ - while (fb->tile0.width > max_tile_width) { - fb->tile_count.width++; - fb->tile0.width = - util_align_npot(DIV_ROUND_UP(fb->width, fb->tile_count.width), tile_align_w); + while (tiling->tile0.width > max_tile_width) { + tiling->tile_count.width++; + tiling->tile0.width = + util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w); } /* do not exceed max tile height */ - while (fb->tile0.height > max_tile_height) { - fb->tile_count.height++; - fb->tile0.height = - util_align_npot(DIV_ROUND_UP(fb->height, fb->tile_count.height), tile_align_h); + while (tiling->tile0.height > max_tile_height) { + tiling->tile_count.height++; + tiling->tile0.height = + util_align_npot(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h); } /* do not exceed gmem size */ - while (fb->tile0.width * fb->tile0.height > pass->gmem_pixels) { - if (fb->tile0.width > MAX2(tile_align_w, fb->tile0.height)) { - fb->tile_count.width++; - fb->tile0.width = - util_align_npot(DIV_ROUND_UP(fb->width, fb->tile_count.width), tile_align_w); + while (tiling->tile0.width * tiling->tile0.height > pass->gmem_pixels[gmem_layout]) { + if (tiling->tile0.width > MAX2(tile_align_w, tiling->tile0.height)) { + tiling->tile_count.width++; + tiling->tile0.width = + util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w); } else { /* if this assert fails then layout is impossible.. 
*/ - assert(fb->tile0.height > tile_align_h); - fb->tile_count.height++; - fb->tile0.height = - align(DIV_ROUND_UP(fb->height, fb->tile_count.height), tile_align_h); + assert(tiling->tile0.height > tile_align_h); + tiling->tile_count.height++; + tiling->tile0.height = + align(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h); } } } static void -tu_tiling_config_update_pipe_layout(struct tu_framebuffer *fb, +tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling, const struct tu_device *dev) { const uint32_t max_pipe_count = 32; /* A6xx */ /* start from 1 tile per pipe */ - fb->pipe0 = (VkExtent2D) { + tiling->pipe0 = (VkExtent2D) { .width = 1, .height = 1, }; - fb->pipe_count = fb->tile_count; + tiling->pipe_count = tiling->tile_count; - while (fb->pipe_count.width * fb->pipe_count.height > max_pipe_count) { - if (fb->pipe0.width < fb->pipe0.height) { - fb->pipe0.width += 1; - fb->pipe_count.width = - DIV_ROUND_UP(fb->tile_count.width, fb->pipe0.width); + while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) { + if (tiling->pipe0.width < tiling->pipe0.height) { + tiling->pipe0.width += 1; + tiling->pipe_count.width = + DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width); } else { - fb->pipe0.height += 1; - fb->pipe_count.height = - DIV_ROUND_UP(fb->tile_count.height, fb->pipe0.height); + tiling->pipe0.height += 1; + tiling->pipe_count.height = + DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height); } } } static void -tu_tiling_config_update_pipes(struct tu_framebuffer *fb, +tu_tiling_config_update_pipes(struct tu_tiling_config *tiling, const struct tu_device *dev) { const uint32_t max_pipe_count = 32; /* A6xx */ const uint32_t used_pipe_count = - fb->pipe_count.width * fb->pipe_count.height; + tiling->pipe_count.width * tiling->pipe_count.height; const VkExtent2D last_pipe = { - .width = (fb->tile_count.width - 1) % fb->pipe0.width + 1, - .height = (fb->tile_count.height - 1) % fb->pipe0.height + 1, + .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1, + .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1, }; assert(used_pipe_count <= max_pipe_count); - assert(max_pipe_count <= ARRAY_SIZE(fb->pipe_config)); + assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config)); - for (uint32_t y = 0; y < fb->pipe_count.height; y++) { - for (uint32_t x = 0; x < fb->pipe_count.width; x++) { - const uint32_t pipe_x = fb->pipe0.width * x; - const uint32_t pipe_y = fb->pipe0.height * y; - const uint32_t pipe_w = (x == fb->pipe_count.width - 1) + for (uint32_t y = 0; y < tiling->pipe_count.height; y++) { + for (uint32_t x = 0; x < tiling->pipe_count.width; x++) { + const uint32_t pipe_x = tiling->pipe0.width * x; + const uint32_t pipe_y = tiling->pipe0.height * y; + const uint32_t pipe_w = (x == tiling->pipe_count.width - 1) ? last_pipe.width - : fb->pipe0.width; - const uint32_t pipe_h = (y == fb->pipe_count.height - 1) + : tiling->pipe0.width; + const uint32_t pipe_h = (y == tiling->pipe_count.height - 1) ? 
last_pipe.height - : fb->pipe0.height; - const uint32_t n = fb->pipe_count.width * y + x; + : tiling->pipe0.height; + const uint32_t n = tiling->pipe_count.width * y + x; - fb->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) | + tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) | A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) | A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) | A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h); - fb->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h); + tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h); } } - memset(fb->pipe_config + used_pipe_count, 0, + memset(tiling->pipe_config + used_pipe_count, 0, sizeof(uint32_t) * (max_pipe_count - used_pipe_count)); } static bool -is_hw_binning_possible(const struct tu_framebuffer *fb) +is_hw_binning_possible(const struct tu_tiling_config *tiling) { /* Similar to older gens, # of tiles per pipe cannot be more than 32. * But there are no hangs with 16 or more tiles per pipe in either * X or Y direction, so that limit does not seem to apply. */ - uint32_t tiles_per_pipe = fb->pipe0.width * fb->pipe0.height; + uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height; return tiles_per_pipe <= 32; } static void -tu_tiling_config_update_binning(struct tu_framebuffer *fb, const struct tu_device *device) +tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device) { - fb->binning_possible = is_hw_binning_possible(fb); + tiling->binning_possible = is_hw_binning_possible(tiling); - if (fb->binning_possible) { - fb->binning = (fb->tile_count.width * fb->tile_count.height) > 2; + if (tiling->binning_possible) { + tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2; if (unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) - fb->binning = true; + tiling->binning = true; if (unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN)) - fb->binning = false; + tiling->binning = false; } else { - fb->binning = false; + tiling->binning = false; } } @@ -242,10 +244,13 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb, const struct tu_device *device, const struct tu_render_pass *pass) { - tu_tiling_config_update_tile_layout(fb, device, pass); - tu_tiling_config_update_pipe_layout(fb, device); - tu_tiling_config_update_pipes(fb, device); - tu_tiling_config_update_binning(fb, device); + for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) { + struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; + tu_tiling_config_update_tile_layout(fb, device, pass, gmem_layout); + tu_tiling_config_update_pipe_layout(tiling, device); + tu_tiling_config_update_pipes(tiling, device); + tu_tiling_config_update_binning(tiling, device); + } } void
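
Note: the TODO in tu_render_pass_gmem_config() about the non-optimal block split can be checked with a small standalone program. The sketch below only illustrates the greedy proportional split used there; the constants (64 blocks of a 16384-byte gmem_align, block_align_shift = 3) are assumptions picked so that the cpp = {1, 4} case reproduces the numbers quoted in the comment, not values read from a real device.

#include <stdint.h>
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
   /* Assumed for illustration: 1 MiB of attachment GMEM split into 64
    * blocks of gmem_align = 16384 bytes, with block_align_shift = 3.
    */
   const uint32_t gmem_align = 16384;
   const uint32_t block_align_shift = 3;
   uint32_t gmem_blocks = 64;

   const uint32_t cpp[] = { 1, 4 };   /* the attachments from the TODO */
   uint32_t cpp_total = 1 + 4;
   uint32_t pixels = ~0u;

   for (int i = 0; i < 2; i++) {
      /* same greedy proportional split as the loop in
       * tu_render_pass_gmem_config()
       */
      uint32_t align = MAX2(1, cpp[i] >> block_align_shift);
      uint32_t nblocks =
         MAX2((gmem_blocks * cpp[i] / cpp_total) & ~(align - 1), align);

      gmem_blocks -= nblocks;
      cpp_total -= cpp[i];
      pixels = MIN2(pixels, nblocks * gmem_align / cpp[i]);

      printf("attachment %d: nblocks = %u\n", i, nblocks);
   }

   printf("pixels = %u\n", pixels);
   return 0;
}

With these inputs the greedy split yields nblocks = {12, 52} and 196608 usable pixels, while hand-picking {13, 51} would give 208896, matching the figures in the TODO.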
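
Likewise, the tile-shrinking loop in tu_tiling_config_update_tile_layout() is easier to follow with concrete numbers. The sketch below mirrors its structure under assumed values (a 1920x1080 framebuffer, 64x16 tile alignment, a 1024x1008 maximum tile size, and the 196608-pixel budget from the previous example); the real driver takes these from the framebuffer, the device info, and pass->gmem_pixels[layout], and uses align()/util_align_npot() rather than the simplified macro here.

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_NPOT(v, a)   (DIV_ROUND_UP(v, a) * (a))
#define MAX2(a, b)         ((a) > (b) ? (a) : (b))

int
main(void)
{
   /* Assumed framebuffer size, alignment and limits for illustration */
   const uint32_t fb_w = 1920, fb_h = 1080;
   const uint32_t tile_align_w = 64, tile_align_h = 16;
   const uint32_t max_tile_w = 1024, max_tile_h = 1008;
   const uint32_t gmem_pixels = 196608;

   /* start from a single tile covering the aligned framebuffer */
   uint32_t count_w = 1, count_h = 1;
   uint32_t tile_w = ALIGN_NPOT(fb_w, tile_align_w);
   uint32_t tile_h = ALIGN_NPOT(fb_h, tile_align_h);

   /* do not exceed the maximum hardware tile size */
   while (tile_w > max_tile_w) {
      count_w++;
      tile_w = ALIGN_NPOT(DIV_ROUND_UP(fb_w, count_w), tile_align_w);
   }
   while (tile_h > max_tile_h) {
      count_h++;
      tile_h = ALIGN_NPOT(DIV_ROUND_UP(fb_h, count_h), tile_align_h);
   }

   /* do not exceed the per-pixel GMEM budget, preferring to shrink the
    * wider dimension
    */
   while (tile_w * tile_h > gmem_pixels) {
      if (tile_w > MAX2(tile_align_w, tile_h)) {
         count_w++;
         tile_w = ALIGN_NPOT(DIV_ROUND_UP(fb_w, count_w), tile_align_w);
      } else {
         count_h++;
         tile_h = ALIGN_NPOT(DIV_ROUND_UP(fb_h, count_h), tile_align_h);
      }
   }

   printf("%ux%u tiles of %ux%u pixels\n", count_w, count_h, tile_w, tile_h);
   return 0;
}

For these inputs the loop settles on a 4x3 grid of 512x368 tiles, at which point the tile area (188416 pixels) fits within the gmem_pixels budget.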