From 3aa20a4409eedce70acc282cec12f5af12498b33 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Wed, 29 Jun 2022 11:03:25 +0200 Subject: [PATCH] tu: Split out some state into a separate struct These bits of state will have to be treated specially when suspending/resuming a render pass, because they will need to be tracked across command buffers. Part-of: --- src/freedreno/vulkan/tu_autotune.c | 12 ++--- src/freedreno/vulkan/tu_cmd_buffer.c | 77 ++++++++++++---------------- src/freedreno/vulkan/tu_private.h | 76 +++++++++++++++------------ src/freedreno/vulkan/tu_query.c | 4 +- 4 files changed, 86 insertions(+), 83 deletions(-) diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c index 80c732ce972..4feee5430c7 100644 --- a/src/freedreno/vulkan/tu_autotune.c +++ b/src/freedreno/vulkan/tu_autotune.c @@ -480,7 +480,7 @@ fallback_use_bypass(const struct tu_render_pass *pass, const struct tu_framebuffer *framebuffer, const struct tu_cmd_buffer *cmd_buffer) { - if (cmd_buffer->state.drawcall_count > 5) + if (cmd_buffer->state.rp.drawcall_count > 5) return false; for (unsigned i = 0; i < pass->subpass_count; i++) { @@ -504,12 +504,12 @@ estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd, { const struct tu_cmd_state *state = &cmd->state; - if (!state->drawcall_count) + if (!state->rp.drawcall_count) return 0; /* sample count times drawcall_bandwidth_per_sample */ return (uint64_t)avg_renderpass_sample_count * - state->drawcall_bandwidth_per_sample_sum / state->drawcall_count; + state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count; } bool @@ -583,12 +583,12 @@ tu_autotune_use_bypass(struct tu_autotune *at, if (TU_AUTOTUNE_DEBUG_LOG) { const VkExtent2D *extent = &cmd_buffer->state.render_area.extent; const float drawcall_bandwidth_per_sample = - (float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum / - cmd_buffer->state.drawcall_count; + (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum / + cmd_buffer->state.rp.drawcall_count; mesa_logi("autotune %016" PRIx64 ":%u selecting %s", renderpass_key, - cmd_buffer->state.drawcall_count, + cmd_buffer->state.rp.drawcall_count, select_sysmem ? "sysmem" : "gmem"); mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64, avg_samples, diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 3b45a7a72bb..2efe79ee4eb 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -569,7 +569,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd) * use_sysmem_rendering() should have made sure we only ended up here if no * XFB was used. */ - if (cmd->state.xfb_used) { + if (cmd->state.rp.xfb_used) { assert(fb->binning_possible); return true; } @@ -579,7 +579,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd) * be multiplied by tile count. 
* See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131 */ - if (cmd->state.has_prim_generated_query_in_rp || + if (cmd->state.rp.has_prim_generated_query_in_rp || cmd->state.prim_generated_query_running_before_rp) { assert(fb->binning_possible); return true; @@ -607,20 +607,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, cmd->state.render_area.extent.height == 0) return true; - if (cmd->state.has_tess) + if (cmd->state.rp.has_tess) return true; - if (cmd->state.disable_gmem) + if (cmd->state.rp.disable_gmem) return true; /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */ - if (cmd->state.xfb_used && !cmd->state.framebuffer->binning_possible) + if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible) return true; /* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning * GMEM rendering, see use_hw_binning. */ - if ((cmd->state.has_prim_generated_query_in_rp || + if ((cmd->state.rp.has_prim_generated_query_in_rp || cmd->state.prim_generated_query_running_before_rp) && !cmd->state.framebuffer->binning_possible) return true; @@ -1411,7 +1411,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, } /* Predicate is changed in draw_cs so we have to re-emit it */ - if (cmd->state.draw_cs_writes_to_cond_pred) + if (cmd->state.rp.draw_cs_writes_to_cond_pred) tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false); tu_cs_emit_call(cs, &cmd->tile_store_cs); @@ -2270,7 +2270,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, tu_cond_exec_end(cs); - cmd->state.xfb_used = true; + cmd->state.rp.xfb_used = true; } VKAPI_ATTR void VKAPI_CALL @@ -2396,7 +2396,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, } if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) { - cmd->state.has_tess = true; + cmd->state.rp.has_tess = true; /* maximum number of patches that can fit in tess factor/param buffers */ uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type), @@ -3331,6 +3331,21 @@ tu_flush_for_stage(struct tu_cache_state *cache, } } +static void +tu_render_pass_state_merge(struct tu_render_pass_state *dst, + const struct tu_render_pass_state *src) +{ + dst->xfb_used |= src->xfb_used; + dst->has_tess |= src->has_tess; + dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp; + dst->disable_gmem |= src->disable_gmem; + dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred; + + dst->drawcall_count += src->drawcall_count; + dst->drawcall_bandwidth_per_sample_sum += + src->drawcall_bandwidth_per_sample_sum; +} + VKAPI_ATTR void VKAPI_CALL tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, @@ -3370,28 +3385,13 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, break; } - if (secondary->state.has_tess) { - cmd->state.has_tess = true; - } - if (secondary->state.disable_gmem) - cmd->state.disable_gmem = true; - if (secondary->state.xfb_used) - cmd->state.xfb_used = true; - if (secondary->state.has_prim_generated_query_in_rp) - cmd->state.has_prim_generated_query_in_rp = true; - - cmd->state.drawcall_count += secondary->state.drawcall_count; - cmd->state.drawcall_bandwidth_per_sample_sum += - secondary->state.drawcall_bandwidth_per_sample_sum; - - cmd->state.draw_cs_writes_to_cond_pred |= - secondary->state.draw_cs_writes_to_cond_pred; - /* If LRZ was made invalid in secondary - we should disable * LRZ retroactively for the whole renderpass. 
*/ if (!secondary->state.lrz.valid) cmd->state.lrz.valid = false; + + tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp); } else { assert(tu_cs_is_empty(&secondary->draw_cs)); assert(tu_cs_is_empty(&secondary->draw_epilogue_cs)); @@ -3588,8 +3588,6 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, return; } - cmd->state.draw_cs_writes_to_cond_pred = false; - for (unsigned i = 0; i < pass->attachment_count; i++) { cmd->state.attachments[i] = pAttachmentInfo ? tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) : @@ -3635,8 +3633,6 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer, cmd->state.attachments = cmd->dynamic_attachments; - cmd->state.draw_cs_writes_to_cond_pred = false; - for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) { uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment; if (!pRenderingInfo->pColorAttachments[i].imageView) @@ -4020,23 +4016,23 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, const struct tu_pipeline *pipeline = cmd->state.pipeline; /* Fill draw stats for autotuner */ - cmd->state.drawcall_count++; + cmd->state.rp.drawcall_count++; - cmd->state.drawcall_bandwidth_per_sample_sum += + cmd->state.rp.drawcall_bandwidth_per_sample_sum += cmd->state.pipeline->color_bandwidth_per_sample; /* add depth memory bandwidth cost */ const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample; if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE) - cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth; + cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth; if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) - cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth; + cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth; /* add stencil memory bandwidth cost */ const uint32_t stencil_bandwidth = cmd->state.pipeline->stencil_cpp_per_sample; if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE) - cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2; + cmd->state.rp.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2; tu_emit_cache_flush_renderpass(cmd, cs); @@ -4813,7 +4809,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace); - if (cmd_buffer->state.has_tess) + if (cmd_buffer->state.rp.has_tess) tu6_lazy_emit_tessfactor_addr(cmd_buffer); struct tu_renderpass_result *autotune_result = NULL; @@ -4839,12 +4835,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->state.subpass = NULL; cmd_buffer->state.framebuffer = NULL; cmd_buffer->state.attachments = NULL; - cmd_buffer->state.has_tess = false; - cmd_buffer->state.xfb_used = false; - cmd_buffer->state.disable_gmem = false; - cmd_buffer->state.drawcall_count = 0; - cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0; - cmd_buffer->state.has_prim_generated_query_in_rp = false; + memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); /* LRZ is not valid next time we use it */ cmd_buffer->state.lrz.valid = false; @@ -4969,7 +4960,7 @@ tu_barrier(struct tu_cmd_buffer *cmd, */ if ((srcStage & ~framebuffer_space_stages) || (dstStage & ~framebuffer_space_stages)) { - cmd->state.disable_gmem = true; + cmd->state.rp.disable_gmem = true; } } diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index d73a8344164..55afea180b1 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -1285,6 
+1285,48 @@ struct tu_vs_params { uint32_t first_instance; }; +/* This should be for state that is set inside a renderpass and used at + * renderpass end time, e.g. to decide whether to use sysmem. This needs + * special handling for secondary cmdbufs and suspending/resuming render + * passes where the state may need to be combined afterwards. + */ +struct tu_render_pass_state +{ + bool xfb_used; + bool has_tess; + bool has_prim_generated_query_in_rp; + bool disable_gmem; + + /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */ + bool draw_cs_writes_to_cond_pred; + + uint32_t drawcall_count; + + /* A calculated "draw cost" value for renderpass, which tries to + * estimate the bandwidth-per-sample of all the draws according + * to: + * + * foreach_draw (...) { + * sum += pipeline->color_bandwidth_per_sample; + * if (depth_test_enabled) + * sum += pipeline->depth_cpp_per_sample; + * if (depth_write_enabled) + * sum += pipeline->depth_cpp_per_sample; + * if (stencil_write_enabled) + * sum += pipeline->stencil_cpp_per_sample * 2; + * } + * drawcall_bandwidth_per_sample = sum / drawcall_count; + * + * It allows us to estimate the total bandwidth of drawcalls later, by + * calculating (drawcall_bandwidth_per_sample * zpass_sample_count). + * + * This does ignore depth buffer traffic for samples which do not + * pass due to depth-test fail, and some other details. But it is + * just intended to be a rough estimate that is easy to calculate. + */ + uint32_t drawcall_bandwidth_per_sample_sum; +}; + struct tu_cmd_state { uint32_t dirty; @@ -1292,6 +1334,8 @@ struct tu_cmd_state struct tu_pipeline *pipeline; struct tu_pipeline *compute_pipeline; + struct tu_render_pass_state rp; + /* Vertex buffers, viewports, and scissors * the states for these can be updated partially, so we need to save these * to be able to emit a complete draw state @@ -1358,43 +1402,12 @@ struct tu_cmd_state VkRect2D render_area; const struct tu_image_view **attachments; - /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */ - bool draw_cs_writes_to_cond_pred; - bool xfb_used; - bool has_tess; bool tessfactor_addr_set; bool predication_active; - bool disable_gmem; enum a5xx_line_mode line_mode; bool z_negative_one_to_one; - uint32_t drawcall_count; - - /* A calculated "draw cost" value for renderpass, which tries to - * estimate the bandwidth-per-sample of all the draws according - * to: - * - * foreach_draw (...) { - * sum += pipeline->color_bandwidth_per_sample; - * if (depth_test_enabled) - * sum += pipeline->depth_cpp_per_sample; - * if (depth_write_enabled) - * sum += pipeline->depth_cpp_per_sample; - * if (stencil_write_enabled) - * sum += pipeline->stencil_cpp_per_sample * 2; - * } - * drawcall_bandwidth_per_sample = sum / drawcall_count; - * - * It allows us to estimate the total bandwidth of drawcalls later, by - * calculating (drawcall_bandwidth_per_sample * zpass_sample_count). - * - * This does ignore depth buffer traffic for samples which do not - * pass due to depth-test fail, and some other details. But it is - * just intended to be a rough estimate that is easy to calculate. - */ - uint32_t drawcall_bandwidth_per_sample_sum; - /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaniously, * but they use the same {START,STOP}_PRIMITIVE_CTRS control. 
@@ -1402,7 +1415,6 @@ struct tu_cmd_state uint32_t prim_counters_running; bool prim_generated_query_running_before_rp; - bool has_prim_generated_query_in_rp; struct tu_lrz_state lrz; diff --git a/src/freedreno/vulkan/tu_query.c b/src/freedreno/vulkan/tu_query.c index 3a13d37bce6..3ce688d5633 100644 --- a/src/freedreno/vulkan/tu_query.c +++ b/src/freedreno/vulkan/tu_query.c @@ -932,7 +932,7 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf, uint32_t last_pass = ~0; if (cmdbuf->state.pass) { - cmdbuf->state.draw_cs_writes_to_cond_pred = true; + cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true; } /* Querying perf counters happens in these steps: @@ -1023,7 +1023,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin); if (cmdbuf->state.pass) { - cmdbuf->state.has_prim_generated_query_in_rp = true; + cmdbuf->state.rp.has_prim_generated_query_in_rp = true; } else { cmdbuf->state.prim_generated_query_running_before_rp = true; }
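
As a quick illustration of the semantics this patch introduces, here is a minimal
standalone sketch (compilable on its own; the struct is a trimmed copy of the one
added to tu_private.h, and the rest is illustrative scaffolding, not driver code).
It shows how the per-render-pass flags OR together and the counters accumulate
when a secondary command buffer is folded in, and how the end-of-render-pass
reset collapses to a single memset:

#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>

/* Trimmed copy of the tu_render_pass_state added to tu_private.h. */
struct tu_render_pass_state {
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool disable_gmem;
   bool draw_cs_writes_to_cond_pred;
   uint32_t drawcall_count;
   uint32_t drawcall_bandwidth_per_sample_sum;
};

/* Same logic as the tu_render_pass_state_merge() added to tu_cmd_buffer.c:
 * flags are OR'd, counters are summed.
 */
static void
merge(struct tu_render_pass_state *dst, const struct tu_render_pass_state *src)
{
   dst->xfb_used |= src->xfb_used;
   dst->has_tess |= src->has_tess;
   dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
   dst->disable_gmem |= src->disable_gmem;
   dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
   dst->drawcall_count += src->drawcall_count;
   dst->drawcall_bandwidth_per_sample_sum +=
      src->drawcall_bandwidth_per_sample_sum;
}

int main(void)
{
   struct tu_render_pass_state primary = { .drawcall_count = 10, .has_tess = true };
   struct tu_render_pass_state secondary = { .drawcall_count = 3, .xfb_used = true };

   /* vkCmdExecuteCommands: fold the secondary's state into the primary. */
   merge(&primary, &secondary);
   printf("draws=%u tess=%d xfb=%d\n",
          primary.drawcall_count, primary.has_tess, primary.xfb_used);

   /* End of render pass: one memset replaces the old field-by-field reset. */
   memset(&primary, 0, sizeof(primary));
   return 0;
}

Grouping exactly the state that is merged from secondaries and cleared at render
pass end into one struct is what makes both operations mechanical; when
suspend/resume support lands, the suspended pass's tu_render_pass_state can
presumably be carried across command buffers and combined with the same
tu_render_pass_state_merge() helper.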
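
The comment on drawcall_bandwidth_per_sample_sum describes the autotuner's cost
model; below is a rough worked example of that arithmetic. The per-draw byte
costs and the sample count are made up for illustration: in the driver the
accumulation happens in tu6_draw_common() and the final estimate is computed in
estimate_drawcall_bandwidth().

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t sum = 0, draws = 0;

   /* Per draw: color bandwidth always counts; depth counts once for test
    * and once for write; stencil counts double. Assume 3 draws with RGBA8
    * color (4 cpp), depth test + write enabled (4 cpp), and no stencil.
    */
   for (int i = 0; i < 3; i++) {
      draws++;
      sum += 4;   /* color_bandwidth_per_sample */
      sum += 4;   /* depth_cpp_per_sample, Z test enabled */
      sum += 4;   /* depth_cpp_per_sample, Z write enabled */
   }

   /* As in estimate_drawcall_bandwidth(): scale the average per-sample
    * cost of a draw by the average sample count of the render pass.
    */
   uint64_t avg_renderpass_sample_count = 1920 * 1080;
   uint64_t total = avg_renderpass_sample_count * sum / draws;
   printf("per-sample sum=%u, estimated bandwidth=%llu bytes\n",
          sum, (unsigned long long)total);
   return 0;
}

As the original comment notes, this deliberately ignores depth traffic for
samples that fail the depth test; it is a cheap heuristic for choosing between
sysmem and GMEM rendering, not an exact bandwidth measurement.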