tu: Split out some state into a separate struct

These bits of state will have to be treated specially when
suspending/resuming a render pass, because they will need to be tracked
across command buffers.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17378>
This commit is contained in:
Connor Abbott 2022-06-29 11:03:25 +02:00 committed by Marge Bot
parent 9689433eee
commit 3aa20a4409
4 changed files with 86 additions and 83 deletions

View File

@ -480,7 +480,7 @@ fallback_use_bypass(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
const struct tu_cmd_buffer *cmd_buffer)
{
if (cmd_buffer->state.drawcall_count > 5)
if (cmd_buffer->state.rp.drawcall_count > 5)
return false;
for (unsigned i = 0; i < pass->subpass_count; i++) {
@ -504,12 +504,12 @@ estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
{
const struct tu_cmd_state *state = &cmd->state;
if (!state->drawcall_count)
if (!state->rp.drawcall_count)
return 0;
/* sample count times drawcall_bandwidth_per_sample */
return (uint64_t)avg_renderpass_sample_count *
state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}
bool
@ -583,12 +583,12 @@ tu_autotune_use_bypass(struct tu_autotune *at,
if (TU_AUTOTUNE_DEBUG_LOG) {
const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
const float drawcall_bandwidth_per_sample =
(float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
cmd_buffer->state.drawcall_count;
(float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
cmd_buffer->state.rp.drawcall_count;
mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
renderpass_key,
cmd_buffer->state.drawcall_count,
cmd_buffer->state.rp.drawcall_count,
select_sysmem ? "sysmem" : "gmem");
mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
avg_samples,

View File

@ -569,7 +569,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
* use_sysmem_rendering() should have made sure we only ended up here if no
* XFB was used.
*/
if (cmd->state.xfb_used) {
if (cmd->state.rp.xfb_used) {
assert(fb->binning_possible);
return true;
}
@ -579,7 +579,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
* be multiplied by tile count.
* See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131
*/
if (cmd->state.has_prim_generated_query_in_rp ||
if (cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) {
assert(fb->binning_possible);
return true;
@ -607,20 +607,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
cmd->state.render_area.extent.height == 0)
return true;
if (cmd->state.has_tess)
if (cmd->state.rp.has_tess)
return true;
if (cmd->state.disable_gmem)
if (cmd->state.rp.disable_gmem)
return true;
/* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
if (cmd->state.xfb_used && !cmd->state.framebuffer->binning_possible)
if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible)
return true;
/* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
* GMEM rendering, see use_hw_binning.
*/
if ((cmd->state.has_prim_generated_query_in_rp ||
if ((cmd->state.rp.has_prim_generated_query_in_rp ||
cmd->state.prim_generated_query_running_before_rp) &&
!cmd->state.framebuffer->binning_possible)
return true;
@ -1411,7 +1411,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
}
/* Predicate is changed in draw_cs so we have to re-emit it */
if (cmd->state.draw_cs_writes_to_cond_pred)
if (cmd->state.rp.draw_cs_writes_to_cond_pred)
tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
tu_cs_emit_call(cs, &cmd->tile_store_cs);
@ -2270,7 +2270,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
tu_cond_exec_end(cs);
cmd->state.xfb_used = true;
cmd->state.rp.xfb_used = true;
}
VKAPI_ATTR void VKAPI_CALL
@ -2396,7 +2396,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
}
if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
cmd->state.has_tess = true;
cmd->state.rp.has_tess = true;
/* maximum number of patches that can fit in tess factor/param buffers */
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
@ -3331,6 +3331,21 @@ tu_flush_for_stage(struct tu_cache_state *cache,
}
}
/* Fold the per-render-pass state accumulated in src into dst.
 * Used when a secondary command buffer's render-pass state has to be
 * combined into the primary's (see tu_CmdExecuteCommands): boolean
 * flags are sticky (OR-ed in), draw statistics accumulate (added).
 */
static void
tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                           const struct tu_render_pass_state *src)
{
   /* Draw statistics accumulate across the merged command buffers. */
   dst->drawcall_count += src->drawcall_count;
   dst->drawcall_bandwidth_per_sample_sum +=
      src->drawcall_bandwidth_per_sample_sum;

   /* Flags are sticky: once set in either buffer, they stay set. */
   dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
   dst->disable_gmem |= src->disable_gmem;
   dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
   dst->has_tess |= src->has_tess;
   dst->xfb_used |= src->xfb_used;
}
VKAPI_ATTR void VKAPI_CALL
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
uint32_t commandBufferCount,
@ -3370,28 +3385,13 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
break;
}
if (secondary->state.has_tess) {
cmd->state.has_tess = true;
}
if (secondary->state.disable_gmem)
cmd->state.disable_gmem = true;
if (secondary->state.xfb_used)
cmd->state.xfb_used = true;
if (secondary->state.has_prim_generated_query_in_rp)
cmd->state.has_prim_generated_query_in_rp = true;
cmd->state.drawcall_count += secondary->state.drawcall_count;
cmd->state.drawcall_bandwidth_per_sample_sum +=
secondary->state.drawcall_bandwidth_per_sample_sum;
cmd->state.draw_cs_writes_to_cond_pred |=
secondary->state.draw_cs_writes_to_cond_pred;
/* If LRZ was made invalid in secondary - we should disable
* LRZ retroactively for the whole renderpass.
*/
if (!secondary->state.lrz.valid)
cmd->state.lrz.valid = false;
tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
} else {
assert(tu_cs_is_empty(&secondary->draw_cs));
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@ -3588,8 +3588,6 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
return;
}
cmd->state.draw_cs_writes_to_cond_pred = false;
for (unsigned i = 0; i < pass->attachment_count; i++) {
cmd->state.attachments[i] = pAttachmentInfo ?
tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
@ -3635,8 +3633,6 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
cmd->state.attachments = cmd->dynamic_attachments;
cmd->state.draw_cs_writes_to_cond_pred = false;
for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment;
if (!pRenderingInfo->pColorAttachments[i].imageView)
@ -4020,23 +4016,23 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
const struct tu_pipeline *pipeline = cmd->state.pipeline;
/* Fill draw stats for autotuner */
cmd->state.drawcall_count++;
cmd->state.rp.drawcall_count++;
cmd->state.drawcall_bandwidth_per_sample_sum +=
cmd->state.rp.drawcall_bandwidth_per_sample_sum +=
cmd->state.pipeline->color_bandwidth_per_sample;
/* add depth memory bandwidth cost */
const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
/* add stencil memory bandwidth cost */
const uint32_t stencil_bandwidth =
cmd->state.pipeline->stencil_cpp_per_sample;
if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
cmd->state.rp.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
tu_emit_cache_flush_renderpass(cmd, cs);
@ -4813,7 +4809,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
if (cmd_buffer->state.has_tess)
if (cmd_buffer->state.rp.has_tess)
tu6_lazy_emit_tessfactor_addr(cmd_buffer);
struct tu_renderpass_result *autotune_result = NULL;
@ -4839,12 +4835,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
cmd_buffer->state.subpass = NULL;
cmd_buffer->state.framebuffer = NULL;
cmd_buffer->state.attachments = NULL;
cmd_buffer->state.has_tess = false;
cmd_buffer->state.xfb_used = false;
cmd_buffer->state.disable_gmem = false;
cmd_buffer->state.drawcall_count = 0;
cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
cmd_buffer->state.has_prim_generated_query_in_rp = false;
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
/* LRZ is not valid next time we use it */
cmd_buffer->state.lrz.valid = false;
@ -4969,7 +4960,7 @@ tu_barrier(struct tu_cmd_buffer *cmd,
*/
if ((srcStage & ~framebuffer_space_stages) ||
(dstStage & ~framebuffer_space_stages)) {
cmd->state.disable_gmem = true;
cmd->state.rp.disable_gmem = true;
}
}

View File

@ -1285,6 +1285,48 @@ struct tu_vs_params {
uint32_t first_instance;
};
/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   /* Transform feedback was used inside the render pass (set by
    * tu_CmdEndTransformFeedbackEXT). XFB is incompatible with
    * non-hw-binning GMEM rendering.
    */
   bool xfb_used;

   /* A pipeline with a tessellation stage was bound inside the pass;
    * forces sysmem rendering and tess-factor address emission at
    * render-pass end.
    */
   bool has_tess;

   /* A VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query was begun inside the
    * render pass; like XFB, incompatible with non-hw-binning GMEM.
    */
   bool has_prim_generated_query_in_rp;

   /* A barrier inside the pass used non-framebuffer-space stages, so GMEM
    * (tiled) rendering must be avoided for this pass.
    */
   bool disable_gmem;

   /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
   bool draw_cs_writes_to_cond_pred;

   /* Number of draw calls recorded in this render pass (autotuner input). */
   uint32_t drawcall_count;

   /* A calculated "draw cost" value for renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details. But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;
};
struct tu_cmd_state
{
uint32_t dirty;
@ -1292,6 +1334,8 @@ struct tu_cmd_state
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
struct tu_render_pass_state rp;
/* Vertex buffers, viewports, and scissors
* the states for these can be updated partially, so we need to save these
* to be able to emit a complete draw state
@ -1358,43 +1402,12 @@ struct tu_cmd_state
VkRect2D render_area;
const struct tu_image_view **attachments;
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
bool xfb_used;
bool has_tess;
bool tessfactor_addr_set;
bool predication_active;
bool disable_gmem;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (stencil_write_enabled)
* sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
@ -1402,7 +1415,6 @@ struct tu_cmd_state
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
bool has_prim_generated_query_in_rp;
struct tu_lrz_state lrz;

View File

@ -932,7 +932,7 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
uint32_t last_pass = ~0;
if (cmdbuf->state.pass) {
cmdbuf->state.draw_cs_writes_to_cond_pred = true;
cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
}
/* Querying perf counters happens in these steps:
@ -1023,7 +1023,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
if (cmdbuf->state.pass) {
cmdbuf->state.has_prim_generated_query_in_rp = true;
cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
} else {
cmdbuf->state.prim_generated_query_running_before_rp = true;
}