tu: Split out some state into a separate struct
These bits of state will have to be treated specially when suspending/resuming a render pass, because they will need to be tracked across command buffers. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17378>
This commit is contained in:
parent
9689433eee
commit
3aa20a4409
|
@ -480,7 +480,7 @@ fallback_use_bypass(const struct tu_render_pass *pass,
|
|||
const struct tu_framebuffer *framebuffer,
|
||||
const struct tu_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
if (cmd_buffer->state.drawcall_count > 5)
|
||||
if (cmd_buffer->state.rp.drawcall_count > 5)
|
||||
return false;
|
||||
|
||||
for (unsigned i = 0; i < pass->subpass_count; i++) {
|
||||
|
@ -504,12 +504,12 @@ estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
|
|||
{
|
||||
const struct tu_cmd_state *state = &cmd->state;
|
||||
|
||||
if (!state->drawcall_count)
|
||||
if (!state->rp.drawcall_count)
|
||||
return 0;
|
||||
|
||||
/* sample count times drawcall_bandwidth_per_sample */
|
||||
return (uint64_t)avg_renderpass_sample_count *
|
||||
state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
|
||||
state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
|
||||
}
|
||||
|
||||
bool
|
||||
|
@ -583,12 +583,12 @@ tu_autotune_use_bypass(struct tu_autotune *at,
|
|||
if (TU_AUTOTUNE_DEBUG_LOG) {
|
||||
const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
|
||||
const float drawcall_bandwidth_per_sample =
|
||||
(float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
|
||||
cmd_buffer->state.drawcall_count;
|
||||
(float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
|
||||
cmd_buffer->state.rp.drawcall_count;
|
||||
|
||||
mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
|
||||
renderpass_key,
|
||||
cmd_buffer->state.drawcall_count,
|
||||
cmd_buffer->state.rp.drawcall_count,
|
||||
select_sysmem ? "sysmem" : "gmem");
|
||||
mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
|
||||
avg_samples,
|
||||
|
|
|
@ -569,7 +569,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
|
|||
* use_sysmem_rendering() should have made sure we only ended up here if no
|
||||
* XFB was used.
|
||||
*/
|
||||
if (cmd->state.xfb_used) {
|
||||
if (cmd->state.rp.xfb_used) {
|
||||
assert(fb->binning_possible);
|
||||
return true;
|
||||
}
|
||||
|
@ -579,7 +579,7 @@ use_hw_binning(struct tu_cmd_buffer *cmd)
|
|||
* be multiplied by tile count.
|
||||
* See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131
|
||||
*/
|
||||
if (cmd->state.has_prim_generated_query_in_rp ||
|
||||
if (cmd->state.rp.has_prim_generated_query_in_rp ||
|
||||
cmd->state.prim_generated_query_running_before_rp) {
|
||||
assert(fb->binning_possible);
|
||||
return true;
|
||||
|
@ -607,20 +607,20 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
|
|||
cmd->state.render_area.extent.height == 0)
|
||||
return true;
|
||||
|
||||
if (cmd->state.has_tess)
|
||||
if (cmd->state.rp.has_tess)
|
||||
return true;
|
||||
|
||||
if (cmd->state.disable_gmem)
|
||||
if (cmd->state.rp.disable_gmem)
|
||||
return true;
|
||||
|
||||
/* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
|
||||
if (cmd->state.xfb_used && !cmd->state.framebuffer->binning_possible)
|
||||
if (cmd->state.rp.xfb_used && !cmd->state.framebuffer->binning_possible)
|
||||
return true;
|
||||
|
||||
/* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
|
||||
* GMEM rendering, see use_hw_binning.
|
||||
*/
|
||||
if ((cmd->state.has_prim_generated_query_in_rp ||
|
||||
if ((cmd->state.rp.has_prim_generated_query_in_rp ||
|
||||
cmd->state.prim_generated_query_running_before_rp) &&
|
||||
!cmd->state.framebuffer->binning_possible)
|
||||
return true;
|
||||
|
@ -1411,7 +1411,7 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
|||
}
|
||||
|
||||
/* Predicate is changed in draw_cs so we have to re-emit it */
|
||||
if (cmd->state.draw_cs_writes_to_cond_pred)
|
||||
if (cmd->state.rp.draw_cs_writes_to_cond_pred)
|
||||
tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
|
||||
|
||||
tu_cs_emit_call(cs, &cmd->tile_store_cs);
|
||||
|
@ -2270,7 +2270,7 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
|
|||
|
||||
tu_cond_exec_end(cs);
|
||||
|
||||
cmd->state.xfb_used = true;
|
||||
cmd->state.rp.xfb_used = true;
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
|
@ -2396,7 +2396,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
|
|||
}
|
||||
|
||||
if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
|
||||
cmd->state.has_tess = true;
|
||||
cmd->state.rp.has_tess = true;
|
||||
|
||||
/* maximum number of patches that can fit in tess factor/param buffers */
|
||||
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
|
||||
|
@ -3331,6 +3331,21 @@ tu_flush_for_stage(struct tu_cache_state *cache,
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
tu_render_pass_state_merge(struct tu_render_pass_state *dst,
|
||||
const struct tu_render_pass_state *src)
|
||||
{
|
||||
dst->xfb_used |= src->xfb_used;
|
||||
dst->has_tess |= src->has_tess;
|
||||
dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
|
||||
dst->disable_gmem |= src->disable_gmem;
|
||||
dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
|
||||
|
||||
dst->drawcall_count += src->drawcall_count;
|
||||
dst->drawcall_bandwidth_per_sample_sum +=
|
||||
src->drawcall_bandwidth_per_sample_sum;
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
|
||||
uint32_t commandBufferCount,
|
||||
|
@ -3370,28 +3385,13 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
|
|||
break;
|
||||
}
|
||||
|
||||
if (secondary->state.has_tess) {
|
||||
cmd->state.has_tess = true;
|
||||
}
|
||||
if (secondary->state.disable_gmem)
|
||||
cmd->state.disable_gmem = true;
|
||||
if (secondary->state.xfb_used)
|
||||
cmd->state.xfb_used = true;
|
||||
if (secondary->state.has_prim_generated_query_in_rp)
|
||||
cmd->state.has_prim_generated_query_in_rp = true;
|
||||
|
||||
cmd->state.drawcall_count += secondary->state.drawcall_count;
|
||||
cmd->state.drawcall_bandwidth_per_sample_sum +=
|
||||
secondary->state.drawcall_bandwidth_per_sample_sum;
|
||||
|
||||
cmd->state.draw_cs_writes_to_cond_pred |=
|
||||
secondary->state.draw_cs_writes_to_cond_pred;
|
||||
|
||||
/* If LRZ was made invalid in secondary - we should disable
|
||||
* LRZ retroactively for the whole renderpass.
|
||||
*/
|
||||
if (!secondary->state.lrz.valid)
|
||||
cmd->state.lrz.valid = false;
|
||||
|
||||
tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
|
||||
} else {
|
||||
assert(tu_cs_is_empty(&secondary->draw_cs));
|
||||
assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
|
||||
|
@ -3588,8 +3588,6 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
|
|||
return;
|
||||
}
|
||||
|
||||
cmd->state.draw_cs_writes_to_cond_pred = false;
|
||||
|
||||
for (unsigned i = 0; i < pass->attachment_count; i++) {
|
||||
cmd->state.attachments[i] = pAttachmentInfo ?
|
||||
tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
|
||||
|
@ -3635,8 +3633,6 @@ tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
|
|||
|
||||
cmd->state.attachments = cmd->dynamic_attachments;
|
||||
|
||||
cmd->state.draw_cs_writes_to_cond_pred = false;
|
||||
|
||||
for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
|
||||
uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment;
|
||||
if (!pRenderingInfo->pColorAttachments[i].imageView)
|
||||
|
@ -4020,23 +4016,23 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
|
|||
const struct tu_pipeline *pipeline = cmd->state.pipeline;
|
||||
|
||||
/* Fill draw stats for autotuner */
|
||||
cmd->state.drawcall_count++;
|
||||
cmd->state.rp.drawcall_count++;
|
||||
|
||||
cmd->state.drawcall_bandwidth_per_sample_sum +=
|
||||
cmd->state.rp.drawcall_bandwidth_per_sample_sum +=
|
||||
cmd->state.pipeline->color_bandwidth_per_sample;
|
||||
|
||||
/* add depth memory bandwidth cost */
|
||||
const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
|
||||
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
|
||||
cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
|
||||
cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
|
||||
if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
|
||||
cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
|
||||
cmd->state.rp.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
|
||||
|
||||
/* add stencil memory bandwidth cost */
|
||||
const uint32_t stencil_bandwidth =
|
||||
cmd->state.pipeline->stencil_cpp_per_sample;
|
||||
if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
|
||||
cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
|
||||
cmd->state.rp.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
|
||||
|
||||
tu_emit_cache_flush_renderpass(cmd, cs);
|
||||
|
||||
|
@ -4813,7 +4809,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
|
|||
|
||||
cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
|
||||
|
||||
if (cmd_buffer->state.has_tess)
|
||||
if (cmd_buffer->state.rp.has_tess)
|
||||
tu6_lazy_emit_tessfactor_addr(cmd_buffer);
|
||||
|
||||
struct tu_renderpass_result *autotune_result = NULL;
|
||||
|
@ -4839,12 +4835,7 @@ tu_end_rendering(struct tu_cmd_buffer *cmd_buffer)
|
|||
cmd_buffer->state.subpass = NULL;
|
||||
cmd_buffer->state.framebuffer = NULL;
|
||||
cmd_buffer->state.attachments = NULL;
|
||||
cmd_buffer->state.has_tess = false;
|
||||
cmd_buffer->state.xfb_used = false;
|
||||
cmd_buffer->state.disable_gmem = false;
|
||||
cmd_buffer->state.drawcall_count = 0;
|
||||
cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
|
||||
cmd_buffer->state.has_prim_generated_query_in_rp = false;
|
||||
memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
|
||||
|
||||
/* LRZ is not valid next time we use it */
|
||||
cmd_buffer->state.lrz.valid = false;
|
||||
|
@ -4969,7 +4960,7 @@ tu_barrier(struct tu_cmd_buffer *cmd,
|
|||
*/
|
||||
if ((srcStage & ~framebuffer_space_stages) ||
|
||||
(dstStage & ~framebuffer_space_stages)) {
|
||||
cmd->state.disable_gmem = true;
|
||||
cmd->state.rp.disable_gmem = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1285,6 +1285,48 @@ struct tu_vs_params {
|
|||
uint32_t first_instance;
|
||||
};
|
||||
|
||||
/* This should be for state that is set inside a renderpass and used at
|
||||
* renderpass end time, e.g. to decide whether to use sysmem. This needs
|
||||
* special handling for secondary cmdbufs and suspending/resuming render
|
||||
* passes where the state may need to be combined afterwards.
|
||||
*/
|
||||
struct tu_render_pass_state
|
||||
{
|
||||
bool xfb_used;
|
||||
bool has_tess;
|
||||
bool has_prim_generated_query_in_rp;
|
||||
bool disable_gmem;
|
||||
|
||||
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
|
||||
bool draw_cs_writes_to_cond_pred;
|
||||
|
||||
uint32_t drawcall_count;
|
||||
|
||||
/* A calculated "draw cost" value for renderpass, which tries to
|
||||
* estimate the bandwidth-per-sample of all the draws according
|
||||
* to:
|
||||
*
|
||||
* foreach_draw (...) {
|
||||
* sum += pipeline->color_bandwidth_per_sample;
|
||||
* if (depth_test_enabled)
|
||||
* sum += pipeline->depth_cpp_per_sample;
|
||||
* if (depth_write_enabled)
|
||||
* sum += pipeline->depth_cpp_per_sample;
|
||||
* if (stencil_write_enabled)
|
||||
* sum += pipeline->stencil_cpp_per_sample * 2;
|
||||
* }
|
||||
* drawcall_bandwidth_per_sample = sum / drawcall_count;
|
||||
*
|
||||
* It allows us to estimate the total bandwidth of drawcalls later, by
|
||||
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
|
||||
*
|
||||
* This does ignore depth buffer traffic for samples which do not
|
||||
* pass due to depth-test fail, and some other details. But it is
|
||||
* just intended to be a rough estimate that is easy to calculate.
|
||||
*/
|
||||
uint32_t drawcall_bandwidth_per_sample_sum;
|
||||
};
|
||||
|
||||
struct tu_cmd_state
|
||||
{
|
||||
uint32_t dirty;
|
||||
|
@ -1292,6 +1334,8 @@ struct tu_cmd_state
|
|||
struct tu_pipeline *pipeline;
|
||||
struct tu_pipeline *compute_pipeline;
|
||||
|
||||
struct tu_render_pass_state rp;
|
||||
|
||||
/* Vertex buffers, viewports, and scissors
|
||||
* the states for these can be updated partially, so we need to save these
|
||||
* to be able to emit a complete draw state
|
||||
|
@ -1358,43 +1402,12 @@ struct tu_cmd_state
|
|||
VkRect2D render_area;
|
||||
|
||||
const struct tu_image_view **attachments;
|
||||
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
|
||||
bool draw_cs_writes_to_cond_pred;
|
||||
|
||||
bool xfb_used;
|
||||
bool has_tess;
|
||||
bool tessfactor_addr_set;
|
||||
bool predication_active;
|
||||
bool disable_gmem;
|
||||
enum a5xx_line_mode line_mode;
|
||||
bool z_negative_one_to_one;
|
||||
|
||||
uint32_t drawcall_count;
|
||||
|
||||
/* A calculated "draw cost" value for renderpass, which tries to
|
||||
* estimate the bandwidth-per-sample of all the draws according
|
||||
* to:
|
||||
*
|
||||
* foreach_draw (...) {
|
||||
* sum += pipeline->color_bandwidth_per_sample;
|
||||
* if (depth_test_enabled)
|
||||
* sum += pipeline->depth_cpp_per_sample;
|
||||
* if (depth_write_enabled)
|
||||
* sum += pipeline->depth_cpp_per_sample;
|
||||
* if (stencil_write_enabled)
|
||||
* sum += pipeline->stencil_cpp_per_sample * 2;
|
||||
* }
|
||||
* drawcall_bandwidth_per_sample = sum / drawcall_count;
|
||||
*
|
||||
* It allows us to estimate the total bandwidth of drawcalls later, by
|
||||
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
|
||||
*
|
||||
* This does ignore depth buffer traffic for samples which do not
|
||||
* pass due to depth-test fail, and some other details. But it is
|
||||
* just intended to be a rough estimate that is easy to calculate.
|
||||
*/
|
||||
uint32_t drawcall_bandwidth_per_sample_sum;
|
||||
|
||||
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
|
||||
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaniously,
|
||||
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
|
||||
|
@ -1402,7 +1415,6 @@ struct tu_cmd_state
|
|||
uint32_t prim_counters_running;
|
||||
|
||||
bool prim_generated_query_running_before_rp;
|
||||
bool has_prim_generated_query_in_rp;
|
||||
|
||||
struct tu_lrz_state lrz;
|
||||
|
||||
|
|
|
@ -932,7 +932,7 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
|
|||
uint32_t last_pass = ~0;
|
||||
|
||||
if (cmdbuf->state.pass) {
|
||||
cmdbuf->state.draw_cs_writes_to_cond_pred = true;
|
||||
cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
|
||||
}
|
||||
|
||||
/* Querying perf counters happens in these steps:
|
||||
|
@ -1023,7 +1023,7 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
|
|||
uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
|
||||
|
||||
if (cmdbuf->state.pass) {
|
||||
cmdbuf->state.has_prim_generated_query_in_rp = true;
|
||||
cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
|
||||
} else {
|
||||
cmdbuf->state.prim_generated_query_running_before_rp = true;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue