tu: Fix prim gen query and pipeline stats query interaction

Fixed:
- VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT was able to stop prim counter
  when pipeline stats query is running.
  - This may have happened when prim gen query was in secondary cmdbuf.
- VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT counting geometry in each tile.
- VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT counting geometry in each tile
  when pipeline stats query is started inside prim gen query and inside
  a renderpass.

The matter of VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT and pipeline stats
interaction is solved by tracking whether pipeline stats query is
running both on CPU (for non secondary cmdbuf case) and on GPU (for
secondary cmdbuf).

Note, prim gen query is not allowed with secondary command buffers, so
only pipeline stats query is tracked on gpu.
See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3142

Counting geometry in each tile is solved by:
- Conditionally executing START/STOP_PRIMITIVE_CTRS so that they do not run
  in the tiling pass. Solves the case when prim gen query is inside a
  renderpass.
- Stopping prim counters before executing `draw_cs` and restarting them
  afterwards. Solves prim gen query being outside a renderpass.

Fixes GL CTS tests with Zink + `TU_DEBUG=gmem`:
 GTF-GL46.gtf30.GL3Tests.transform_feedback.transform_feedback_max_separate
 GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_basic
 GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_framebuffer
 GTF-GL46.gtf40.GL3Tests.transform_feedback3.transform_feedback3_streams_overflow
 GTF-GL46.gtf40.GL3Tests.transform_feedback3.transform_feedback3_streams_queried
 GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_states

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6602

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17163>
This commit is contained in:
Danylo Piliaiev 2022-06-27 19:01:08 +03:00
parent 465e7c303b
commit bf4c160909
5 changed files with 136 additions and 17 deletions

View File

@ -1382,8 +1382,17 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
/* Primitives that passed all tests are still counted in each
* tile even with HW binning beforehand. Do not permit it.
*/
if (cmd->state.prim_generated_query_running_before_rp)
tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
tu_cs_emit_call(cs, &cmd->draw_cs);
if (cmd->state.prim_generated_query_running_before_rp)
tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
if (use_hw_binning(cmd)) {
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
@ -1747,6 +1756,9 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
} else if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
assert(pBeginInfo->pInheritanceInfo);
cmd_buffer->inherited_pipeline_statistics =
pBeginInfo->pInheritanceInfo->pipelineStatistics;
vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {

View File

@ -1868,6 +1868,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
struct tu6_global *global = device->global_bo->map;
tu_init_clear_blit_shaders(device);
global->predicate = 0;
global->vtx_stats_query_not_running = 1;
global->dbg_one = (uint32_t)-1;
global->dbg_gmem_total_loads = 0;
global->dbg_gmem_taken_loads = 0;

View File

@ -484,6 +484,8 @@ struct tu6_global
ALIGN16 uint32_t cs_indirect_xyz[3];
volatile uint32_t vtx_stats_query_not_running;
/* To know when renderpass stats for autotune are valid */
volatile uint32_t autotune_fence;
@ -1258,6 +1260,12 @@ struct tu_cmd_state
*/
uint32_t drawcall_bandwidth_per_sample_sum;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
*/
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
bool has_prim_generated_query_in_rp;
@ -1304,6 +1312,8 @@ struct tu_cmd_buffer
VkCommandBufferUsageFlags usage_flags;
enum tu_cmd_buffer_status status;
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
struct tu_cmd_state state;
uint32_t queue_family_index;

View File

@ -426,9 +426,9 @@ statistics_index(uint32_t *statistics)
}
static bool
is_pipeline_query_with_vertex_stage(struct tu_query_pool *pool)
is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
{
return pool->pipeline_statistics &
return pipeline_statistics &
(VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
@ -441,16 +441,16 @@ is_pipeline_query_with_vertex_stage(struct tu_query_pool *pool)
}
static bool
is_pipeline_query_with_fragment_stage(struct tu_query_pool *pool)
is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
{
return pool->pipeline_statistics &
return pipeline_statistics &
VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
}
static bool
is_pipeline_query_with_compute_stage(struct tu_query_pool *pool)
is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
{
return pool->pipeline_statistics &
return pipeline_statistics &
VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
}
@ -871,15 +871,35 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);
if (is_pipeline_query_with_vertex_stage(pool)) {
if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
cmdbuf->state.prim_counters_running++;
/* Prevent starting primitive counters when it is supposed to be stopped
* for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
*/
if (need_cond_exec) {
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
CP_COND_REG_EXEC_0_SYSMEM |
CP_COND_REG_EXEC_0_BINNING);
}
tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
tu_cs_emit(cs, 0);
if (need_cond_exec) {
tu_cond_exec_end(cs);
}
}
if (is_pipeline_query_with_fragment_stage(pool)) {
if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS);
}
if (is_pipeline_query_with_compute_stage(pool)) {
if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS);
}
@ -1008,6 +1028,17 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
cmdbuf->state.prim_generated_query_running_before_rp = true;
}
cmdbuf->state.prim_counters_running++;
if (cmdbuf->state.pass) {
/* Primitives that passed all tests are still counted in each
* tile even with HW binning beforehand. Do not permit it.
*/
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
CP_COND_REG_EXEC_0_SYSMEM |
CP_COND_REG_EXEC_0_BINNING);
}
tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
tu_cs_emit_wfi(cs);
@ -1017,6 +1048,10 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
CP_REG_TO_MEM_0_CNT(2) |
CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, begin_iova);
if (cmdbuf->state.pass) {
tu_cond_exec_end(cs);
}
}
VKAPI_ATTR void VKAPI_CALL
@ -1152,6 +1187,53 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, 0x1);
}
/* The {START,STOP}_PRIMITIVE_CTRS control is shared by two query types:
 *  - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
 *  - VK_QUERY_TYPE_PIPELINE_STATISTICS
 * When one of them is nested inside the other, STOP_PRIMITIVE_CTRS must be
 * emitted only when the outer query ends.
 *
 * In addition, a pipeline stats query could be running outside of the
 * renderpass while a prim gen query sits inside a secondary cmd buffer; for
 * that case the status of the pipeline stats query has to be tracked, which
 * is done through the vtx_stats_query_not_running global.
 *
 * cmdbuf:     command buffer whose prim_counters_running count is decremented
 * cs:         command stream the packets are emitted into
 * query_type: type of the query that is ending
 */
static void
emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
                         struct tu_cs *cs,
                         enum VkQueryType query_type)
{
   const bool in_secondary =
      cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
   const bool ending_prim_gen =
      query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT;

   /* One fewer user of the primitive counters on the CPU-side tracking. */
   cmdbuf->state.prim_counters_running -= 1;

   if (cmdbuf->state.prim_counters_running == 0) {
      /* A prim gen query ending in a secondary cmd buffer that inherits a
       * pipeline stats query with vertex-stage statistics cannot decide on
       * the CPU whether the counters are still needed.
       */
      const bool stop_conditionally =
         in_secondary && ending_prim_gen &&
         is_pipeline_query_with_vertex_stage(
            cmdbuf->inherited_pipeline_statistics);

      if (stop_conditionally) {
         /* CP_COND_EXEC (header + 6 dwords) plus the 2 dwords it guards;
          * presumably reserved together so they end up contiguous in the
          * command stream.
          */
         tu_cs_reserve(cs, 7 + 2);

         /* Check that the pipeline stats query is not running; only then
          * do we stop the counter.
          */
         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
         tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
      }

      /* Unconditional in the common case, guarded by the CP_COND_EXEC
       * emitted just above otherwise.
       */
      tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
   }

   if (query_type != VK_QUERY_TYPE_PIPELINE_STATISTICS)
      return;

   /* The pipeline stats query is over: set the "not running" flag back to 1
    * on the GPU timeline.
    */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
   tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
   tu_cs_emit(cs, 1);
}
static void
emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
@ -1164,15 +1246,19 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
uint64_t stat_start_iova;
uint64_t stat_stop_iova;
if (is_pipeline_query_with_vertex_stage(pool)) {
tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
/* No need to conditionally execute STOP_PRIMITIVE_CTRS when
* we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a
* renderpass, because it is already stopped.
*/
emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
}
if (is_pipeline_query_with_fragment_stage(pool)) {
if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS);
}
if (is_pipeline_query_with_compute_stage(pool)) {
if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS);
}
@ -1355,7 +1441,11 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
uint64_t available_iova = query_available_iova(pool, query);
tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
if (cmdbuf->state.pass) {
tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
CP_COND_REG_EXEC_0_SYSMEM |
CP_COND_REG_EXEC_0_BINNING);
}
tu_cs_emit_wfi(cs);
@ -1375,6 +1465,15 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
/* Should be after waiting for mem writes to have up to date info
* about which query is running.
*/
emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
if (cmdbuf->state.pass) {
tu_cond_exec_end(cs);
}
if (cmdbuf->state.pass)
cs = &cmdbuf->draw_epilogue_cs;

View File

@ -1,6 +1,3 @@
# #6603
GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_states
GTF-GL46.gtf32.GL3Tests.packed_pixels.packed_pixels_pixelstore
KHR-Single-GL46.arrays_of_arrays_gl.ConstructorsAndUnsizedDeclConstructorSizing1
dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_write_dynamic_read_vertex