diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c index ed58b9b98fac2..3562813ca8677 100644 --- a/src/freedreno/vulkan/tu_autotune.c +++ b/src/freedreno/vulkan/tu_autotune.c @@ -513,18 +513,13 @@ tu_autotune_use_bypass(struct tu_autotune *at, const struct tu_render_pass *pass = cmd_buffer->state.pass; const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer; - for (unsigned i = 0; i < pass->subpass_count; i++) { - const struct tu_subpass *subpass = &pass->subpasses[i]; - /* GMEM works much faster in this case */ - if (subpass->raster_order_attachment_access) - return false; - - /* Would be very slow in sysmem mode because we have to enable - * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) - */ - if (subpass->feedback_loop_color || subpass->feedback_loop_ds) - return false; - } + /* If a feedback loop in the subpass caused one of the pipelines used to set + * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even + * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased + * sysmem bandwidth (though we haven't quantified it). 
+ */ + if (cmd_buffer->state.rp.sysmem_single_prim_mode) + return false; /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers * we would have to allocate GPU memory at the submit time and copy diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index dafbf73b2e9cf..8261d349de488 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -2489,6 +2489,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, */ cmd->state.rp.disable_gmem = true; } + cmd->state.rp.sysmem_single_prim_mode |= pipeline->sysmem_single_prim_mode; struct tu_cs *cs = &cmd->draw_cs; @@ -3440,6 +3441,7 @@ tu_render_pass_state_merge(struct tu_render_pass_state *dst, dst->has_tess |= src->has_tess; dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp; dst->disable_gmem |= src->disable_gmem; + dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode; dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred; dst->drawcall_count += src->drawcall_count; diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index db4aa6a8e5e1c..ddba3002da992 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -262,6 +262,7 @@ struct tu_render_pass_state bool has_tess; bool has_prim_generated_query_in_rp; bool disable_gmem; + bool sysmem_single_prim_mode; /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */ bool draw_cs_writes_to_cond_pred; diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index f26cbe1913382..af1e8374fe9de 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -3840,6 +3840,7 @@ tu_pipeline_builder_parse_rasterization_order( */ sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; gmem_prim_mode = FLUSH_PER_OVERLAP; + pipeline->sysmem_single_prim_mode = true; } else { /* If there is a feedback loop, then the 
shader can read the previous value * of a pixel being written out. It can also write some components and then @@ -3850,8 +3851,10 @@ tu_pipeline_builder_parse_rasterization_order( * for advanced_blend in sysmem mode if a feedback loop is detected. */ if (builder->subpass_feedback_loop_color || - builder->subpass_feedback_loop_ds) { + (builder->subpass_feedback_loop_ds && + (ds_info->depthWriteEnable || ds_info->stencilTestEnable))) { sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; + pipeline->sysmem_single_prim_mode = true; } } diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h index 00c843cc332d1..537371a134ad2 100644 --- a/src/freedreno/vulkan/tu_pipeline.h +++ b/src/freedreno/vulkan/tu_pipeline.h @@ -192,6 +192,8 @@ struct tu_pipeline bool raster_order_attachment_access; bool subpass_feedback_loop_ds; bool feedback_loop_may_involve_textures; + /* True if the pipeline sets SINGLE_PRIM_MODE for sysmem rendering. */ + bool sysmem_single_prim_mode; bool z_negative_one_to_one;