turnip: Implement VK_ARM_rasterization_order_attachment_access
Trivially implemented by using A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE. This extension is useful for emulators e.g. AetherSX2 PS2 emulator and could drastically improve performance when blending is emulated. Relevant tests: dEQP-VK.rasterization.rasterization_order_attachment_access.* Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15106>
This commit is contained in:
parent
d6c89e1e4a
commit
ebc23ac963
|
@ -598,6 +598,7 @@ Khronos extensions that are not part of any Vulkan version:
|
|||
VK_AMD_shader_info DONE (radv)
|
||||
VK_AMD_shader_trinary_minmax DONE (radv)
|
||||
VK_AMD_texture_gather_bias_lod DONE (radv)
|
||||
VK_ARM_rasterization_order_attachment_access DONE (tu)
|
||||
|
||||
|
||||
OpenCL 1.0 -- all DONE:
|
||||
|
|
|
@ -304,26 +304,6 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd,
|
|||
|
||||
unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
|
||||
tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));
|
||||
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
|
||||
A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
|
||||
|
||||
/* If there is a feedback loop, then the shader can read the previous value
|
||||
* of a pixel being written out. It can also write some components and then
|
||||
* read different components without a barrier in between. This is a
|
||||
* problem in sysmem mode with UBWC, because the main buffer and flags
|
||||
* buffer can get out-of-sync if only one is flushed. We fix this by
|
||||
* setting the SINGLE_PRIM_MODE field to the same value that the blob does
|
||||
* for advanced_blend in sysmem mode if a feedback loop is detected.
|
||||
*/
|
||||
if (subpass->feedback_loop_color || subpass->feedback_loop_ds) {
|
||||
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
|
||||
tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
|
||||
A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
|
||||
A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(
|
||||
FLUSH_PER_OVERLAP_AND_OVERWRITE));
|
||||
tu_cond_exec_end(cs);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -535,9 +515,11 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
|
|||
enable_mask = CP_SET_DRAW_STATE__0_BINNING;
|
||||
break;
|
||||
case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
|
||||
case TU_DRAW_STATE_PRIM_MODE_GMEM:
|
||||
enable_mask = CP_SET_DRAW_STATE__0_GMEM;
|
||||
break;
|
||||
case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
|
||||
case TU_DRAW_STATE_PRIM_MODE_SYSMEM:
|
||||
enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
|
||||
break;
|
||||
default:
|
||||
|
@ -2329,7 +2311,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
|
|||
if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
|
||||
uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
|
||||
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (9 + util_bitcount(mask)));
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
|
||||
|
@ -2337,6 +2319,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
|
|||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem);
|
||||
|
||||
u_foreach_bit(i, mask)
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
|
||||
|
@ -4000,6 +3984,8 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
|
|||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
|
||||
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
|
||||
|
|
|
@ -205,6 +205,7 @@ get_device_extensions(const struct tu_physical_device *device,
|
|||
.EXT_image_robustness = true,
|
||||
/* For Graphics Flight Recorder (GFR) */
|
||||
.AMD_buffer_marker = true,
|
||||
.ARM_rasterization_order_attachment_access = true,
|
||||
#ifdef ANDROID
|
||||
.ANDROID_native_buffer = true,
|
||||
#endif
|
||||
|
@ -810,6 +811,14 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
|
|||
features->primitiveTopologyPatchListRestart = false;
|
||||
break;
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_ARM: {
|
||||
VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesARM *features =
|
||||
(VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesARM *)ext;
|
||||
features->rasterizationOrderColorAttachmentAccess = true;
|
||||
features->rasterizationOrderDepthAttachmentAccess = true;
|
||||
features->rasterizationOrderStencilAttachmentAccess = true;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
|
|
|
@ -750,6 +750,13 @@ tu_CreateRenderPass2(VkDevice _device,
|
|||
subpass->samples = 0;
|
||||
subpass->srgb_cntl = 0;
|
||||
|
||||
const VkSubpassDescriptionFlagBits raster_order_access_bits =
|
||||
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_ARM |
|
||||
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM |
|
||||
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM;
|
||||
|
||||
subpass->raster_order_attachment_access = desc->flags & raster_order_access_bits;
|
||||
|
||||
subpass->multiview_mask = desc->viewMask;
|
||||
|
||||
if (desc->inputAttachmentCount > 0) {
|
||||
|
|
|
@ -274,6 +274,8 @@ struct tu_pipeline_builder
|
|||
uint32_t render_components;
|
||||
uint32_t multiview_mask;
|
||||
|
||||
bool subpass_raster_order_attachment_access;
|
||||
bool subpass_feedback_loop_color;
|
||||
bool subpass_feedback_loop_ds;
|
||||
};
|
||||
|
||||
|
@ -3151,6 +3153,78 @@ tu_pipeline_builder_parse_multisample_and_color_blend(
|
|||
}
|
||||
}
|
||||
|
||||
/* Select the GRAS_SC_CNTL SINGLE_PRIM_MODE for this pipeline and record it
 * as two draw states, pipeline->prim_order_state_gmem and
 * pipeline->prim_order_state_sysmem, each holding one GRAS_SC_CNTL write.
 *
 * Mode selection:
 *  - rasterization-order attachment access requested through the color
 *    blend or depth/stencil create flags: FLUSH_PER_OVERLAP_AND_OVERWRITE
 *    for sysmem and FLUSH_PER_OVERLAP for gmem;
 *  - otherwise, if the subpass has a color or depth/stencil feedback loop:
 *    FLUSH_PER_OVERLAP_AND_OVERWRITE for sysmem only;
 *  - otherwise NO_FLUSH for both.
 *
 * Returns early, before writing anything to the pipeline, when the builder
 * has rasterizer discard set.
 *
 * NOTE(review): builder->subpass_raster_order_attachment_access is not
 * consulted here; only the pipeline create flags are. Confirm that this is
 * intended.
 */
static void
|
||||
tu_pipeline_builder_parse_rasterization_order(
|
||||
struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
|
||||
{
|
||||
if (builder->rasterizer_discard)
|
||||
return;
|
||||
|
||||
pipeline->subpass_feedback_loop_ds = builder->subpass_feedback_loop_ds;
|
||||
|
||||
/* blend_info and ds_info are only dereferenced below when the builder
 * reports that the matching attachment kind is in use.
 */
const VkPipelineColorBlendStateCreateInfo *blend_info =
|
||||
builder->create_info->pColorBlendState;
|
||||
|
||||
const VkPipelineDepthStencilStateCreateInfo *ds_info =
|
||||
builder->create_info->pDepthStencilState;
|
||||
|
||||
if (builder->use_color_attachments) {
|
||||
pipeline->raster_order_attachment_access =
|
||||
blend_info->flags &
|
||||
VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM;
|
||||
}
|
||||
|
||||
if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
|
||||
pipeline->raster_order_attachment_access |=
|
||||
ds_info->flags &
|
||||
(VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM |
|
||||
VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM);
|
||||
}
|
||||
|
||||
/* VK_EXT_blend_operation_advanced would also require ordered access
|
||||
* when implemented in the future.
|
||||
*/
|
||||
|
||||
uint32_t sysmem_prim_mode = NO_FLUSH;
|
||||
uint32_t gmem_prim_mode = NO_FLUSH;
|
||||
|
||||
if (pipeline->raster_order_attachment_access) {
|
||||
/* VK_ARM_rasterization_order_attachment_access:
|
||||
*
|
||||
* This extension allows access to framebuffer attachments when used as
|
||||
* both input and color attachments from one fragment to the next,
|
||||
* in rasterization order, without explicit synchronization.
|
||||
*/
|
||||
sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
|
||||
gmem_prim_mode = FLUSH_PER_OVERLAP;
|
||||
} else {
|
||||
/* If there is a feedback loop, then the shader can read the previous value
|
||||
* of a pixel being written out. It can also write some components and then
|
||||
* read different components without a barrier in between. This is a
|
||||
* problem in sysmem mode with UBWC, because the main buffer and flags
|
||||
* buffer can get out-of-sync if only one is flushed. We fix this by
|
||||
* setting the SINGLE_PRIM_MODE field to the same value that the blob does
|
||||
* for advanced_blend in sysmem mode if a feedback loop is detected.
|
||||
*/
|
||||
if (builder->subpass_feedback_loop_color ||
|
||||
builder->subpass_feedback_loop_ds) {
|
||||
sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
|
||||
}
|
||||
}
|
||||
|
||||
struct tu_cs cs;
|
||||
|
||||
/* NOTE(review): the size argument 2 presumably covers the single register
 * write emitted into each draw state - confirm against
 * tu_cs_emit_write_reg.
 */
pipeline->prim_order_state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
|
||||
tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
|
||||
A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
|
||||
A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
|
||||
|
||||
pipeline->prim_order_state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
|
||||
tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
|
||||
A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
|
||||
A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode));
|
||||
}
|
||||
|
||||
static void
|
||||
tu_pipeline_finish(struct tu_pipeline *pipeline,
|
||||
struct tu_device *dev,
|
||||
|
@ -3176,7 +3250,6 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
|
|||
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||
|
||||
(*pipeline)->layout = builder->layout;
|
||||
(*pipeline)->subpass_feedback_loop_ds = builder->subpass_feedback_loop_ds;
|
||||
(*pipeline)->executables_mem_ctx = ralloc_context(NULL);
|
||||
util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
|
||||
|
||||
|
@ -3236,6 +3309,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
|
|||
tu_pipeline_builder_parse_rasterization(builder, *pipeline);
|
||||
tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
|
||||
tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
|
||||
tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
|
||||
tu6_emit_load_state(*pipeline, false);
|
||||
|
||||
/* we should have reserved enough space upfront such that the CS never
|
||||
|
@ -3290,6 +3364,9 @@ tu_pipeline_builder_init_graphics(
|
|||
const struct tu_subpass *subpass =
|
||||
&pass->subpasses[create_info->subpass];
|
||||
|
||||
builder->subpass_raster_order_attachment_access =
|
||||
subpass->raster_order_attachment_access;
|
||||
builder->subpass_feedback_loop_color = subpass->feedback_loop_color;
|
||||
builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds;
|
||||
|
||||
builder->multiview_mask = subpass->multiview_mask;
|
||||
|
|
|
@ -616,6 +616,8 @@ enum tu_draw_state_group_id
|
|||
TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
|
||||
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
|
||||
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
|
||||
TU_DRAW_STATE_PRIM_MODE_GMEM,
|
||||
TU_DRAW_STATE_PRIM_MODE_SYSMEM,
|
||||
|
||||
/* dynamic state related draw states */
|
||||
TU_DRAW_STATE_DYNAMIC,
|
||||
|
@ -1317,6 +1319,7 @@ struct tu_pipeline
|
|||
|
||||
/* draw states for the pipeline */
|
||||
struct tu_draw_state load_state, rast_state, blend_state;
|
||||
struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;
|
||||
|
||||
/* for vertex buffers state */
|
||||
uint32_t num_vbs;
|
||||
|
@ -1359,6 +1362,8 @@ struct tu_pipeline
|
|||
|
||||
struct tu_lrz_pipeline lrz;
|
||||
|
||||
/* In other words - framebuffer fetch support */
|
||||
bool raster_order_attachment_access;
|
||||
bool subpass_feedback_loop_ds;
|
||||
|
||||
/* Base drawcall cost for sysmem vs gmem autotuner */
|
||||
|
@ -1701,6 +1706,9 @@ struct tu_subpass
|
|||
/* True if we must invalidate UCHE thanks to a feedback loop. */
|
||||
bool feedback_invalidate;
|
||||
|
||||
/* In other words - framebuffer fetch support */
|
||||
bool raster_order_attachment_access;
|
||||
|
||||
struct tu_subpass_attachment *input_attachments;
|
||||
struct tu_subpass_attachment *color_attachments;
|
||||
struct tu_subpass_attachment *resolve_attachments;
|
||||
|
|
Loading…
Reference in New Issue