vkd3d: Optimize ExecuteIndirect() if no INDIRECT transitions happened.
The D3D12 docs outline this as an implementation detail explicitly, so we should do the same thing. Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
This commit is contained in:
parent
18f1d1c72e
commit
bc759be2af
|
@ -4077,6 +4077,7 @@ static HRESULT d3d12_command_list_batch_reset_query_pools(struct d3d12_command_l
|
|||
static HRESULT d3d12_command_list_build_init_commands(struct d3d12_command_list *list)
|
||||
{
|
||||
const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs;
|
||||
VkMemoryBarrier barrier;
|
||||
VkResult vr;
|
||||
HRESULT hr;
|
||||
|
||||
|
@ -4086,6 +4087,18 @@ static HRESULT d3d12_command_list_build_init_commands(struct d3d12_command_list
|
|||
if (!list->vk_init_commands)
|
||||
return S_OK;
|
||||
|
||||
if (list->execute_indirect.has_emitted_indirect_to_compute_barrier)
|
||||
{
|
||||
/* We've patched an indirect command stream here, so do the final barrier now. */
|
||||
barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
|
||||
barrier.pNext = NULL;
|
||||
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
|
||||
barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
|
||||
VK_CALL(vkCmdPipelineBarrier(list->vk_init_commands, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
|
||||
0, 1, &barrier, 0, NULL, 0, NULL));
|
||||
}
|
||||
|
||||
if ((vr = VK_CALL(vkEndCommandBuffer(list->vk_init_commands))) < 0)
|
||||
{
|
||||
WARN("Failed to end command buffer, vr %d.\n", vr);
|
||||
|
@ -4389,6 +4402,8 @@ static void d3d12_command_list_reset_internal_state(struct d3d12_command_list *l
|
|||
list->tracked_copy_buffer_count = 0;
|
||||
|
||||
list->rendering_info.state_flags = 0;
|
||||
list->execute_indirect.has_emitted_indirect_to_compute_barrier = false;
|
||||
list->execute_indirect.has_observed_transition_to_indirect = false;
|
||||
}
|
||||
|
||||
static void d3d12_command_list_reset_state(struct d3d12_command_list *list,
|
||||
|
@ -7340,6 +7355,13 @@ static void STDMETHODCALLTYPE d3d12_command_list_ResourceBarrier(d3d12_command_l
|
|||
VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED;
|
||||
uint32_t dsv_decay_mask = 0;
|
||||
|
||||
/* If we have not observed any transition to INDIRECT_ARGUMENT it means
|
||||
* that in this command buffer there couldn't legally have been writes to an indirect
|
||||
* command buffer. The docs mention an implementation strategy where we can do this optimization.
|
||||
* This is very handy when handling back-to-back ExecuteIndirects(). */
|
||||
if (transition->StateAfter == D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT)
|
||||
list->execute_indirect.has_observed_transition_to_indirect = true;
|
||||
|
||||
if (!is_valid_resource_state(transition->StateBefore))
|
||||
{
|
||||
d3d12_command_list_mark_as_invalid(list,
|
||||
|
@ -9630,6 +9652,7 @@ static void d3d12_command_list_execute_indirect_state_template(
|
|||
struct vkd3d_scratch_allocation count_allocation;
|
||||
struct vkd3d_execute_indirect_args patch_args;
|
||||
VkGeneratedCommandsInfoNV generated;
|
||||
VkCommandBuffer vk_patch_cmd_buffer;
|
||||
VkIndirectCommandsStreamNV stream;
|
||||
VkDeviceSize preprocess_size;
|
||||
VkPipeline current_pipeline;
|
||||
|
@ -9648,16 +9671,6 @@ static void d3d12_command_list_execute_indirect_state_template(
|
|||
return;
|
||||
current_pipeline = list->current_pipeline;
|
||||
|
||||
/* FIXME: If we're forced to emit non-dynamic vertex strides, and the indirect state
|
||||
* wants to emit dynamic VBOs (dynamic stride), can that possibly work? Extremely unlikely to
|
||||
* actually happen in practice, but something to consider for later ... */
|
||||
|
||||
/* TODO: If we can prove that there have been no transitions to INDIRECT state,
|
||||
* we can hoist all patch jobs to the beginning of the command buffer and build a fixup
|
||||
* command buffer that batches everything. For now, take the slow path always. */
|
||||
d3d12_command_list_end_current_render_pass(list, true);
|
||||
d3d12_command_list_invalidate_current_pipeline(list, true);
|
||||
|
||||
memset(&patch_args, 0, sizeof(patch_args));
|
||||
if (FAILED(hr = d3d12_command_signature_allocate_preprocess_memory_for_list(
|
||||
list, signature, current_pipeline,
|
||||
|
@ -9695,31 +9708,51 @@ static void d3d12_command_list_execute_indirect_state_template(
|
|||
patch_args.api_buffer_word_stride = signature->desc.ByteStride / sizeof(uint32_t);
|
||||
patch_args.device_generated_commands_word_stride = signature->state_template.stride / sizeof(uint32_t);
|
||||
|
||||
VK_CALL(vkCmdPushConstants(list->vk_command_buffer, signature->state_template.pipeline.vk_pipeline_layout,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(patch_args), &patch_args));
|
||||
VK_CALL(vkCmdBindPipeline(list->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
signature->state_template.pipeline.vk_pipeline));
|
||||
|
||||
/* TODO: We can batch the {prologue barrier} { work } { work } ... {epilogue barrier} later. */
|
||||
/* The argument buffer and indirect count buffers are in indirect state, but we'll need to read it. */
|
||||
barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
|
||||
barrier.pNext = NULL;
|
||||
|
||||
barrier.srcAccessMask = 0;
|
||||
barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
|
||||
VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
|
||||
|
||||
if (!list->execute_indirect.has_observed_transition_to_indirect)
|
||||
{
|
||||
/* Fast path, throw the template resolve to the init command buffer. */
|
||||
d3d12_command_allocator_allocate_init_command_buffer(list->allocator, list);
|
||||
vk_patch_cmd_buffer = list->vk_init_commands;
|
||||
if (!list->execute_indirect.has_emitted_indirect_to_compute_barrier)
|
||||
{
|
||||
VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
|
||||
list->execute_indirect.has_emitted_indirect_to_compute_barrier = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
vk_patch_cmd_buffer = list->vk_command_buffer;
|
||||
d3d12_command_list_end_current_render_pass(list, true);
|
||||
VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL));
|
||||
d3d12_command_list_invalidate_current_pipeline(list, true);
|
||||
}
|
||||
|
||||
VK_CALL(vkCmdPushConstants(vk_patch_cmd_buffer, signature->state_template.pipeline.vk_pipeline_layout,
|
||||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(patch_args), &patch_args));
|
||||
VK_CALL(vkCmdBindPipeline(vk_patch_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
signature->state_template.pipeline.vk_pipeline));
|
||||
|
||||
/* One workgroup processes the patching for one draw. We could potentially use indirect dispatch
|
||||
* to restrict the patching work to just the indirect count, but meh, just more barriers.
|
||||
* We'll nop out the workgroup early based on direct count, and the number of threads should be trivial either way. */
|
||||
VK_CALL(vkCmdDispatch(list->vk_command_buffer, max_command_count, 1, 1));
|
||||
VK_CALL(vkCmdDispatch(vk_patch_cmd_buffer, max_command_count, 1, 1));
|
||||
|
||||
if (vk_patch_cmd_buffer == list->vk_command_buffer)
|
||||
{
|
||||
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
|
||||
barrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
|
||||
VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_CALL(vkCmdPipelineBarrier(vk_patch_cmd_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
|
||||
0, 1, &barrier, 0, NULL, 0, NULL));
|
||||
/* The barrier is deferred if we moved the dispatch to init command buffer. */
|
||||
}
|
||||
|
||||
if (!d3d12_command_list_begin_render_pass(list))
|
||||
{
|
||||
|
|
|
@ -2127,6 +2127,12 @@ struct d3d12_command_list
|
|||
bool is_dirty;
|
||||
} index_buffer;
|
||||
|
||||
struct
|
||||
{
|
||||
bool has_observed_transition_to_indirect;
|
||||
bool has_emitted_indirect_to_compute_barrier;
|
||||
} execute_indirect;
|
||||
|
||||
VkCommandBuffer vk_command_buffer;
|
||||
VkCommandBuffer vk_init_commands;
|
||||
|
||||
|
|
Loading…
Reference in New Issue