From 59b75b5b1d60ad2686ee97bca02781ef6a4f0e85 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 23 Nov 2021 14:45:16 +0100 Subject: [PATCH] vkd3d: Implement some advanced use cases of ExecuteIndirect. Signed-off-by: Hans-Kristian Arntzen --- libs/vkd3d/command.c | 603 ++++++++++++++++++++++++++++++++++++- libs/vkd3d/vkd3d_private.h | 12 + 2 files changed, 602 insertions(+), 13 deletions(-) diff --git a/libs/vkd3d/command.c b/libs/vkd3d/command.c index 93964b76..fc4ce2dd 100644 --- a/libs/vkd3d/command.c +++ b/libs/vkd3d/command.c @@ -9408,6 +9408,218 @@ STATIC_ASSERT(sizeof(VkDrawIndexedIndirectCommand) == sizeof(D3D12_DRAW_INDEXED_ STATIC_ASSERT(sizeof(VkDrawIndirectCommand) == sizeof(D3D12_DRAW_ARGUMENTS)); STATIC_ASSERT(offsetof(VkTraceRaysIndirectCommand2KHR, depth) == offsetof(D3D12_DISPATCH_RAYS_DESC, Depth)); +static HRESULT d3d12_command_signature_allocate_stream_memory_for_list( + struct d3d12_command_list *list, + struct d3d12_command_signature *signature, + uint32_t max_command_count, + struct vkd3d_scratch_allocation *allocation); + +static HRESULT d3d12_command_signature_allocate_preprocess_memory_for_list( + struct d3d12_command_list *list, + struct d3d12_command_signature *signature, VkPipeline render_pipeline, + uint32_t max_command_count, + struct vkd3d_scratch_allocation *allocation, VkDeviceSize *size); + +static void d3d12_command_list_execute_indirect_state_template( + struct d3d12_command_list *list, struct d3d12_command_signature *signature, + uint32_t max_command_count, + struct d3d12_resource *arg_buffer, UINT64 arg_buffer_offset, + struct d3d12_resource *count_buffer, UINT64 count_buffer_offset) +{ + const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs; + struct vkd3d_scratch_allocation preprocess_allocation; + struct vkd3d_scratch_allocation stream_allocation; + struct vkd3d_scratch_allocation count_allocation; + struct vkd3d_execute_indirect_args patch_args; + VkGeneratedCommandsInfoNV generated; + VkIndirectCommandsStreamNV stream; + VkDeviceSize preprocess_size; + VkPipeline current_pipeline; + VkMemoryBarrier barrier; + unsigned int i; + HRESULT hr; + + /* To build device generated commands, we need to know the pipeline we're going to render with. */ + if (!d3d12_command_list_update_graphics_pipeline(list)) + return; + current_pipeline = list->current_pipeline; + + /* FIXME: If we're forced to emit non-dynamic vertex strides, and the indirect state + * wants to emit dynamic VBOs (dynamic stride), can that possibly work? Extremely unlikely to + * actually happen in practice, but something to consider for later ... */ + + /* TODO: If we can prove that there have been no transitions to INDIRECT state, + * we can hoist all patch jobs to the beginning of the command buffer and build a fixup + * command buffer that batches everything. For now, take the slow path always. */ + d3d12_command_list_end_current_render_pass(list, true); + d3d12_command_list_invalidate_current_pipeline(list, true); + + memset(&patch_args, 0, sizeof(patch_args)); + if (FAILED(hr = d3d12_command_signature_allocate_preprocess_memory_for_list( + list, signature, current_pipeline, + max_command_count, &preprocess_allocation, &preprocess_size))) + { + WARN("Failed to allocate preprocess memory.\n"); + return; + } + + if (FAILED(hr = d3d12_command_signature_allocate_stream_memory_for_list( + list, signature, max_command_count, &stream_allocation))) + { + WARN("Failed to allocate stream memory.\n"); + return; + } + + if (count_buffer) + { + if (FAILED(hr = d3d12_command_allocator_allocate_scratch_memory(list->allocator, + sizeof(uint32_t), + list->device->device_info.device_generated_commands_properties_nv.minSequencesCountBufferOffsetAlignment, + &count_allocation))) + { + WARN("Failed to allocate count memory.\n"); + return; + } + } + + patch_args.template_va = signature->state_template.buffer_va; + patch_args.api_buffer_va = d3d12_resource_get_va(arg_buffer, arg_buffer_offset); + patch_args.device_generated_commands_va = stream_allocation.va; + patch_args.indirect_count_va = count_buffer ? d3d12_resource_get_va(count_buffer, count_buffer_offset) : 0; + patch_args.dst_indirect_count_va = count_buffer ? count_allocation.va : 0; + patch_args.api_buffer_word_stride = signature->desc.ByteStride / sizeof(uint32_t); + patch_args.device_generated_commands_word_stride = signature->state_template.stride / sizeof(uint32_t); + + VK_CALL(vkCmdPushConstants(list->vk_command_buffer, signature->state_template.pipeline.vk_pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(patch_args), &patch_args)); + VK_CALL(vkCmdBindPipeline(list->vk_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, + signature->state_template.pipeline.vk_pipeline)); + + /* TODO: We can batch the {prologue barrier} { work } { work } ... {epilogue barrier} later. */ + /* The argument buffer and indirect count buffers are in indirect state, but we'll need to read it. */ + barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; + barrier.pNext = NULL; + + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &barrier, 0, NULL, 0, NULL)); + + /* One workgroup processes the patching for one draw. We could potentially use indirect dispatch + * to restrict the patching work to just the indirect count, but meh, just more barriers. + * We'll nop out the workgroup early based on direct count, and the number of threads should be trivial either way. */ + VK_CALL(vkCmdDispatch(list->vk_command_buffer, max_command_count, 1, 1)); + + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_COMMAND_PREPROCESS_READ_BIT_NV | VK_ACCESS_INDIRECT_COMMAND_READ_BIT; + VK_CALL(vkCmdPipelineBarrier(list->vk_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMMAND_PREPROCESS_BIT_NV, + 0, 1, &barrier, 0, NULL, 0, NULL)); + + if (!d3d12_command_list_begin_render_pass(list)) + { + WARN("Failed to begin render pass, ignoring draw.\n"); + return; + } + + /* Bind IBO. If we always update the IBO indirectly, do not validate the index buffer here. + * We can render fine even with a NULL IBO bound. */ + for (i = 0; i < signature->desc.NumArgumentDescs; i++) + if (signature->desc.pArgumentDescs[i].Type == D3D12_INDIRECT_ARGUMENT_TYPE_INDEX_BUFFER_VIEW) + break; + + if (i == signature->desc.NumArgumentDescs && + signature->desc.pArgumentDescs[signature->desc.NumArgumentDescs - 1].Type == + D3D12_INDIRECT_ARGUMENT_TYPE_DRAW_INDEXED && + !d3d12_command_list_update_index_buffer(list)) + { + return; + } + + generated.sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_INFO_NV; + generated.pNext = NULL; + generated.pipeline = list->current_pipeline; + generated.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + generated.indirectCommandsLayout = signature->state_template.layout; + generated.streamCount = 1; + generated.pStreams = &stream; + generated.preprocessBuffer = preprocess_allocation.buffer; + generated.preprocessOffset = preprocess_allocation.offset; + generated.preprocessSize = preprocess_size; + generated.sequencesCount = max_command_count; + generated.sequencesIndexBuffer = VK_NULL_HANDLE; + generated.sequencesIndexOffset = 0; + + if (count_buffer) + { + generated.sequencesCountBuffer = count_allocation.buffer; + generated.sequencesCountOffset = count_allocation.offset; + } + else + { + generated.sequencesCountBuffer = VK_NULL_HANDLE; + generated.sequencesCountOffset = 0; + } + + stream.buffer = stream_allocation.buffer; + stream.offset = stream_allocation.offset; + + VK_CALL(vkCmdExecuteGeneratedCommandsNV(list->vk_command_buffer, VK_FALSE, &generated)); + + /* Need to clear state to zero if it was part of a command signature. */ + for (i = 0; i < signature->desc.NumArgumentDescs; i++) + { + const D3D12_INDIRECT_ARGUMENT_DESC *arg = &signature->desc.pArgumentDescs[i]; + switch (arg->Type) + { + case D3D12_INDIRECT_ARGUMENT_TYPE_INDEX_BUFFER_VIEW: + /* Null IBO */ + list->index_buffer.is_non_null = false; + break; + + case D3D12_INDIRECT_ARGUMENT_TYPE_VERTEX_BUFFER_VIEW: + { + /* Null VBO */ + uint32_t slot = arg->VertexBuffer.Slot; + list->dynamic_state.vertex_buffers[slot] = VK_NULL_HANDLE; + list->dynamic_state.vertex_strides[slot] = 0; + list->dynamic_state.vertex_offsets[slot] = 0; + list->dynamic_state.vertex_sizes[slot] = 0; + break; + } + + case D3D12_INDIRECT_ARGUMENT_TYPE_CONSTANT_BUFFER_VIEW: + case D3D12_INDIRECT_ARGUMENT_TYPE_SHADER_RESOURCE_VIEW: + case D3D12_INDIRECT_ARGUMENT_TYPE_UNORDERED_ACCESS_VIEW: + { + uint32_t index = arg->ConstantBufferView.RootParameterIndex; + d3d12_command_list_set_root_descriptor(list, + VK_PIPELINE_BIND_POINT_GRAPHICS, index, 0); + break; + } + + case D3D12_INDIRECT_ARGUMENT_TYPE_CONSTANT: + { + uint32_t zeroes[D3D12_MAX_ROOT_COST]; + memset(zeroes, 0, sizeof(uint32_t) * arg->Constant.Num32BitValuesToSet); + d3d12_command_list_set_root_constants(list, + VK_PIPELINE_BIND_POINT_GRAPHICS, arg->Constant.RootParameterIndex, + arg->Constant.DestOffsetIn32BitValues, + arg->Constant.Num32BitValuesToSet, zeroes); + break; + } + + default: + break; + } + } + + /* Spec mentions that all state related to the bind point is undefined after this, so + * invalidate all state. Unclear exactly which state is invalidated though ... + * Treat it as a meta shader. We need to nuke all state after running execute generated commands. */ + d3d12_command_list_invalidate_all_state(list); +} + static void STDMETHODCALLTYPE d3d12_command_list_ExecuteIndirect(d3d12_command_list_iface *iface, ID3D12CommandSignature *command_signature, UINT max_command_count, ID3D12Resource *arg_buffer, UINT64 arg_buffer_offset, ID3D12Resource *count_buffer, UINT64 count_buffer_offset) @@ -9426,12 +9638,27 @@ static void STDMETHODCALLTYPE d3d12_command_list_ExecuteIndirect(d3d12_command_l iface, command_signature, max_command_count, arg_buffer, arg_buffer_offset, count_buffer, count_buffer_offset); + if (!max_command_count) + return; + if ((count_buffer || list->predicate_va) && !list->device->vk_info.KHR_draw_indirect_count) { FIXME("Count buffers not supported by Vulkan implementation.\n"); return; } + if (sig_impl->requires_state_template) + { + /* Complex execute indirect path. */ + if (list->predicate_va) + FIXME("Predicated ExecuteIndirect with state template not supported yet. Ignoring predicate.\n"); + d3d12_command_list_execute_indirect_state_template(list, sig_impl, + max_command_count, + arg_impl, arg_buffer_offset, + count_impl, count_buffer_offset); + return; + } + /* Temporary workaround, since we cannot parse non-draw arguments yet. Point directly * to the first argument. Should avoid hard crashes for now. */ arg_buffer_offset += sig_impl->argument_buffer_offset; @@ -12186,6 +12413,22 @@ static ULONG STDMETHODCALLTYPE d3d12_command_signature_AddRef(ID3D12CommandSigna return refcount; } +static void d3d12_command_signature_cleanup(struct d3d12_command_signature *signature) +{ + const struct vkd3d_vk_device_procs *vk_procs = &signature->device->vk_procs; + + if (signature->device->device_info.device_generated_commands_features_nv.deviceGeneratedCommands) + { + VK_CALL(vkDestroyBuffer(signature->device->vk_device, signature->state_template.buffer, NULL)); + vkd3d_free_device_memory(signature->device, &signature->state_template.memory); + VK_CALL(vkDestroyIndirectCommandsLayoutNV(signature->device->vk_device, signature->state_template.layout, NULL)); + } + + vkd3d_private_store_destroy(&signature->private_store); + vkd3d_free((void *)signature->desc.pArgumentDescs); + vkd3d_free(signature); +} + static ULONG STDMETHODCALLTYPE d3d12_command_signature_Release(ID3D12CommandSignature *iface) { struct d3d12_command_signature *signature = impl_from_ID3D12CommandSignature(iface); @@ -12196,12 +12439,7 @@ static ULONG STDMETHODCALLTYPE d3d12_command_signature_Release(ID3D12CommandSign if (!refcount) { struct d3d12_device *device = signature->device; - - vkd3d_private_store_destroy(&signature->private_store); - - vkd3d_free((void *)signature->desc.pArgumentDescs); - vkd3d_free(signature); - + d3d12_command_signature_cleanup(signature); d3d12_device_release(device); } @@ -12264,12 +12502,341 @@ CONST_VTBL struct ID3D12CommandSignatureVtbl d3d12_command_signature_vtbl = d3d12_command_signature_GetDevice, }; +enum vkd3d_patch_command_token +{ + VKD3D_PATCH_COMMAND_TOKEN_COPY_U32 = 0, + VKD3D_PATCH_COMMAND_TOKEN_COPY_INDEX_FORMAT = 1, + VKD3D_PATCH_COMMAND_INT_MAX = 0x7fffffff +}; + +struct vkd3d_patch_command +{ + enum vkd3d_patch_command_token token; + uint32_t src_offset; + uint32_t dst_offset; +}; + +static HRESULT d3d12_command_signature_init_patch_commands_buffer(struct d3d12_command_signature *signature, + struct d3d12_device *device, + const struct vkd3d_patch_command *commands, size_t command_count) +{ + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + D3D12_RESOURCE_DESC1 buffer_desc; + D3D12_HEAP_PROPERTIES heap_info; + HRESULT hr = S_OK; + VkResult vr; + void *ptr; + + memset(&heap_info, 0, sizeof(heap_info)); + heap_info.Type = D3D12_HEAP_TYPE_UPLOAD; + memset(&buffer_desc, 0, sizeof(buffer_desc)); + buffer_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + buffer_desc.Width = command_count * sizeof(struct vkd3d_patch_command); + buffer_desc.Height = 1; + buffer_desc.DepthOrArraySize = 1; + buffer_desc.SampleDesc.Count = 1; + buffer_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + buffer_desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; + + if (FAILED(hr = vkd3d_create_buffer(device, &heap_info, D3D12_HEAP_FLAG_NONE, + &buffer_desc, &signature->state_template.buffer))) + return hr; + + if (FAILED(hr = vkd3d_allocate_buffer_memory(device, signature->state_template.buffer, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + &signature->state_template.memory))) + return hr; + + signature->state_template.buffer_va = vkd3d_get_buffer_device_address(device, + signature->state_template.buffer); + + if ((vr = VK_CALL(vkMapMemory(device->vk_device, signature->state_template.memory.vk_memory, + 0, VK_WHOLE_SIZE, 0, (void**)&ptr)))) + return hr; + + memcpy(ptr, commands, command_count * sizeof(struct vkd3d_patch_command)); + VK_CALL(vkUnmapMemory(device->vk_device, signature->state_template.memory.vk_memory)); + + return hr; +} + +static HRESULT d3d12_command_signature_init_indirect_commands_layout( + struct d3d12_command_signature *signature, struct d3d12_device *device, + const VkIndirectCommandsLayoutTokenNV *tokens, uint32_t token_count, + uint32_t stream_stride) +{ + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + VkIndirectCommandsLayoutCreateInfoNV create_info; + VkResult vr; + + create_info.sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NV; + create_info.pNext = NULL; + create_info.flags = 0; + create_info.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + create_info.streamCount = 1; + create_info.pStreamStrides = &stream_stride; + create_info.tokenCount = token_count; + create_info.pTokens = tokens; + + signature->state_template.stride = stream_stride; + + if (token_count > device->device_info.device_generated_commands_properties_nv.maxIndirectCommandsTokenCount) + { + FIXME("Token count %u is too large (max %u).\n", + token_count, device->device_info.device_generated_commands_properties_nv.maxIndirectCommandsTokenCount); + return E_NOTIMPL; + } + + vr = VK_CALL(vkCreateIndirectCommandsLayoutNV(device->vk_device, &create_info, NULL, &signature->state_template.layout)); + return hresult_from_vk_result(vr); +} + +static HRESULT d3d12_command_signature_allocate_stream_memory_for_list( + struct d3d12_command_list *list, + struct d3d12_command_signature *signature, + uint32_t max_command_count, + struct vkd3d_scratch_allocation *allocation) +{ + if (!d3d12_command_allocator_allocate_scratch_memory(list->allocator, + max_command_count * signature->state_template.stride, + list->device->device_info.device_generated_commands_properties_nv.minIndirectCommandsBufferOffsetAlignment, + allocation)) + return E_OUTOFMEMORY; + + return S_OK; +} + +static HRESULT d3d12_command_signature_allocate_preprocess_memory_for_list( + struct d3d12_command_list *list, + struct d3d12_command_signature *signature, VkPipeline render_pipeline, + uint32_t max_command_count, + struct vkd3d_scratch_allocation *allocation, VkDeviceSize *size) +{ + const struct vkd3d_vk_device_procs *vk_procs = &list->device->vk_procs; + VkGeneratedCommandsMemoryRequirementsInfoNV info; + VkMemoryRequirements2 memory_info; + + memory_info.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2; + memory_info.pNext = NULL; + + info.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + info.sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_MEMORY_REQUIREMENTS_INFO_NV; + info.pNext = NULL; + info.maxSequencesCount = max_command_count; + info.pipeline = render_pipeline; + info.indirectCommandsLayout = signature->state_template.layout; + + if (max_command_count > list->device->device_info.device_generated_commands_properties_nv.maxIndirectSequenceCount) + { + FIXME("max_command_count %u exceeds device limit %u.\n", + max_command_count, + list->device->device_info.device_generated_commands_properties_nv.maxIndirectSequenceCount); + return E_NOTIMPL; + } + + VK_CALL(vkGetGeneratedCommandsMemoryRequirementsNV(list->device->vk_device, &info, &memory_info)); + + if (!d3d12_command_allocator_allocate_scratch_memory(list->allocator, memory_info.memoryRequirements.size, + list->device->device_info.device_generated_commands_properties_nv.minIndirectCommandsBufferOffsetAlignment, + allocation)) + return E_OUTOFMEMORY; + + /* Going to assume the memory type is okay ... It's device local after all. */ + *size = memory_info.memoryRequirements.size; + return S_OK; +} + +static HRESULT d3d12_command_signature_init_state_template(struct d3d12_command_signature *signature, + const D3D12_COMMAND_SIGNATURE_DESC *desc, + struct d3d12_root_signature *root_signature, + struct d3d12_device *device) +{ + const struct vkd3d_shader_root_parameter *root_parameter; + const struct vkd3d_shader_root_constant *root_constant; + struct vkd3d_patch_command *patch_commands = NULL; + VkIndirectCommandsLayoutTokenNV *tokens = NULL; + VkIndirectCommandsLayoutTokenNV token; + uint32_t generic_u32_copy_count; + size_t patch_commands_count = 0; + size_t patch_commands_size = 0; + uint32_t root_parameter_index; + uint32_t src_word_offset = 0; + uint32_t stream_stride = 0; + uint32_t dst_word_offset; + size_t token_count = 0; + size_t token_size = 0; + HRESULT hr = S_OK; + uint32_t i, j; + + if (!device->device_info.device_generated_commands_features_nv.deviceGeneratedCommands) + { + WARN("Device generated commands not supported, indirect state commands will be ignored.\n"); + return S_OK; + } + + for (i = 0; i < desc->NumArgumentDescs; i++) + { + const D3D12_INDIRECT_ARGUMENT_DESC *argument_desc = &desc->pArgumentDescs[i]; + memset(&token, 0, sizeof(token)); + token.sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV; + generic_u32_copy_count = 0; + dst_word_offset = 0; + + switch (argument_desc->Type) + { + case D3D12_INDIRECT_ARGUMENT_TYPE_CONSTANT: + root_parameter_index = argument_desc->Constant.RootParameterIndex; + root_constant = root_signature_get_32bit_constants(root_signature, root_parameter_index); + token.tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV; + token.pushconstantPipelineLayout = root_signature->graphics.vk_pipeline_layout; + token.pushconstantShaderStageFlags = root_signature->graphics.vk_push_stages; + token.pushconstantOffset = root_constant->constant_index + argument_desc->Constant.DestOffsetIn32BitValues; + token.pushconstantSize = argument_desc->Constant.Num32BitValuesToSet; + token.pushconstantOffset *= sizeof(uint32_t); + token.pushconstantSize *= sizeof(uint32_t); + + stream_stride = align(stream_stride, sizeof(uint32_t)); + token.offset = stream_stride; + stream_stride += token.pushconstantSize; + dst_word_offset = token.offset / sizeof(uint32_t); + + generic_u32_copy_count = argument_desc->Constant.Num32BitValuesToSet; + break; + + case D3D12_INDIRECT_ARGUMENT_TYPE_UNORDERED_ACCESS_VIEW: + case D3D12_INDIRECT_ARGUMENT_TYPE_SHADER_RESOURCE_VIEW: + case D3D12_INDIRECT_ARGUMENT_TYPE_CONSTANT_BUFFER_VIEW: + root_parameter_index = argument_desc->ShaderResourceView.RootParameterIndex; + root_parameter = root_signature_get_parameter(root_signature, root_parameter_index); + if (!(root_signature->root_descriptor_raw_va_mask & (1ull << root_parameter_index))) + { + ERR("Root parameter %u is not a raw VA. Cannot implement command signature which updates root descriptor.\n", + root_parameter_index); + hr = E_NOTIMPL; + goto end; + } + + token.tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV; + token.pushconstantPipelineLayout = root_signature->graphics.vk_pipeline_layout; + token.pushconstantShaderStageFlags = root_signature->graphics.vk_push_stages; + token.pushconstantOffset = root_parameter->descriptor.raw_va_root_descriptor_index * sizeof(VkDeviceAddress); + token.pushconstantSize = sizeof(VkDeviceAddress); + + stream_stride = align(stream_stride, sizeof(uint32_t)); + token.offset = stream_stride; + stream_stride += token.pushconstantSize; + dst_word_offset = token.offset / sizeof(uint32_t); + + /* Simply patch by copying U32s. Need to handle unaligned U32s since everything is tightly packed. */ + generic_u32_copy_count = sizeof(VkDeviceAddress) / sizeof(uint32_t); + break; + + case D3D12_INDIRECT_ARGUMENT_TYPE_VERTEX_BUFFER_VIEW: + token.tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV; + token.vertexBindingUnit = argument_desc->VertexBuffer.Slot; + token.vertexDynamicStride = VK_TRUE; + + stream_stride = align(stream_stride, sizeof(VkDeviceAddress)); + token.offset = stream_stride; + stream_stride += sizeof(VkBindVertexBufferIndirectCommandNV); + dst_word_offset = token.offset / sizeof(uint32_t); + + /* The VBV indirect layout is the same as DX, so just copy the U32s. */ + generic_u32_copy_count = sizeof(D3D12_VERTEX_BUFFER_VIEW) / sizeof(uint32_t); + break; + + case D3D12_INDIRECT_ARGUMENT_TYPE_INDEX_BUFFER_VIEW: + token.tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV; + stream_stride = align(stream_stride, sizeof(VkDeviceAddress)); + token.offset = stream_stride; + stream_stride += sizeof(VkBindVertexBufferIndirectCommandNV); + dst_word_offset = token.offset / sizeof(uint32_t); + + /* Need to convert the index buffer type. Could maybe use the pIndexTypes LUT feature, + * but we'd need a pretty chonkers LUT to make that work ... */ + vkd3d_array_reserve((void**)&patch_commands, &patch_commands_size, + patch_commands_count + sizeof(D3D12_INDEX_BUFFER_VIEW) / sizeof(uint32_t), + sizeof(*patch_commands)); + + for (j = 0; j < 3; j++) + { + patch_commands[patch_commands_count].token = VKD3D_PATCH_COMMAND_TOKEN_COPY_U32; + patch_commands[patch_commands_count].src_offset = src_word_offset++; + patch_commands[patch_commands_count].dst_offset = dst_word_offset++; + patch_commands_count++; + } + + patch_commands[patch_commands_count].token = VKD3D_PATCH_COMMAND_TOKEN_COPY_INDEX_FORMAT; + patch_commands[patch_commands_count].src_offset = src_word_offset++; + patch_commands[patch_commands_count].dst_offset = dst_word_offset++; + patch_commands_count++; + break; + + case D3D12_INDIRECT_ARGUMENT_TYPE_DRAW: + token.tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV; + stream_stride = align(stream_stride, sizeof(uint32_t)); + token.offset = stream_stride; + stream_stride += sizeof(VkDrawIndirectCommand); + dst_word_offset = token.offset / sizeof(uint32_t); + generic_u32_copy_count = sizeof(VkDrawIndirectCommand) / sizeof(uint32_t); + break; + + case D3D12_INDIRECT_ARGUMENT_TYPE_DRAW_INDEXED: + token.tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV; + stream_stride = align(stream_stride, sizeof(uint32_t)); + token.offset = stream_stride; + stream_stride += sizeof(VkDrawIndexedIndirectCommand); + dst_word_offset = token.offset / sizeof(uint32_t); + generic_u32_copy_count = sizeof(VkDrawIndexedIndirectCommand) / sizeof(uint32_t); + break; + + default: + FIXME("Unsupported token type %u.\n", argument_desc->Type); + hr = E_NOTIMPL; + goto end; + } + + vkd3d_array_reserve((void**)&tokens, &token_size, token_count + 1, sizeof(*tokens)); + tokens[token_count++] = token; + + if (generic_u32_copy_count) + { + vkd3d_array_reserve((void**)&patch_commands, &patch_commands_size, + patch_commands_count + generic_u32_copy_count, + sizeof(*patch_commands)); + + /* Simply patch by copying U32s. */ + for (j = 0; j < generic_u32_copy_count; j++, patch_commands_count++) + { + patch_commands[patch_commands_count].token = VKD3D_PATCH_COMMAND_TOKEN_COPY_U32; + patch_commands[patch_commands_count].src_offset = src_word_offset++; + patch_commands[patch_commands_count].dst_offset = dst_word_offset++; + } + } + } + + stream_stride = align(stream_stride, sizeof(VkDeviceAddress)); + + if (FAILED(hr = d3d12_command_signature_init_patch_commands_buffer(signature, device, patch_commands, patch_commands_count))) + goto end; + if (FAILED(hr = d3d12_command_signature_init_indirect_commands_layout(signature, device, tokens, token_count, stream_stride))) + goto end; + if (FAILED(hr = vkd3d_meta_get_execute_indirect_pipeline(&device->meta_ops, patch_commands_count, &signature->state_template.pipeline))) + goto end; + +end: + vkd3d_free(tokens); + vkd3d_free(patch_commands); + return hr; +} + HRESULT d3d12_command_signature_create(struct d3d12_device *device, struct d3d12_root_signature *root_signature, const D3D12_COMMAND_SIGNATURE_DESC *desc, struct d3d12_command_signature **signature) { struct d3d12_command_signature *object; bool requires_root_signature = false; + bool requires_state_template = false; uint32_t argument_buffer_offset = 0; uint32_t signature_size = 0; bool has_action = false; @@ -12312,34 +12879,33 @@ HRESULT d3d12_command_signature_create(struct d3d12_device *device, struct d3d12 argument_buffer_offset = signature_size; signature_size += sizeof(D3D12_DISPATCH_MESH_ARGUMENTS); is_action = true; - FIXME("Unsupported indirect dispatch mesh.\n"); break; case D3D12_INDIRECT_ARGUMENT_TYPE_CONSTANT: requires_root_signature = true; + requires_state_template = true; signature_size += argument_desc->Constant.Num32BitValuesToSet * sizeof(uint32_t); - FIXME("Unsupported indirect argument type CONSTANT.\n"); break; case D3D12_INDIRECT_ARGUMENT_TYPE_SHADER_RESOURCE_VIEW: case D3D12_INDIRECT_ARGUMENT_TYPE_UNORDERED_ACCESS_VIEW: case D3D12_INDIRECT_ARGUMENT_TYPE_CONSTANT_BUFFER_VIEW: requires_root_signature = true; + requires_state_template = true; /* The command signature payload is *not* aligned. */ signature_size += sizeof(D3D12_GPU_VIRTUAL_ADDRESS); - FIXME("Unsupported indirect root descriptor type: %u.\n", argument_desc->Type); break; case D3D12_INDIRECT_ARGUMENT_TYPE_VERTEX_BUFFER_VIEW: /* The command signature payload is *not* aligned. */ signature_size += sizeof(D3D12_VERTEX_BUFFER_VIEW); - FIXME("Unsupported indirect VBV.\n"); + requires_state_template = true; break; case D3D12_INDIRECT_ARGUMENT_TYPE_INDEX_BUFFER_VIEW: /* The command signature payload is *not* aligned. */ signature_size += sizeof(D3D12_INDEX_BUFFER_VIEW); - FIXME("Unsupported indirect IBV.\n"); + requires_state_template = true; break; default: @@ -12389,11 +12955,10 @@ HRESULT d3d12_command_signature_create(struct d3d12_device *device, struct d3d12 return E_INVALIDARG; } - if (!(object = vkd3d_malloc(sizeof(*object)))) + if (!(object = vkd3d_calloc(1, sizeof(*object)))) return E_OUTOFMEMORY; object->ID3D12CommandSignature_iface.lpVtbl = &d3d12_command_signature_vtbl; - object->argument_buffer_offset = argument_buffer_offset; object->refcount = 1; object->desc = *desc; @@ -12412,6 +12977,18 @@ HRESULT d3d12_command_signature_create(struct d3d12_device *device, struct d3d12 return hr; } + if ((object->requires_state_template = requires_state_template)) + { + if (FAILED(hr = d3d12_command_signature_init_state_template(object, desc, root_signature, device))) + { + vkd3d_free((void *)object->desc.pArgumentDescs); + vkd3d_free(object); + return hr; + } + } + else + object->argument_buffer_offset = argument_buffer_offset; + d3d12_device_add_ref(object->device = device); TRACE("Created command signature %p.\n", object); diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 8a6c2b49..ebb9febd 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -2383,6 +2383,18 @@ struct d3d12_command_signature D3D12_COMMAND_SIGNATURE_DESC desc; uint32_t argument_buffer_offset; + /* Complex command signatures require some work to stamp out device generated commands. */ + struct + { + VkBuffer buffer; + VkDeviceAddress buffer_va; + struct vkd3d_device_memory_allocation memory; + VkIndirectCommandsLayoutNV layout; + uint32_t stride; + struct vkd3d_execute_indirect_info pipeline; + } state_template; + bool requires_state_template; + struct d3d12_device *device; struct vkd3d_private_store private_store;