/* * Copyright © 2021 Bas Nieuwenhuizen * * SPDX-License-Identifier: MIT */ #include "radv_sqtt.h" #include "meta/radv_meta.h" #include "nir_builder.h" #include "radv_cs.h" #include "radv_entrypoints.h" #include "radix_sort/common/vk/barrier.h" #include "radix_sort/radv_radix_sort.h" #include "radix_sort/shaders/push.h" #include "bvh/build_interface.h" #include "bvh/bvh.h" #include "vk_acceleration_structure.h" #include "vk_common_entrypoints.h" static const uint32_t leaf_spv[] = { #include "bvh/leaf.spv.h" }; static const uint32_t leaf_always_active_spv[] = { #include "bvh/leaf_always_active.spv.h" }; static const uint32_t morton_spv[] = { #include "bvh/morton.spv.h" }; static const uint32_t lbvh_main_spv[] = { #include "bvh/lbvh_main.spv.h" }; static const uint32_t lbvh_generate_ir_spv[] = { #include "bvh/lbvh_generate_ir.spv.h" }; static const uint32_t ploc_spv[] = { #include "bvh/ploc_internal.spv.h" }; static const uint32_t copy_spv[] = { #include "bvh/copy.spv.h" }; static const uint32_t encode_spv[] = { #include "bvh/encode.spv.h" }; static const uint32_t encode_compact_spv[] = { #include "bvh/encode_compact.spv.h" }; static const uint32_t header_spv[] = { #include "bvh/header.spv.h" }; static const uint32_t update_spv[] = { #include "bvh/update.spv.h" }; #define KEY_ID_PAIR_SIZE 8 #define MORTON_BIT_SIZE 24 enum internal_build_type { INTERNAL_BUILD_TYPE_LBVH, INTERNAL_BUILD_TYPE_PLOC, INTERNAL_BUILD_TYPE_UPDATE, }; struct build_config { enum internal_build_type internal_type; bool compact; bool updateable; }; struct acceleration_structure_layout { uint32_t geometry_info_offset; uint32_t bvh_offset; uint32_t leaf_nodes_offset; uint32_t internal_nodes_offset; uint32_t size; }; struct scratch_layout { uint32_t size; uint32_t update_size; uint32_t header_offset; /* Used for UPDATE only. */ uint32_t internal_ready_count_offset; /* Used for BUILD only. 
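 * These offsets cover the keyval sort buffers, the radix-sort internal data
 * (aliased with the PLOC/LBVH scratch space below) and the intermediate IR nodes.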
*/ uint32_t sort_buffer_offset[2]; uint32_t sort_internal_offset; uint32_t ploc_prefix_sum_partition_offset; uint32_t lbvh_node_offset; uint32_t ir_offset; uint32_t internal_node_offset; }; static struct build_config build_config(uint32_t leaf_count, const VkAccelerationStructureBuildGeometryInfoKHR *build_info) { struct build_config config = {0}; if (leaf_count <= 4) config.internal_type = INTERNAL_BUILD_TYPE_LBVH; else if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) config.internal_type = INTERNAL_BUILD_TYPE_PLOC; else if (!(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_BUILD_BIT_KHR) && !(build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR)) config.internal_type = INTERNAL_BUILD_TYPE_PLOC; else config.internal_type = INTERNAL_BUILD_TYPE_LBVH; if (build_info->mode == VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR && build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) config.internal_type = INTERNAL_BUILD_TYPE_UPDATE; if ((build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR) && build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) config.updateable = true; if (build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR) config.compact = true; return config; } static void get_build_layout(struct radv_device *device, uint32_t leaf_count, const VkAccelerationStructureBuildGeometryInfoKHR *build_info, struct acceleration_structure_layout *accel_struct, struct scratch_layout *scratch) { uint32_t internal_count = MAX2(leaf_count, 2) - 1; VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; if (build_info->geometryCount) { if (build_info->pGeometries) geometry_type = build_info->pGeometries[0].geometryType; else geometry_type = build_info->ppGeometries[0]->geometryType; } uint32_t bvh_leaf_size; switch (geometry_type) { case VK_GEOMETRY_TYPE_TRIANGLES_KHR: bvh_leaf_size = sizeof(struct radv_bvh_triangle_node); break; case VK_GEOMETRY_TYPE_AABBS_KHR: bvh_leaf_size = sizeof(struct radv_bvh_aabb_node); break; case VK_GEOMETRY_TYPE_INSTANCES_KHR: bvh_leaf_size = sizeof(struct radv_bvh_instance_node); break; default: unreachable("Unknown VkGeometryTypeKHR"); } if (accel_struct) { uint64_t bvh_size = bvh_leaf_size * leaf_count + sizeof(struct radv_bvh_box32_node) * internal_count; uint32_t offset = 0; offset += sizeof(struct radv_accel_struct_header); if (device->rra_trace.accel_structs) { accel_struct->geometry_info_offset = offset; offset += sizeof(struct radv_accel_struct_geometry_info) * build_info->geometryCount; } /* Parent links, which have to go directly before bvh_offset as we index them using negative * offsets from there. */ offset += bvh_size / 64 * 4; /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */ offset = ALIGN(offset, 64); accel_struct->bvh_offset = offset; /* root node */ offset += sizeof(struct radv_bvh_box32_node); accel_struct->leaf_nodes_offset = offset; offset += bvh_leaf_size * leaf_count; accel_struct->internal_nodes_offset = offset; /* Factor out the root node. 
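 * The root box32 node already lives at bvh_offset above, so only
 * internal_count - 1 internal nodes are stored past this offset.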
*/ offset += sizeof(struct radv_bvh_box32_node) * (internal_count - 1); accel_struct->size = offset; } if (scratch) { radix_sort_vk_memory_requirements_t requirements = { 0, }; if (radv_device_init_accel_struct_build_state(device) == VK_SUCCESS) radix_sort_vk_get_memory_requirements(device->meta_state.accel_struct_build.radix_sort, leaf_count, &requirements); uint32_t offset = 0; uint32_t ploc_scratch_space = 0; uint32_t lbvh_node_space = 0; struct build_config config = build_config(leaf_count, build_info); if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) ploc_scratch_space = DIV_ROUND_UP(leaf_count, PLOC_WORKGROUP_SIZE) * sizeof(struct ploc_prefix_scan_partition); else lbvh_node_space = sizeof(struct lbvh_node_info) * internal_count; scratch->header_offset = offset; offset += sizeof(struct radv_ir_header); scratch->sort_buffer_offset[0] = offset; offset += requirements.keyvals_size; scratch->sort_buffer_offset[1] = offset; offset += requirements.keyvals_size; scratch->sort_internal_offset = offset; /* Internal sorting data is not needed when PLOC/LBVH are invoked, * save space by aliasing them */ scratch->ploc_prefix_sum_partition_offset = offset; scratch->lbvh_node_offset = offset; offset += MAX3(requirements.internal_size, ploc_scratch_space, lbvh_node_space); scratch->ir_offset = offset; offset += sizeof(struct radv_ir_node) * leaf_count; scratch->internal_node_offset = offset; offset += sizeof(struct radv_ir_box_node) * internal_count; scratch->size = offset; if (build_info->type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR) { uint32_t update_offset = 0; update_offset += sizeof(radv_aabb) * leaf_count; scratch->internal_ready_count_offset = update_offset; update_offset += sizeof(uint32_t) * internal_count; scratch->update_size = update_offset; } else { scratch->update_size = offset; } } } VKAPI_ATTR void VKAPI_CALL radv_GetAccelerationStructureBuildSizesKHR(VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, const VkAccelerationStructureBuildGeometryInfoKHR *pBuildInfo, const uint32_t *pMaxPrimitiveCounts, VkAccelerationStructureBuildSizesInfoKHR *pSizeInfo) { VK_FROM_HANDLE(radv_device, device, _device); STATIC_ASSERT(sizeof(struct radv_bvh_triangle_node) == 64); STATIC_ASSERT(sizeof(struct radv_bvh_aabb_node) == 64); STATIC_ASSERT(sizeof(struct radv_bvh_instance_node) == 128); STATIC_ASSERT(sizeof(struct radv_bvh_box16_node) == 64); STATIC_ASSERT(sizeof(struct radv_bvh_box32_node) == 128); uint32_t leaf_count = 0; for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++) leaf_count += pMaxPrimitiveCounts[i]; struct acceleration_structure_layout accel_struct; struct scratch_layout scratch; get_build_layout(device, leaf_count, pBuildInfo, &accel_struct, &scratch); pSizeInfo->accelerationStructureSize = accel_struct.size; pSizeInfo->updateScratchSize = scratch.update_size; pSizeInfo->buildScratchSize = scratch.size; } VKAPI_ATTR VkResult VKAPI_CALL radv_WriteAccelerationStructuresPropertiesKHR(VkDevice _device, uint32_t accelerationStructureCount, const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType, size_t dataSize, void *pData, size_t stride) { unreachable("Unimplemented"); return VK_ERROR_FEATURE_NOT_PRESENT; } VKAPI_ATTR VkResult VKAPI_CALL radv_BuildAccelerationStructuresKHR(VkDevice _device, VkDeferredOperationKHR deferredOperation, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) { unreachable("Unimplemented"); return 
VK_ERROR_FEATURE_NOT_PRESENT; } VKAPI_ATTR VkResult VKAPI_CALL radv_CopyAccelerationStructureKHR(VkDevice _device, VkDeferredOperationKHR deferredOperation, const VkCopyAccelerationStructureInfoKHR *pInfo) { unreachable("Unimplemented"); return VK_ERROR_FEATURE_NOT_PRESENT; } void radv_device_finish_accel_struct_build_state(struct radv_device *device) { VkDevice _device = radv_device_to_handle(device); struct radv_meta_state *state = &device->meta_state; struct vk_device_dispatch_table *dispatch = &device->vk.dispatch_table; dispatch->DestroyPipeline(_device, state->accel_struct_build.copy_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.ploc_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_generate_ir_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.lbvh_main_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.leaf_updateable_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.encode_compact_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.header_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.morton_pipeline, &state->alloc); dispatch->DestroyPipeline(_device, state->accel_struct_build.update_pipeline, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.copy_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.ploc_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_generate_ir_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.lbvh_main_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.leaf_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.encode_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.header_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.morton_p_layout, &state->alloc); radv_DestroyPipelineLayout(_device, state->accel_struct_build.update_p_layout, &state->alloc); if (state->accel_struct_build.radix_sort) radix_sort_vk_destroy(state->accel_struct_build.radix_sort, _device, &state->alloc); radv_DestroyBuffer(_device, state->accel_struct_build.null.buffer, &state->alloc); radv_FreeMemory(_device, state->accel_struct_build.null.memory, &state->alloc); vk_common_DestroyAccelerationStructureKHR(_device, state->accel_struct_build.null.accel_struct, &state->alloc); } static VkResult create_build_pipeline_spv(struct radv_device *device, const uint32_t *spv, uint32_t spv_size, unsigned push_constant_size, VkPipeline *pipeline, VkPipelineLayout *layout) { if (*pipeline) return VK_SUCCESS; VkDevice _device = radv_device_to_handle(device); const VkPipelineLayoutCreateInfo pl_create_info = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .setLayoutCount = 0, .pushConstantRangeCount = 1, .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constant_size}, }; VkShaderModuleCreateInfo module_info = { .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = NULL, .flags = 0, .codeSize = spv_size, .pCode = spv, }; 
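   /* All acceleration structure build shaders are plain compute pipelines that read
    * their inputs from push constants only (setLayoutCount is 0), so the layout above
    * is a single push-constant range. The shader module is transient and is destroyed
    * again at the cleanup label below. */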
VkShaderModule module; VkResult result = device->vk.dispatch_table.CreateShaderModule(_device, &module_info, &device->meta_state.alloc, &module); if (result != VK_SUCCESS) return result; if (!*layout) { result = radv_CreatePipelineLayout(_device, &pl_create_info, &device->meta_state.alloc, layout); if (result != VK_SUCCESS) goto cleanup; } VkPipelineShaderStageCreateInfo shader_stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = module, .pName = "main", .pSpecializationInfo = NULL, }; VkComputePipelineCreateInfo pipeline_info = { .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .stage = shader_stage, .flags = 0, .layout = *layout, }; result = device->vk.dispatch_table.CreateComputePipelines(_device, device->meta_state.cache, 1, &pipeline_info, &device->meta_state.alloc, pipeline); cleanup: device->vk.dispatch_table.DestroyShaderModule(_device, module, &device->meta_state.alloc); return result; } VkResult radv_device_init_null_accel_struct(struct radv_device *device) { const struct radv_physical_device *pdev = radv_device_physical(device); if (pdev->memory_properties.memoryTypeCount == 0) return VK_SUCCESS; /* Exit in the case of null winsys. */ VkDevice _device = radv_device_to_handle(device); uint32_t bvh_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64); uint32_t size = bvh_offset + sizeof(struct radv_bvh_box32_node); VkResult result; VkBuffer buffer = VK_NULL_HANDLE; VkDeviceMemory memory = VK_NULL_HANDLE; VkAccelerationStructureKHR accel_struct = VK_NULL_HANDLE; VkBufferCreateInfo buffer_create_info = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = &(VkBufferUsageFlags2CreateInfoKHR){ .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR, .usage = VK_BUFFER_USAGE_2_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR, }, .size = size, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, }; result = radv_CreateBuffer(_device, &buffer_create_info, &device->meta_state.alloc, &buffer); if (result != VK_SUCCESS) return result; VkBufferMemoryRequirementsInfo2 info = { .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, .buffer = buffer, }; VkMemoryRequirements2 mem_req = { .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, }; vk_common_GetBufferMemoryRequirements2(_device, &info, &mem_req); VkMemoryAllocateInfo alloc_info = { .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, .allocationSize = mem_req.memoryRequirements.size, .memoryTypeIndex = radv_find_memory_index(pdev, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT), }; result = radv_AllocateMemory(_device, &alloc_info, &device->meta_state.alloc, &memory); if (result != VK_SUCCESS) return result; VkBindBufferMemoryInfo bind_info = { .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, .buffer = buffer, .memory = memory, }; result = radv_BindBufferMemory2(_device, 1, &bind_info); if (result != VK_SUCCESS) return result; void *data; result = vk_common_MapMemory(_device, memory, 0, size, 0, &data); if (result != VK_SUCCESS) return result; struct radv_accel_struct_header header = { .bvh_offset = bvh_offset, }; memcpy(data, &header, sizeof(struct radv_accel_struct_header)); struct radv_bvh_box32_node root = { .children = { RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE, RADV_BVH_INVALID_NODE, }, }; for (uint32_t child = 0; child < 4; child++) { root.coords[child] = (radv_aabb){ .min.x = NAN, .min.y = NAN, .min.z = NAN, .max.x = NAN, .max.y = NAN, .max.z = NAN, }; } 
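   /* NaN bounds fail every ray-box comparison, so traversal of this null
    * acceleration structure never descends into any of the (invalid) children. */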
memcpy((uint8_t *)data + bvh_offset, &root, sizeof(struct radv_bvh_box32_node)); vk_common_UnmapMemory(_device, memory); VkAccelerationStructureCreateInfoKHR create_info = { .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR, .buffer = buffer, .size = size, .type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR, }; result = vk_common_CreateAccelerationStructureKHR(_device, &create_info, &device->meta_state.alloc, &accel_struct); if (result != VK_SUCCESS) return result; device->meta_state.accel_struct_build.null.buffer = buffer; device->meta_state.accel_struct_build.null.memory = memory; device->meta_state.accel_struct_build.null.accel_struct = accel_struct; return VK_SUCCESS; } VkResult radv_device_init_accel_struct_build_state(struct radv_device *device) { VkResult result = VK_SUCCESS; mtx_lock(&device->meta_state.mtx); if (device->meta_state.accel_struct_build.radix_sort) goto exit; result = create_build_pipeline_spv(device, leaf_always_active_spv, sizeof(leaf_always_active_spv), sizeof(struct leaf_args), &device->meta_state.accel_struct_build.leaf_updateable_pipeline, &device->meta_state.accel_struct_build.leaf_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, leaf_spv, sizeof(leaf_spv), sizeof(struct leaf_args), &device->meta_state.accel_struct_build.leaf_pipeline, &device->meta_state.accel_struct_build.leaf_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, lbvh_main_spv, sizeof(lbvh_main_spv), sizeof(struct lbvh_main_args), &device->meta_state.accel_struct_build.lbvh_main_pipeline, &device->meta_state.accel_struct_build.lbvh_main_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, lbvh_generate_ir_spv, sizeof(lbvh_generate_ir_spv), sizeof(struct lbvh_generate_ir_args), &device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline, &device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, ploc_spv, sizeof(ploc_spv), sizeof(struct ploc_args), &device->meta_state.accel_struct_build.ploc_pipeline, &device->meta_state.accel_struct_build.ploc_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args), &device->meta_state.accel_struct_build.encode_pipeline, &device->meta_state.accel_struct_build.encode_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, encode_compact_spv, sizeof(encode_compact_spv), sizeof(struct encode_args), &device->meta_state.accel_struct_build.encode_compact_pipeline, &device->meta_state.accel_struct_build.encode_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, header_spv, sizeof(header_spv), sizeof(struct header_args), &device->meta_state.accel_struct_build.header_pipeline, &device->meta_state.accel_struct_build.header_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, morton_spv, sizeof(morton_spv), sizeof(struct morton_args), &device->meta_state.accel_struct_build.morton_pipeline, &device->meta_state.accel_struct_build.morton_p_layout); if (result != VK_SUCCESS) goto exit; result = create_build_pipeline_spv(device, update_spv, sizeof(update_spv), sizeof(struct update_args), &device->meta_state.accel_struct_build.update_pipeline, &device->meta_state.accel_struct_build.update_p_layout); if (result != VK_SUCCESS) goto exit; 
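   /* Create the radix sort instance last: its presence doubles as the
    * "already initialized" check at the top of this function. */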
device->meta_state.accel_struct_build.radix_sort = radv_create_radix_sort_u64(radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache); exit: mtx_unlock(&device->meta_state.mtx); return result; } static VkResult radv_device_init_accel_struct_copy_state(struct radv_device *device) { mtx_lock(&device->meta_state.mtx); VkResult result = create_build_pipeline_spv(device, copy_spv, sizeof(copy_spv), sizeof(struct copy_args), &device->meta_state.accel_struct_build.copy_pipeline, &device->meta_state.accel_struct_build.copy_p_layout); mtx_unlock(&device->meta_state.mtx); return result; } struct bvh_state { uint32_t node_count; uint32_t scratch_offset; uint32_t leaf_node_count; uint32_t internal_node_count; uint32_t leaf_node_size; struct acceleration_structure_layout accel_struct; struct scratch_layout scratch; struct build_config config; /* Radix sort state */ uint32_t scatter_blocks; uint32_t count_ru_scatter; uint32_t histo_blocks; uint32_t count_ru_histo; struct rs_push_scatter push_scatter; }; struct radv_bvh_batch_state { bool any_compact; bool any_non_compact; bool any_ploc; bool any_lbvh; bool any_updateable; bool any_non_updateable; bool any_update; }; static uint32_t pack_geometry_id_and_flags(uint32_t geometry_id, uint32_t flags) { uint32_t geometry_id_and_flags = geometry_id; if (flags & VK_GEOMETRY_OPAQUE_BIT_KHR) geometry_id_and_flags |= RADV_GEOMETRY_OPAQUE; return geometry_id_and_flags; } static struct radv_bvh_geometry_data fill_geometry_data(VkAccelerationStructureTypeKHR type, struct bvh_state *bvh_state, uint32_t geom_index, const VkAccelerationStructureGeometryKHR *geometry, const VkAccelerationStructureBuildRangeInfoKHR *build_range_info) { struct radv_bvh_geometry_data data = { .first_id = bvh_state->node_count, .geometry_id = pack_geometry_id_and_flags(geom_index, geometry->flags), .geometry_type = geometry->geometryType, }; switch (geometry->geometryType) { case VK_GEOMETRY_TYPE_TRIANGLES_KHR: assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); data.data = geometry->geometry.triangles.vertexData.deviceAddress + build_range_info->firstVertex * geometry->geometry.triangles.vertexStride; data.indices = geometry->geometry.triangles.indexData.deviceAddress; if (geometry->geometry.triangles.indexType == VK_INDEX_TYPE_NONE_KHR) data.data += build_range_info->primitiveOffset; else data.indices += build_range_info->primitiveOffset; data.transform = geometry->geometry.triangles.transformData.deviceAddress; if (data.transform) data.transform += build_range_info->transformOffset; data.stride = geometry->geometry.triangles.vertexStride; data.vertex_format = geometry->geometry.triangles.vertexFormat; data.index_format = geometry->geometry.triangles.indexType; break; case VK_GEOMETRY_TYPE_AABBS_KHR: assert(type == VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR); data.data = geometry->geometry.aabbs.data.deviceAddress + build_range_info->primitiveOffset; data.stride = geometry->geometry.aabbs.stride; break; case VK_GEOMETRY_TYPE_INSTANCES_KHR: assert(type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR); data.data = geometry->geometry.instances.data.deviceAddress + build_range_info->primitiveOffset; if (geometry->geometry.instances.arrayOfPointers) data.stride = 8; else data.stride = sizeof(VkAccelerationStructureInstanceKHR); break; default: unreachable("Unknown geometryType"); } return data; } static void build_leaves(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, const 
VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states, bool updateable) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); radv_write_user_event_marker(cmd_buffer, UserEventPush, "leaves"); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, updateable ? device->meta_state.accel_struct_build.leaf_updateable_pipeline : device->meta_state.accel_struct_build.leaf_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; if (bvh_states[i].config.updateable != updateable) continue; VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); struct leaf_args leaf_consts = { .ir = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, .bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.leaf_nodes_offset, .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], }; for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { const VkAccelerationStructureGeometryKHR *geom = pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; leaf_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.leaf_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(leaf_consts), &leaf_consts); radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); bvh_states[i].leaf_node_count += build_range_info->primitiveCount; bvh_states[i].node_count += build_range_info->primitiveCount; } } radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); } static void morton_generate(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, enum radv_cmd_flush_bits flush_bits) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); radv_write_user_event_marker(cmd_buffer, UserEventPush, "morton"); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.morton_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; const struct morton_args consts = { .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, .ids = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0], }; vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.morton_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); radv_unaligned_dispatch(cmd_buffer, bvh_states[i].node_count, 1, 1); } radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); cmd_buffer->state.flush_bits |= flush_bits; } static void morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, enum radv_cmd_flush_bits flush_bits) { /* Copyright 2019 The Fuchsia Authors. 
 */
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);

   radv_write_user_event_marker(cmd_buffer, UserEventPush, "sort");

   radix_sort_vk_t *rs = device->meta_state.accel_struct_build.radix_sort;

   /*
    * OVERVIEW
    *
    *   1. Pad the keyvals in `scatter_even`.
    *   2. Zero the `histograms` and `partitions`.
    *      --- BARRIER ---
    *   3. HISTOGRAM is dispatched before PREFIX.
    *      --- BARRIER ---
    *   4. PREFIX is dispatched before the first SCATTER.
    *      --- BARRIER ---
    *   5. One or more SCATTER dispatches.
    *
    * Note that the `partitions` buffer can be zeroed anytime before the first
    * scatter.
    */

   /* How many passes? */
   uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t);
   uint32_t keyval_bits = keyval_bytes * 8;
   uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits);
   uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2;

   for (uint32_t i = 0; i < infoCount; ++i) {
      if (bvh_states[i].node_count)
         bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1];
      else
         bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0];
   }

   /*
    * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS
    *
    * Pad fractional blocks with max-valued keyvals.
    *
    * Zero the histograms and partitions buffer.
    *
    * This assumes the partitions follow the histograms.
    */

   /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */

   /* How many scatter blocks? */
   uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2;
   uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows;

   /*
    * How many histogram blocks?
    *
    * Note that it's OK to have more max-valued digits counted by the histogram
    * than sorted by the scatters because the sort is stable.
    */
   uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2;
   uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows;

   uint32_t pass_idx = (keyval_bytes - passes);

   for (uint32_t i = 0; i < infoCount; ++i) {
      if (!bvh_states[i].node_count)
         continue;
      if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE)
         continue;

      uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0];
      uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset;

      bvh_states[i].scatter_blocks = (bvh_states[i].node_count + scatter_block_kvs - 1) / scatter_block_kvs;
      bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs;

      bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs;
      bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs;

      /* Fill with max values */
      if (bvh_states[i].count_ru_histo > bvh_states[i].node_count) {
         radv_fill_buffer(cmd_buffer, NULL, NULL, keyvals_even_addr + bvh_states[i].node_count * keyval_bytes,
                          (bvh_states[i].count_ru_histo - bvh_states[i].node_count) * keyval_bytes, 0xFFFFFFFF);
      }

      /*
       * Zero histograms and invalidate partitions.
       *
       * Note that the partition invalidation only needs to be performed once
       * because the even/odd scatter dispatches rely on the previous pass to
       * leave the partitions in an invalid state.
       *
       * Note that the last workgroup doesn't read/write a partition so it doesn't
       * need to be initialized.
*/ uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); radv_fill_buffer(cmd_buffer, NULL, NULL, internal_addr + rs->internal.histograms.offset + fill_base, histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)), 0); } /* * Pipeline: HISTOGRAM * * TODO(allanmac): All subgroups should try to process approximately the same * number of blocks in order to minimize tail effects. This was implemented * and reverted but should be reimplemented and benchmarked later. */ vk_barrier_transfer_w_to_compute_r(commandBuffer); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); for (uint32_t i = 0; i < infoCount; ++i) { if (!bvh_states[i].node_count) continue; if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; /* Dispatch histogram */ struct rs_push_histogram push_histogram = { .devaddr_histograms = internal_addr + rs->internal.histograms.offset, .devaddr_keyvals = keyvals_even_addr, .passes = passes, }; vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_histogram), &push_histogram); vk_common_CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); } /* * Pipeline: PREFIX * * Launch one workgroup per pass. */ vk_barrier_compute_w_to_compute_r(commandBuffer); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); for (uint32_t i = 0; i < infoCount; ++i) { if (!bvh_states[i].node_count) continue; if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; struct rs_push_prefix push_prefix = { .devaddr_histograms = internal_addr + rs->internal.histograms.offset, }; vk_common_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_prefix), &push_prefix); vk_common_CmdDispatch(commandBuffer, passes, 1, 1); } /* Pipeline: SCATTER */ vk_barrier_compute_w_to_compute_r(commandBuffer); uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); for (uint32_t i = 0; i < infoCount; i++) { uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; bvh_states[i].push_scatter = (struct rs_push_scatter){ .devaddr_keyvals_even = keyvals_even_addr, .devaddr_keyvals_odd = keyvals_odd_addr, .devaddr_partitions = internal_addr + rs->internal.partitions.offset, .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, }; } bool is_even = true; while (true) { uint32_t pass_dword = pass_idx / 4; /* Bind new pipeline */ VkPipeline p = is_even ? 
rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); /* Update push constants that changed */ VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even : rs->pipeline_layouts.named.scatter[pass_dword].odd; for (uint32_t i = 0; i < infoCount; i++) { if (!bvh_states[i].node_count) continue; if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; vk_common_CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), &bvh_states[i].push_scatter); vk_common_CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); } /* Continue? */ if (++pass_idx >= keyval_bytes) break; vk_barrier_compute_w_to_compute_r(commandBuffer); is_even ^= true; } radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); cmd_buffer->state.flush_bits |= flush_bits; } static void lbvh_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, enum radv_cmd_flush_bits flush_bits) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); radv_write_user_event_marker(cmd_buffer, UserEventPush, "lbvh"); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.lbvh_main_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) continue; uint32_t src_scratch_offset = bvh_states[i].scratch_offset; uint32_t internal_node_count = MAX2(bvh_states[i].node_count, 2) - 1; const struct lbvh_main_args consts = { .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, .src_ids = pInfos[i].scratchData.deviceAddress + src_scratch_offset, .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, .id_count = bvh_states[i].node_count, .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, }; vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_main_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); radv_unaligned_dispatch(cmd_buffer, internal_node_count, 1, 1); bvh_states[i].node_count = internal_node_count; bvh_states[i].internal_node_count = internal_node_count; } cmd_buffer->state.flush_bits |= flush_bits; device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.lbvh_generate_ir_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_LBVH) continue; const struct lbvh_generate_ir_args consts = { .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, .node_info = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.lbvh_node_offset, .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, .internal_node_base = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, }; vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.lbvh_generate_ir_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, 
sizeof(consts), &consts); radv_unaligned_dispatch(cmd_buffer, bvh_states[i].internal_node_count, 1, 1); } radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); } static void ploc_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); radv_write_user_event_marker(cmd_buffer, UserEventPush, "ploc"); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.ploc_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC) continue; uint32_t src_scratch_offset = bvh_states[i].scratch_offset; uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0]) ? bvh_states[i].scratch.sort_buffer_offset[1] : bvh_states[i].scratch.sort_buffer_offset[0]; const struct ploc_args consts = { .bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, .ids_0 = pInfos[i].scratchData.deviceAddress + src_scratch_offset, .ids_1 = pInfos[i].scratchData.deviceAddress + dst_scratch_offset, .prefix_scan_partitions = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ploc_prefix_sum_partition_offset, .internal_node_offset = bvh_states[i].scratch.internal_node_offset - bvh_states[i].scratch.ir_offset, }; vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.ploc_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); vk_common_CmdDispatch(commandBuffer, MAX2(DIV_ROUND_UP(bvh_states[i].node_count, PLOC_WORKGROUP_SIZE), 1), 1, 1); } radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); } static void encode_nodes(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, bool compact) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); radv_write_user_event_marker(cmd_buffer, UserEventPush, "encode"); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, compact ? device->meta_state.accel_struct_build.encode_compact_pipeline : device->meta_state.accel_struct_build.encode_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { if (compact != bvh_states[i].config.compact) continue; if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); VkGeometryTypeKHR geometry_type = VK_GEOMETRY_TYPE_TRIANGLES_KHR; /* If the geometry count is 0, then the size does not matter * because it will be multiplied with 0. */ if (pInfos[i].geometryCount) geometry_type = pInfos[i].pGeometries ? 
pInfos[i].pGeometries[0].geometryType : pInfos[i].ppGeometries[0]->geometryType; if (bvh_states[i].config.compact) { uint32_t dst_offset = bvh_states[i].accel_struct.internal_nodes_offset - bvh_states[i].accel_struct.bvh_offset; radv_update_buffer_cp(cmd_buffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + offsetof(struct radv_ir_header, dst_node_offset), &dst_offset, sizeof(uint32_t)); } const struct encode_args args = { .intermediate_bvh = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.ir_offset, .output_bvh = vk_acceleration_structure_get_va(accel_struct) + bvh_states[i].accel_struct.bvh_offset, .header = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, .output_bvh_offset = bvh_states[i].accel_struct.bvh_offset, .leaf_node_count = bvh_states[i].leaf_node_count, .geometry_type = geometry_type, }; vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.encode_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); struct radv_dispatch_info dispatch = { .unaligned = true, .ordered = true, .va = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset + offsetof(struct radv_ir_header, ir_internal_node_count), }; radv_compute_dispatch(cmd_buffer, &dispatch); } /* This is the final access to the leaf nodes, no need to flush */ radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); } static void init_header(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, struct radv_bvh_batch_state *batch_state) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); if (batch_state->any_compact) { radv_write_user_event_marker(cmd_buffer, UserEventPush, "header"); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.header_pipeline); } for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); size_t base = offsetof(struct radv_accel_struct_header, compacted_size); uint64_t instance_count = pInfos[i].type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR ? 
bvh_states[i].leaf_node_count : 0; if (bvh_states[i].config.compact) { base = offsetof(struct radv_accel_struct_header, geometry_count); struct header_args args = { .src = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, .dst = vk_acceleration_structure_get_va(accel_struct), .bvh_offset = bvh_states[i].accel_struct.bvh_offset, .instance_count = instance_count, }; vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.header_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(args), &args); radv_unaligned_dispatch(cmd_buffer, 1, 1, 1); } struct radv_accel_struct_header header; header.instance_offset = bvh_states[i].accel_struct.bvh_offset + sizeof(struct radv_bvh_box32_node); header.instance_count = instance_count; header.compacted_size = bvh_states[i].accel_struct.size; header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); header.copy_dispatch_size[1] = 1; header.copy_dispatch_size[2] = 1; header.serialization_size = header.compacted_size + align(sizeof(struct radv_accel_struct_serialization_header) + sizeof(uint64_t) * header.instance_count, 128); header.size = header.serialization_size - sizeof(struct radv_accel_struct_serialization_header) - sizeof(uint64_t) * header.instance_count; header.build_flags = pInfos[i].flags; header.geometry_count = pInfos[i].geometryCount; radv_update_buffer_cp(cmd_buffer, vk_acceleration_structure_get_va(accel_struct) + base, (const char *)&header + base, sizeof(header) - base); } if (batch_state->any_compact) radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); } static void init_geometry_infos(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) { for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) continue; VK_FROM_HANDLE(vk_acceleration_structure, accel_struct, pInfos[i].dstAccelerationStructure); uint64_t geometry_infos_size = pInfos[i].geometryCount * sizeof(struct radv_accel_struct_geometry_info); struct radv_accel_struct_geometry_info *geometry_infos = malloc(geometry_infos_size); if (!geometry_infos) continue; for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { const VkAccelerationStructureGeometryKHR *geometry = pInfos[i].pGeometries ? 
pInfos[i].pGeometries + j : pInfos[i].ppGeometries[j]; geometry_infos[j].type = geometry->geometryType; geometry_infos[j].flags = geometry->flags; geometry_infos[j].primitive_count = ppBuildRangeInfos[i][j].primitiveCount; } radv_CmdUpdateBuffer(commandBuffer, accel_struct->buffer, accel_struct->offset + bvh_states[i].accel_struct.geometry_info_offset, geometry_infos_size, geometry_infos); free(geometry_infos); } } static void update(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos, struct bvh_state *bvh_states) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); radv_write_user_event_marker(cmd_buffer, UserEventPush, "update"); device->vk.dispatch_table.CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.update_pipeline); for (uint32_t i = 0; i < infoCount; ++i) { if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) continue; uint32_t leaf_node_count = 0; for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; } VK_FROM_HANDLE(vk_acceleration_structure, src_bvh, pInfos[i].srcAccelerationStructure); VK_FROM_HANDLE(vk_acceleration_structure, dst_bvh, pInfos[i].dstAccelerationStructure); struct update_args update_consts = { .src = vk_acceleration_structure_get_va(src_bvh), .dst = vk_acceleration_structure_get_va(dst_bvh), .leaf_bounds = pInfos[i].scratchData.deviceAddress, .internal_ready_count = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, .leaf_node_count = leaf_node_count, }; for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { const VkAccelerationStructureGeometryKHR *geom = pInfos[i].pGeometries ? 
&pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; const VkAccelerationStructureBuildRangeInfoKHR *build_range_info = &ppBuildRangeInfos[i][j]; update_consts.geom_data = fill_geometry_data(pInfos[i].type, &bvh_states[i], j, geom, build_range_info); vk_common_CmdPushConstants(commandBuffer, device->meta_state.accel_struct_build.update_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(update_consts), &update_consts); radv_unaligned_dispatch(cmd_buffer, build_range_info->primitiveCount, 1, 1); bvh_states[i].leaf_node_count += build_range_info->primitiveCount; bvh_states[i].node_count += build_range_info->primitiveCount; } } radv_write_user_event_marker(cmd_buffer, UserEventPop, NULL); } VKAPI_ATTR void VKAPI_CALL radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); struct radv_meta_saved_state saved_state; VkResult result = radv_device_init_accel_struct_build_state(device); if (result != VK_SUCCESS) { vk_command_buffer_set_error(&cmd_buffer->vk, result); return; } enum radv_cmd_flush_bits flush_bits = RADV_CMD_FLAG_CS_PARTIAL_FLUSH | radv_src_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL) | radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_SHADER_WRITE_BIT, NULL); radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state)); radv_describe_begin_accel_struct_build(cmd_buffer, infoCount); struct radv_bvh_batch_state batch_state = {0}; for (uint32_t i = 0; i < infoCount; ++i) { uint32_t leaf_node_count = 0; for (uint32_t j = 0; j < pInfos[i].geometryCount; ++j) { leaf_node_count += ppBuildRangeInfos[i][j].primitiveCount; } get_build_layout(device, leaf_node_count, pInfos + i, &bvh_states[i].accel_struct, &bvh_states[i].scratch); struct build_config config = build_config(leaf_node_count, pInfos + i); bvh_states[i].config = config; if (config.compact) batch_state.any_compact = true; else batch_state.any_non_compact = true; if (config.updateable) batch_state.any_updateable = true; else batch_state.any_non_updateable = true; if (config.internal_type == INTERNAL_BUILD_TYPE_PLOC) { batch_state.any_ploc = true; } else if (config.internal_type == INTERNAL_BUILD_TYPE_LBVH) { batch_state.any_lbvh = true; } else if (config.internal_type == INTERNAL_BUILD_TYPE_UPDATE) { batch_state.any_update = true; } else { unreachable("Unknown internal_build_type"); } if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_UPDATE) { /* The internal node count is updated in lbvh_build_internal for LBVH * and from the PLOC shader for PLOC. 
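 * The min_bounds/max_bounds initializers below are INT32_MAX/INT32_MIN; the bounds
 * appear to be kept in an ordered signed-integer encoding so the build shaders can
 * widen them with plain atomic min/max.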
*/ struct radv_ir_header header = { .min_bounds = {0x7fffffff, 0x7fffffff, 0x7fffffff}, .max_bounds = {0x80000000, 0x80000000, 0x80000000}, .dispatch_size_y = 1, .dispatch_size_z = 1, .sync_data = { .current_phase_end_counter = TASK_INDEX_INVALID, /* Will be updated by the first PLOC shader invocation */ .task_counts = {TASK_INDEX_INVALID, TASK_INDEX_INVALID}, }, }; radv_update_buffer_cp(cmd_buffer, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.header_offset, &header, sizeof(header)); } else { /* Prepare ready counts for internal nodes */ radv_fill_buffer(cmd_buffer, NULL, NULL, pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.internal_ready_count_offset, bvh_states[i].scratch.update_size - bvh_states[i].scratch.internal_ready_count_offset, 0x0); if (pInfos[i].srcAccelerationStructure != pInfos[i].dstAccelerationStructure) { VK_FROM_HANDLE(vk_acceleration_structure, src_as, pInfos[i].srcAccelerationStructure); VK_FROM_HANDLE(vk_acceleration_structure, dst_as, pInfos[i].dstAccelerationStructure); VK_FROM_HANDLE(radv_buffer, src_as_buffer, src_as->buffer); VK_FROM_HANDLE(radv_buffer, dst_as_buffer, dst_as->buffer); /* Copy header/metadata */ radv_copy_buffer(cmd_buffer, src_as_buffer->bo, dst_as_buffer->bo, src_as_buffer->offset + src_as->offset, dst_as_buffer->offset + dst_as->offset, bvh_states[i].accel_struct.bvh_offset); } } } cmd_buffer->state.current_event_type = EventInternalUnknown; if (batch_state.any_lbvh || batch_state.any_ploc) { if (batch_state.any_non_updateable) build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, false); if (batch_state.any_updateable) build_leaves(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states, true); cmd_buffer->state.flush_bits |= flush_bits; morton_generate(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); morton_sort(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); cmd_buffer->state.flush_bits |= flush_bits; if (batch_state.any_lbvh) lbvh_build_internal(commandBuffer, infoCount, pInfos, bvh_states, flush_bits); if (batch_state.any_ploc) ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states); cmd_buffer->state.flush_bits |= flush_bits; if (batch_state.any_non_compact) encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, false); if (batch_state.any_compact) encode_nodes(commandBuffer, infoCount, pInfos, bvh_states, true); cmd_buffer->state.flush_bits |= flush_bits; } init_header(commandBuffer, infoCount, pInfos, bvh_states, &batch_state); if (device->rra_trace.accel_structs) init_geometry_infos(commandBuffer, infoCount, pInfos, bvh_states, ppBuildRangeInfos); if (batch_state.any_update) update(commandBuffer, infoCount, pInfos, ppBuildRangeInfos, bvh_states); radv_describe_end_accel_struct_build(cmd_buffer); free(bvh_states); radv_meta_restore(&saved_state, cmd_buffer); } VKAPI_ATTR void VKAPI_CALL radv_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer, const VkCopyAccelerationStructureInfoKHR *pInfo) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); VK_FROM_HANDLE(radv_buffer, src_buffer, src->buffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); struct radv_meta_saved_state saved_state; VkResult result = radv_device_init_accel_struct_copy_state(device); if (result != VK_SUCCESS) { vk_command_buffer_set_error(&cmd_buffer->vk, result); return; } radv_meta_save(&saved_state, cmd_buffer, 
RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.copy_pipeline); struct copy_args consts = { .src_addr = vk_acceleration_structure_get_va(src), .dst_addr = vk_acceleration_structure_get_va(dst), .mode = RADV_COPY_MODE_COPY, }; vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.accel_struct_build.copy_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT, NULL); radv_indirect_dispatch( cmd_buffer, src_buffer->bo, vk_acceleration_structure_get_va(src) + offsetof(struct radv_accel_struct_header, copy_dispatch_size)); radv_meta_restore(&saved_state, cmd_buffer); } VKAPI_ATTR void VKAPI_CALL radv_GetDeviceAccelerationStructureCompatibilityKHR(VkDevice _device, const VkAccelerationStructureVersionInfoKHR *pVersionInfo, VkAccelerationStructureCompatibilityKHR *pCompatibility) { VK_FROM_HANDLE(radv_device, device, _device); const struct radv_physical_device *pdev = radv_device_physical(device); bool compat = memcmp(pVersionInfo->pVersionData, pdev->driver_uuid, VK_UUID_SIZE) == 0 && memcmp(pVersionInfo->pVersionData + VK_UUID_SIZE, pdev->cache_uuid, VK_UUID_SIZE) == 0; *pCompatibility = compat ? VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR : VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR; } VKAPI_ATTR VkResult VKAPI_CALL radv_CopyMemoryToAccelerationStructureKHR(VkDevice _device, VkDeferredOperationKHR deferredOperation, const VkCopyMemoryToAccelerationStructureInfoKHR *pInfo) { unreachable("Unimplemented"); return VK_ERROR_FEATURE_NOT_PRESENT; } VKAPI_ATTR VkResult VKAPI_CALL radv_CopyAccelerationStructureToMemoryKHR(VkDevice _device, VkDeferredOperationKHR deferredOperation, const VkCopyAccelerationStructureToMemoryInfoKHR *pInfo) { unreachable("Unimplemented"); return VK_ERROR_FEATURE_NOT_PRESENT; } VKAPI_ATTR void VKAPI_CALL radv_CmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer, const VkCopyMemoryToAccelerationStructureInfoKHR *pInfo) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); VK_FROM_HANDLE(vk_acceleration_structure, dst, pInfo->dst); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); struct radv_meta_saved_state saved_state; VkResult result = radv_device_init_accel_struct_copy_state(device); if (result != VK_SUCCESS) { vk_command_buffer_set_error(&cmd_buffer->vk, result); return; } radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.copy_pipeline); const struct copy_args consts = { .src_addr = pInfo->src.deviceAddress, .dst_addr = vk_acceleration_structure_get_va(dst), .mode = RADV_COPY_MODE_DESERIALIZE, }; vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.accel_struct_build.copy_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); vk_common_CmdDispatch(commandBuffer, 512, 1, 1); radv_meta_restore(&saved_state, cmd_buffer); } VKAPI_ATTR void VKAPI_CALL radv_CmdCopyAccelerationStructureToMemoryKHR(VkCommandBuffer commandBuffer, const VkCopyAccelerationStructureToMemoryInfoKHR *pInfo) { VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); 
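   /* Serialization runs the copy shader in RADV_COPY_MODE_SERIALIZE via an indirect
    * dispatch sized by the copy_dispatch_size field of the source header, then writes
    * the driver and cache UUIDs to the start of the destination so that
    * radv_GetDeviceAccelerationStructureCompatibilityKHR can validate them later. */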
VK_FROM_HANDLE(vk_acceleration_structure, src, pInfo->src); VK_FROM_HANDLE(radv_buffer, src_buffer, src->buffer); struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); const struct radv_physical_device *pdev = radv_device_physical(device); struct radv_meta_saved_state saved_state; VkResult result = radv_device_init_accel_struct_copy_state(device); if (result != VK_SUCCESS) { vk_command_buffer_set_error(&cmd_buffer->vk, result); return; } radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.accel_struct_build.copy_pipeline); const struct copy_args consts = { .src_addr = vk_acceleration_structure_get_va(src), .dst_addr = pInfo->dst.deviceAddress, .mode = RADV_COPY_MODE_SERIALIZE, }; vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.accel_struct_build.copy_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT, NULL); radv_indirect_dispatch( cmd_buffer, src_buffer->bo, vk_acceleration_structure_get_va(src) + offsetof(struct radv_accel_struct_header, copy_dispatch_size)); radv_meta_restore(&saved_state, cmd_buffer); /* Set the header of the serialized data. */ uint8_t header_data[2 * VK_UUID_SIZE]; memcpy(header_data, pdev->driver_uuid, VK_UUID_SIZE); memcpy(header_data + VK_UUID_SIZE, pdev->cache_uuid, VK_UUID_SIZE); radv_update_buffer_cp(cmd_buffer, pInfo->dst.deviceAddress, header_data, sizeof(header_data)); } VKAPI_ATTR void VKAPI_CALL radv_CmdBuildAccelerationStructuresIndirectKHR(VkCommandBuffer commandBuffer, uint32_t infoCount, const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, const VkDeviceAddress *pIndirectDeviceAddresses, const uint32_t *pIndirectStrides, const uint32_t *const *ppMaxPrimitiveCounts) { unreachable("Unimplemented"); }