v3dv: implement VK_KHR_buffer_device_address

This feature allows shaders to use pointers to buffers which may
not be bound via descriptor sets. Access to these buffers is done
via global intrinsics.

Because the buffers are not accessed through descriptor sets, any
live buffer flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR
can be accessed by any shader using global intrinsics, so the driver
needs to make sure all these buffers are mapped by the kernel when
it submits the job for execution.

We handle this by tracking if any draw call or compute dispatch in
a job uses a pipeline that has any such shaders. If so, the job is
flagged as using buffer device address and the kernel submission
for that job will add all live BOs bound to buffers flagged with the
buffer device address usage flag.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17275>
This commit is contained in:
Iago Toral Quiroga 2022-06-27 14:12:25 +02:00
parent 90054e9c5d
commit bec3c83e19
7 changed files with 147 additions and 6 deletions

View File

@ -447,7 +447,7 @@ Vulkan 1.1 -- all DONE: anv, lvp, radv, tu, vn
Vulkan 1.2 -- all DONE: anv, vn
VK_KHR_8bit_storage DONE (anv/gen8+, lvp, radv, v3dv, vn)
VK_KHR_buffer_device_address DONE (anv/gen8+, lvp, radv, tu, vn)
VK_KHR_buffer_device_address DONE (anv/gen8+, lvp, radv, tu, v3dv, vn)
VK_KHR_create_renderpass2 DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_depth_stencil_resolve DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_draw_indirect_count DONE (anv, lvp, radv, tu, vn)

View File

@ -44,3 +44,6 @@ dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp
dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat
dEQP-VK.ubo.random.all_out_of_order_offsets.45
dEQP-VK.ubo.random.all_shared_buffer.48
dEQP-VK.ssbo.phys.layout.3*
dEQP-VK.ssbo.phys.layout.single_struct_array*
dEQP-VK.ssbo.phys.layout.basic_unsized_array*

View File

@ -2538,6 +2538,10 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
job->draw_count++;
/* Track VK_KHR_buffer_device_address usage in the job */
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
/* If this job is serialized (has consumed a barrier) then check if we need
* to sync at the binning stage by testing if the binning shaders involved
* with the draw call require access to external resources.
@ -2545,7 +2549,6 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access ||
cmd_buffer->state.barrier.bcl_image_access)) {
assert(!job->needs_bcl_sync);
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline,
indexed, indirect)) {
consume_bcl_sync(cmd_buffer, job);
@ -3721,6 +3724,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
wg_uniform_offsets_out);
submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
/* Track VK_KHR_buffer_device_address usage in the job */
job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
v3dv_job_add_bo(job, uniforms.bo);
return job;

View File

@ -116,6 +116,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
.KHR_8bit_storage = true,
.KHR_16bit_storage = true,
.KHR_bind_memory2 = true,
.KHR_buffer_device_address = true,
.KHR_copy_commands2 = true,
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
@ -1203,6 +1204,10 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
.vulkanMemoryModel = true,
.vulkanMemoryModelDeviceScope = true,
.vulkanMemoryModelAvailabilityVisibilityChains = true,
.bufferDeviceAddress = true,
.bufferDeviceAddressCaptureReplay = false,
.bufferDeviceAddressMultiDevice = false,
};
VkPhysicalDeviceVulkan11Features vk11 = {
@ -1975,6 +1980,10 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
device->default_attribute_float =
v3dv_pipeline_create_default_attribute_values(device, NULL);
device->device_address_mem_ctx = ralloc_context(NULL);
util_dynarray_init(&device->device_address_bo_list,
device->device_address_mem_ctx);
*pDevice = v3dv_device_to_handle(device);
return VK_SUCCESS;
@ -2004,6 +2013,8 @@ v3dv_DestroyDevice(VkDevice _device,
device->default_attribute_float = NULL;
}
ralloc_free(device->device_address_mem_ctx);
/* Bo cache should be removed the last, as any other object could be
* freeing their private bos
*/
@ -2203,6 +2214,24 @@ fail_create:
#endif
}
/* Registers a BO in the device-wide list of BOs backing memory allocated
 * with VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR. Jobs that use
 * VK_KHR_buffer_device_address walk this list at submit time to include
 * every such BO in their kernel submission, since these buffers may be
 * accessed via global intrinsics without being bound through descriptors.
 */
static void
device_add_device_address_bo(struct v3dv_device *device,
                             struct v3dv_bo *bo)
{
   util_dynarray_append(&device->device_address_bo_list,
                        struct v3dv_bo *,
                        bo);
}
/* Removes a BO from the device-wide device-address BO list when its memory
 * is freed. Uses the unordered delete since list order is irrelevant here
 * (the list is only iterated in full at job submission).
 */
static void
device_remove_device_address_bo(struct v3dv_device *device,
                                struct v3dv_bo *bo)
{
   util_dynarray_delete_unordered(&device->device_address_bo_list,
                                  struct v3dv_bo *,
                                  bo);
}
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AllocateMemory(VkDevice _device,
const VkMemoryAllocateInfo *pAllocateInfo,
@ -2229,6 +2258,7 @@ v3dv_AllocateMemory(VkDevice _device,
const struct wsi_memory_allocate_info *wsi_info = NULL;
const VkImportMemoryFdInfoKHR *fd_info = NULL;
const VkMemoryAllocateFlagsInfo *flags_info = NULL;
vk_foreach_struct_const(ext, pAllocateInfo->pNext) {
switch ((unsigned)ext->sType) {
case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA:
@ -2238,9 +2268,7 @@ v3dv_AllocateMemory(VkDevice _device,
fd_info = (void *)ext;
break;
case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO:
/* We don't support VK_KHR_buffer_device_address or multiple
* devices per device group, so we can ignore this.
*/
flags_info = (void *)ext;
break;
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO:
/* We don't have particular optimizations associated with memory
@ -2288,6 +2316,20 @@ v3dv_AllocateMemory(VkDevice _device,
return vk_error(device, result);
}
/* If this memory can be used via VK_KHR_buffer_device_address then we
* will need to manually add the BO to any job submit that makes use of
* VK_KHR_buffer_device_address, since such jobs may produce buffer
* load/store operations that may access any buffer memory allocated with
* this flag and we don't have any means to tell which buffers will be
* accessed through this mechanism since they don't even have to be bound
* through descriptor state.
*/
if (flags_info &&
(flags_info->flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR)) {
mem->is_for_device_address = true;
device_add_device_address_bo(device, mem->bo);
}
*pMem = v3dv_device_memory_to_handle(mem);
return result;
}
@ -2306,6 +2348,9 @@ v3dv_FreeMemory(VkDevice _device,
if (mem->bo->map)
v3dv_UnmapMemory(_device, _mem);
if (mem->is_for_device_address)
device_remove_device_address_bo(device, mem->bo);
device_free(device, mem);
vk_object_free(&device->vk, pAllocator, mem);
@ -2844,3 +2889,28 @@ vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion)
*pSupportedVersion = MIN2(*pSupportedVersion, 5u);
return VK_SUCCESS;
}
/* Returns the device address of a buffer for use with
 * VK_KHR_buffer_device_address: the backing BO's offset plus the buffer's
 * offset into the memory it is bound to.
 */
VkDeviceAddress
v3dv_GetBufferDeviceAddress(VkDevice device,
                            const VkBufferDeviceAddressInfoKHR *pInfo)
{
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer);
   return buffer->mem_offset + buffer->mem->bo->offset;
}
/* Opaque capture addresses are only meaningful for capture/replay, which
 * this driver does not support (bufferDeviceAddressCaptureReplay is
 * advertised as false), so returning 0 here is valid.
 */
uint64_t
v3dv_GetBufferOpaqueCaptureAddress(VkDevice device,
                                   const VkBufferDeviceAddressInfoKHR *pInfo)
{
   /* Not implemented */
   return 0;
}
/* As with buffer opaque capture addresses, capture/replay is unsupported
 * (bufferDeviceAddressCaptureReplay is false), so 0 is a valid return.
 */
uint64_t
v3dv_GetDeviceMemoryOpaqueCaptureAddress(
    VkDevice device,
    const VkDeviceMemoryOpaqueCaptureAddressInfoKHR *pInfo)
{
   /* Not implemented */
   return 0;
}

View File

@ -178,10 +178,11 @@ static const struct spirv_to_nir_options default_spirv_options = {
.variable_pointers = true,
.vk_memory_model = true,
.vk_memory_model_device_scope = true,
.physical_storage_buffer_address = true,
},
.ubo_addr_format = nir_address_format_32bit_index_offset,
.ssbo_addr_format = nir_address_format_32bit_index_offset,
.phys_ssbo_addr_format = nir_address_format_64bit_global,
.phys_ssbo_addr_format = nir_address_format_2x32bit_global,
.push_const_addr_format = nir_address_format_logical,
.shared_addr_format = nir_address_format_32bit_offset,
};
@ -405,6 +406,10 @@ preprocess_nir(nir_shader *nir)
nir_var_mem_ubo | nir_var_mem_ssbo,
nir_address_format_32bit_index_offset);
NIR_PASS_V(nir, nir_lower_explicit_io,
nir_var_mem_global,
nir_address_format_2x32bit_global);
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
/* Lower a bunch of stuff */
@ -2320,6 +2325,20 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
return true;
}
static void
pipeline_check_buffer_device_address(struct v3dv_pipeline *pipeline)
{
for (int i = BROADCOM_SHADER_VERTEX; i < BROADCOM_SHADER_STAGES; i++) {
struct v3dv_shader_variant *variant = pipeline->shared_data->variants[i];
if (variant && variant->prog_data.base->has_global_address) {
pipeline->uses_buffer_device_address = true;
return;
}
}
pipeline->uses_buffer_device_address = false;
}
/*
* It compiles a pipeline. Note that it also allocate internal object, but if
* some allocations success, but other fails, the method is not freeing the
@ -2557,6 +2576,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
success:
pipeline_check_buffer_device_address(pipeline);
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
write_creation_feedback(pipeline,
pCreateInfo->pNext,
@ -3220,6 +3241,8 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline,
success:
pipeline_check_buffer_device_address(pipeline);
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
write_creation_feedback(pipeline,
info->pNext,

View File

@ -513,6 +513,9 @@ struct v3dv_device {
struct v3dv_bo *default_attribute_float;
VkPhysicalDeviceFeatures features;
void *device_address_mem_ctx;
struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */
#ifdef ANDROID
const void *gralloc;
enum {
@ -529,6 +532,7 @@ struct v3dv_device_memory {
struct v3dv_bo *bo;
const VkMemoryType *type;
bool is_for_wsi;
bool is_for_device_address;
};
#define V3D_OUTPUT_IMAGE_FORMAT_NO 255
@ -1059,6 +1063,15 @@ struct v3dv_job {
/* If the job executes on the transfer stage of the pipeline */
bool is_transfer;
/* VK_KHR_buffer_device_address allows shaders to use pointers that can
* dereference memory in any buffer that has been flagged with
* VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR. These buffers may not
* be bound via descriptor sets, so we need to make sure that a job that
* uses this functionality includes all these buffers in its kernel
* submission.
*/
bool uses_buffer_device_address;
enum v3dv_job_type type;
struct v3dv_device *device;
@ -1951,6 +1964,9 @@ struct v3dv_pipeline {
/* Flags for whether optional pipeline stages are present, for convenience */
bool has_gs;
/* Whether any stage in this pipeline uses VK_KHR_buffer_device_address */
bool uses_buffer_device_address;
/* Spilling memory requirements */
struct {
struct v3dv_bo *bo;

View File

@ -770,6 +770,17 @@ handle_cl_job(struct v3dv_queue *queue,
if (job->tmu_dirty_rcl)
submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
/* If the job uses VK_KHR_buffer_device_address we need to ensure all
* buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR
* are included.
*/
if (job->uses_buffer_device_address) {
util_dynarray_foreach(&queue->device->device_address_bo_list,
struct v3dv_bo *, bo) {
v3dv_job_add_bo(job, *bo);
}
}
submit.bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
@ -913,6 +924,17 @@ handle_csd_job(struct v3dv_queue *queue,
struct drm_v3d_submit_csd *submit = &job->csd.submit;
/* If the job uses VK_KHR_buffer_device_address we need to ensure all
* buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR
* are included.
*/
if (job->uses_buffer_device_address) {
util_dynarray_foreach(&queue->device->device_address_bo_list,
struct v3dv_bo *, bo) {
v3dv_job_add_bo(job, *bo);
}
}
submit->bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));