diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build
index 0b6fd62c38f..d9b5631777a 100644
--- a/src/freedreno/vulkan/meson.build
+++ b/src/freedreno/vulkan/meson.build
@@ -48,6 +48,7 @@ libtu_files = files(
   'tu_private.h',
   'tu_query.c',
   'tu_shader.c',
+  'tu_suballoc.c',
   'tu_util.c',
   'tu_util.h',
   'tu_perfetto.h',
diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c
index bf1c07a8a7d..042d0f06d5e 100644
--- a/src/freedreno/vulkan/tu_autotune.c
+++ b/src/freedreno/vulkan/tu_autotune.c
@@ -711,4 +711,4 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
 
    tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
    tu_cs_emit(cs, ZPASS_DONE);
-}
\ No newline at end of file
+}
diff --git a/src/freedreno/vulkan/tu_cs.c b/src/freedreno/vulkan/tu_cs.c
index c95ffdcfaab..73b80758ca9 100644
--- a/src/freedreno/vulkan/tu_cs.c
+++ b/src/freedreno/vulkan/tu_cs.c
@@ -56,6 +56,25 @@ tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
    cs->end = end;
 }
 
+/**
+ * Initialize a sub-command stream as a wrapper to an externally sub-allocated
+ * buffer.
+ */
+void
+tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
+                    struct tu_suballoc_bo *suballoc_bo)
+{
+   uint32_t *start = tu_suballoc_bo_map(suballoc_bo);
+   uint32_t *end = start + (suballoc_bo->size >> 2);
+
+   memset(cs, 0, sizeof(*cs));
+   cs->device = device;
+   cs->mode = TU_CS_MODE_SUB_STREAM;
+   cs->start = cs->reserved_end = cs->cur = start;
+   cs->end = end;
+   cs->refcount_bo = tu_bo_get_ref(suballoc_bo->bo);
+}
+
 /**
  * Finish and release all resources owned by a command stream.
  */
@@ -66,10 +85,24 @@ tu_cs_finish(struct tu_cs *cs)
       tu_bo_finish(cs->device, cs->bos[i]);
    }
 
+   if (cs->refcount_bo)
+      tu_bo_finish(cs->device, cs->refcount_bo);
+
    free(cs->entries);
    free(cs->bos);
 }
 
+static struct tu_bo *
+tu_cs_current_bo(const struct tu_cs *cs)
+{
+   if (cs->refcount_bo) {
+      return cs->refcount_bo;
+   } else {
+      assert(cs->bo_count);
+      return cs->bos[cs->bo_count - 1];
+   }
+}
+
 /**
  * Get the offset of the command packets emitted since the last call to
  * tu_cs_add_entry.
@@ -77,8 +110,7 @@ tu_cs_finish(struct tu_cs *cs)
 static uint32_t
 tu_cs_get_offset(const struct tu_cs *cs)
 {
-   assert(cs->bo_count);
-   return cs->start - (uint32_t *) cs->bos[cs->bo_count - 1]->map;
+   return cs->start - (uint32_t *) tu_cs_current_bo(cs)->map;
 }
 
 /*
@@ -90,6 +122,8 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size)
 {
    /* no BO for TU_CS_MODE_EXTERNAL */
    assert(cs->mode != TU_CS_MODE_EXTERNAL);
+   /* No adding more BOs if suballocating from a suballoc_bo. */
+   assert(!cs->refcount_bo);
 
    /* no dangling command packet */
    assert(tu_cs_is_empty(cs));
@@ -176,7 +210,7 @@ tu_cs_add_entry(struct tu_cs *cs)
 
    /* add an entry for [cs->start, cs->cur] */
    cs->entries[cs->entry_count++] = (struct tu_cs_entry) {
-      .bo = cs->bos[cs->bo_count - 1],
+      .bo = tu_cs_current_bo(cs),
       .size = tu_cs_get_size(cs) * sizeof(uint32_t),
       .offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
    };
@@ -281,7 +315,7 @@ tu_cs_alloc(struct tu_cs *cs,
    if (result != VK_SUCCESS)
       return result;
 
-   struct tu_bo *bo = cs->bos[cs->bo_count - 1];
+   struct tu_bo *bo = tu_cs_current_bo(cs);
    size_t offset = align(tu_cs_get_offset(cs), size);
 
    memory->map = bo->map + offset * sizeof(uint32_t);
@@ -303,7 +337,6 @@ struct tu_cs_entry
 tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
 {
    assert(cs->mode == TU_CS_MODE_SUB_STREAM);
-   assert(cs->bo_count);
    assert(sub_cs->start == cs->cur && sub_cs->end == cs->reserved_end);
 
    tu_cs_sanity_check(sub_cs);
@@ -312,7 +345,7 @@ tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
    cs->cur = sub_cs->cur;
 
    struct tu_cs_entry entry = {
-      .bo = cs->bos[cs->bo_count - 1],
+      .bo = tu_cs_current_bo(cs),
       .size = tu_cs_get_size(cs) * sizeof(uint32_t),
       .offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
    };
@@ -397,7 +430,7 @@ void
 tu_cs_reset(struct tu_cs *cs)
 {
    if (cs->mode == TU_CS_MODE_EXTERNAL) {
-      assert(!cs->bo_count && !cs->entry_count);
+      assert(!cs->bo_count && !cs->refcount_bo && !cs->entry_count);
       cs->reserved_end = cs->cur = cs->start;
       return;
    }
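Note (illustrative, not part of the patch): the suballoc path replaces the tu_cs_init() + tu_cs_reserve_space() pair with a CS wrapping memory it does not own. A minimal sketch of how such a CS is driven, assuming a tu_suballoc_bo already filled in by tu_suballoc_bo_alloc(); REG_EXAMPLE is a placeholder register offset and error handling is elided:

   struct tu_cs cs;
   tu_cs_init_suballoc(&cs, dev, &pipeline->bo);

   struct tu_cs sub_cs;
   tu_cs_begin_sub_stream(&cs, 4, &sub_cs);
   tu_cs_emit_pkt4(&sub_cs, REG_EXAMPLE, 1);
   tu_cs_emit(&sub_cs, 0);

   /* The entry's .bo resolves to cs.refcount_bo via tu_cs_current_bo();
    * tu_cs_add_bo() can never be reached on this stream, hence the new
    * assert above. */
   struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs);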
diff --git a/src/freedreno/vulkan/tu_cs.h b/src/freedreno/vulkan/tu_cs.h
index 494d9d8fcb5..6e830a907db 100644
--- a/src/freedreno/vulkan/tu_cs.h
+++ b/src/freedreno/vulkan/tu_cs.h
@@ -39,6 +39,10 @@ void
 tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
                     uint32_t *start, uint32_t *end);
 
+void
+tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
+                    struct tu_suballoc_bo *bo);
+
 void
 tu_cs_finish(struct tu_cs *cs);
 
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index b12f49e2465..e2d22892fe2 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -1728,6 +1728,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    device->vk.check_status = tu_device_check_status;
 
    mtx_init(&device->bo_mutex, mtx_plain);
+   mtx_init(&device->pipeline_mutex, mtx_plain);
    u_rwlock_init(&device->dma_bo_lock);
    pthread_mutex_init(&device->submit_mutex, NULL);
 
@@ -1786,6 +1787,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    if (custom_border_colors)
       global_size += TU_BORDER_COLOR_COUNT * sizeof(struct bcolor_entry);
 
+   tu_bo_suballocator_init(&device->pipeline_suballoc, device,
+                           128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
+
    result = tu_bo_init_new(device, &device->global_bo, global_size,
                            TU_BO_ALLOC_ALLOW_DUMP);
    if (result != VK_SUCCESS) {
@@ -1987,6 +1991,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    tu_autotune_fini(&device->autotune, device);
 
+   tu_bo_suballocator_finish(&device->pipeline_suballoc);
+
    util_sparse_array_finish(&device->bo_map);
    u_rwlock_destroy(&device->dma_bo_lock);
 
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index d8cdd1969a5..c3bae9582b9 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -2257,6 +2257,7 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
                         struct tu_pipeline *pipeline,
                         struct tu_pipeline_layout *layout,
                         struct tu_pipeline_builder *builder,
+                        struct tu_pipeline_cache *cache,
                         struct ir3_shader_variant *compute)
 {
    uint32_t size = 2048 + tu6_load_state_size(pipeline, layout, compute);
@@ -2292,13 +2293,27 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
       size += tu_xs_get_additional_cs_size_dwords(compute);
    }
 
-   tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
+   /* Allocate the space for the pipeline out of the device's RO suballocator.
+    *
+    * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
+    * BOs at exec time.
+    *
+    * The pipeline cache would seem like a natural place to stick the
+    * suballocator, except that it is not guaranteed to outlive the pipelines
+    * created from it, so you can't store any long-lived state there, and you
+    * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
+    * pipeline destroy isn't synchronized by the cache.
+    */
+   pthread_mutex_lock(&dev->pipeline_mutex);
+   VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
+                                          size * 4, 128);
+   pthread_mutex_unlock(&dev->pipeline_mutex);
+   if (result != VK_SUCCESS)
+      return result;
 
-   /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
-    * that LOAD_STATE can potentially take up a large amount of space so we
-    * calculate its size explicitly.
-    */
-   return tu_cs_reserve_space(&pipeline->cs, size);
+   tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
+
+   return VK_SUCCESS;
 }
 
 static void
@@ -3259,6 +3274,9 @@ tu_pipeline_finish(struct tu_pipeline *pipeline,
                    const VkAllocationCallbacks *alloc)
 {
    tu_cs_finish(&pipeline->cs);
+   pthread_mutex_lock(&dev->pipeline_mutex);
+   tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
+   pthread_mutex_unlock(&dev->pipeline_mutex);
 
    if (pipeline->pvtmem_bo)
       tu_bo_finish(dev, pipeline->pvtmem_bo);
@@ -3288,7 +3306,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
    }
 
    result = tu_pipeline_allocate_cs(builder->device, *pipeline,
-                                    builder->layout, builder, NULL);
+                                    builder->layout, builder, builder->cache, NULL);
    if (result != VK_SUCCESS) {
       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
       return result;
@@ -3340,11 +3358,6 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
    tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
    tu6_emit_load_state(*pipeline, builder->layout, false);
 
-   /* we should have reserved enough space upfront such that the CS never
-    * grows
-    */
-   assert((*pipeline)->cs.bo_count == 1);
-
    return VK_SUCCESS;
 }
 
@@ -3492,12 +3505,13 @@ tu_CreateGraphicsPipelines(VkDevice device,
 
 static VkResult
 tu_compute_pipeline_create(VkDevice device,
-                           VkPipelineCache _cache,
+                           VkPipelineCache pipelineCache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipeline)
 {
    TU_FROM_HANDLE(tu_device, dev, device);
+   TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
    TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
    VkResult result;
@@ -3545,7 +3559,7 @@ tu_compute_pipeline_create(VkDevice device,
    tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
                            shader, v);
 
-   result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v);
+   result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, cache, v);
    if (result != VK_SUCCESS)
       goto fail;
 
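Note (illustrative, not part of the patch): the locking contract here is that every alloc/free on dev->pipeline_suballoc happens under dev->pipeline_mutex, while the per-BO reference taken by the resulting tu_suballoc_bo is atomic and needs no lock. A hypothetical second consumer of the suballocator would follow the same pattern ("my_state_size" is made up; the 128-byte alignment mirrors the pipeline path):

   struct tu_suballoc_bo suballoc_bo;

   pthread_mutex_lock(&dev->pipeline_mutex);
   VkResult result = tu_suballoc_bo_alloc(&suballoc_bo, &dev->pipeline_suballoc,
                                          my_state_size, 128);
   pthread_mutex_unlock(&dev->pipeline_mutex);
   if (result != VK_SUCCESS)
      return result;

   /* ... fill the memory through tu_suballoc_bo_map(&suballoc_bo) and point
    * the GPU at suballoc_bo.iova ... */

   pthread_mutex_lock(&dev->pipeline_mutex);
   tu_suballoc_bo_free(&dev->pipeline_suballoc, &suballoc_bo);
   pthread_mutex_unlock(&dev->pipeline_mutex);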
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 39c1e695f7f..b992c381cfd 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -297,6 +297,13 @@ bool
 tu_physical_device_extension_supported(struct tu_physical_device *dev,
                                        const char *name);
 
+enum tu_bo_alloc_flags
+{
+   TU_BO_ALLOC_NO_FLAGS = 0,
+   TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
+   TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
+};
+
 struct cache_entry;
 
 struct tu_pipeline_cache
@@ -379,6 +386,47 @@ struct tu_bo
    bool implicit_sync : 1;
 };
 
+/* externally-synchronized BO suballocator. */
+struct tu_suballocator
+{
+   struct tu_device *dev;
+
+   uint32_t default_size;
+   enum tu_bo_alloc_flags flags;
+
+   /** Current BO we're suballocating out of. */
+   struct tu_bo *bo;
+   uint32_t next_offset;
+
+   /** Optional BO cached for recycling as the next suballoc->bo, instead
+    *  of having to allocate one. */
+   struct tu_bo *cached_bo;
+};
+
+struct tu_suballoc_bo
+{
+   struct tu_bo *bo;
+   uint64_t iova;
+   uint32_t size; /* bytes */
+};
+
+void
+tu_bo_suballocator_init(struct tu_suballocator *suballoc,
+                        struct tu_device *dev,
+                        uint32_t default_size, uint32_t flags);
+void
+tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
+
+VkResult
+tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
+                     uint32_t size, uint32_t align);
+
+void *
+tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
+
+void
+tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
+
 enum global_shader {
    GLOBAL_SH_VS_BLIT,
    GLOBAL_SH_VS_CLEAR,
@@ -460,6 +508,12 @@ struct tu_device
 
    uint32_t implicit_sync_bo_count;
 
+   /* Device-global BO suballocator for reducing BO management overhead for
+    * (read-only) pipeline state. Synchronized by pipeline_mutex.
+    */
+   struct tu_suballocator pipeline_suballoc;
+   mtx_t pipeline_mutex;
+
    /* the blob seems to always use 8K factor and 128K param sizes, copy them */
 #define TU_TESS_FACTOR_SIZE (8 * 1024)
 #define TU_TESS_PARAM_SIZE (128 * 1024)
@@ -495,8 +549,9 @@ struct tu_device
     * freed. Otherwise, if we are not self-importing, we get two different BO
     * handles, and we want to free each one individually.
     *
-    * The BOs in this map all have a refcnt with the reference counter and
-    * only self-imported BOs will ever have a refcnt > 1.
+    * The refcount is also useful for being able to maintain BOs across
+    * VK object lifetimes, such as pipelines suballocating out of BOs
+    * allocated on the device.
     */
    struct util_sparse_array bo_map;
 
@@ -545,13 +600,6 @@ tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
 VkResult
 tu_device_check_status(struct vk_device *vk_device);
 
-enum tu_bo_alloc_flags
-{
-   TU_BO_ALLOC_NO_FLAGS = 0,
-   TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
-   TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
-};
-
 VkResult
 tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
                enum tu_bo_alloc_flags flags);
@@ -573,6 +621,13 @@ tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
    return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
 }
 
+static inline struct tu_bo *
+tu_bo_get_ref(struct tu_bo *bo)
+{
+   p_atomic_inc(&bo->refcnt);
+   return bo;
+}
+
 /* Get a scratch bo for use inside a command buffer. This will always return
  * the same bo given the same size or similar sizes, so only one scratch bo
  * can be used at the same time. It's meant for short-lived things where we
@@ -695,6 +750,9 @@ struct tu_cs
    uint32_t bo_count;
    uint32_t bo_capacity;
 
+   /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
+   struct tu_bo *refcount_bo;
+
    /* state for cond_exec_start/cond_exec_end */
    uint32_t cond_flags;
    uint32_t *cond_dwords;
@@ -1312,6 +1370,7 @@ struct tu_pipeline
    struct vk_object_base base;
 
    struct tu_cs cs;
+   struct tu_suballoc_bo bo;
 
    /* Separate BO for private memory since it should GPU writable */
    struct tu_bo *pvtmem_bo;
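Note (illustrative, not part of the patch): the bo_map comment change above is the key lifetime point. Each tu_suballoc_bo takes its own atomic reference on the underlying tu_bo via tu_bo_get_ref(), which is what lets a pipeline outlive the suballocator's interest in that BO. A sketch of the reference flow (locking elided; see the tu_pipeline.c hunks):

   struct tu_suballoc_bo pipeline_bo;
   tu_suballoc_bo_alloc(&pipeline_bo, &dev->pipeline_suballoc, 4096, 128);
   /* refcnt == 2: one ref held by dev->pipeline_suballoc.bo, one by
    * pipeline_bo. */

   /* Later the suballocator fills up and drops its ref (refcnt -> 1); the
    * pipeline's CS contents stay valid because pipeline_bo still holds its
    * reference. */

   tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline_bo);
   /* Last ref gone: the BO is either stashed as cached_bo for reuse or freed. */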
diff --git a/src/freedreno/vulkan/tu_suballoc.c b/src/freedreno/vulkan/tu_suballoc.c
new file mode 100644
index 00000000000..d61edf66ab4
--- /dev/null
+++ b/src/freedreno/vulkan/tu_suballoc.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2022 Google LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Suballocator for space within BOs.
+ *
+ * BOs are allocated at PAGE_SIZE (typically 4k) granularity, so small
+ * allocations are a waste to have in their own BO. Moreover, on DRM we track a
+ * list of all BOs currently allocated and submit the whole list for validation
+ * (busy tracking and implicit sync) on every submit, and that validation is a
+ * non-trivial cost. So, being able to pack multiple allocations into a BO can
+ * be a significant performance win.
+ *
+ * The allocator tracks a current BO it is linearly allocating from, and up to
+ * one extra BO returned to the pool when all of its previous suballocations
+ * have been freed. This means that fragmentation can be an issue for
+ * default_size > PAGE_SIZE and small allocations. Also, excessive BO
+ * reallocation may happen for workloads where default_size < working set size.
+ */
+
+#include "tu_private.h"
+
+/* Initializes a BO sub-allocator using refcounts on BOs. */
+void
+tu_bo_suballocator_init(struct tu_suballocator *suballoc,
+                        struct tu_device *dev,
+                        uint32_t default_size, uint32_t flags)
+{
+   suballoc->dev = dev;
+   suballoc->default_size = default_size;
+   suballoc->flags = flags;
+   suballoc->bo = NULL;
+   suballoc->cached_bo = NULL;
+}
+
+void
+tu_bo_suballocator_finish(struct tu_suballocator *suballoc)
+{
+   if (suballoc->bo)
+      tu_bo_finish(suballoc->dev, suballoc->bo);
+   if (suballoc->cached_bo)
+      tu_bo_finish(suballoc->dev, suballoc->cached_bo);
+}
+
+VkResult
+tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo,
+                     struct tu_suballocator *suballoc,
+                     uint32_t size, uint32_t align)
+{
+   struct tu_bo *bo = suballoc->bo;
+   if (bo) {
+      uint32_t offset = ALIGN(suballoc->next_offset, align);
+      if (offset + size <= bo->size) {
+         suballoc_bo->bo = tu_bo_get_ref(bo);
+         suballoc_bo->iova = bo->iova + offset;
+         suballoc_bo->size = size;
+
+         suballoc->next_offset = offset + size;
+         return VK_SUCCESS;
+      } else {
+         tu_bo_finish(suballoc->dev, bo);
+         suballoc->bo = NULL;
+      }
+   }
+
+   uint32_t alloc_size = MAX2(size, suballoc->default_size);
+
+   /* Reuse a recycled suballoc BO if we have one and it's big enough,
+    * otherwise free it.
+    */
+   if (suballoc->cached_bo) {
+      if (alloc_size <= suballoc->cached_bo->size)
+         suballoc->bo = suballoc->cached_bo;
+      else
+         tu_bo_finish(suballoc->dev, suballoc->cached_bo);
+      suballoc->cached_bo = NULL;
+   }
+
+   /* Allocate the new BO if we didn't have one cached. */
+   if (!suballoc->bo) {
+      VkResult result = tu_bo_init_new(suballoc->dev, &suballoc->bo,
+                                       alloc_size,
+                                       suballoc->flags);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   VkResult result = tu_bo_map(suballoc->dev, suballoc->bo);
+   if (result != VK_SUCCESS) {
+      tu_bo_finish(suballoc->dev, suballoc->bo);
+      suballoc->bo = NULL;
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   suballoc_bo->bo = tu_bo_get_ref(suballoc->bo);
+   suballoc_bo->iova = suballoc_bo->bo->iova;
+   suballoc_bo->size = size;
+
+   suballoc->next_offset = size;
+
+   return VK_SUCCESS;
+}
+
+void
+tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo)
+{
+   if (!bo->bo)
+      return;
+
+   /* If we held the last reference to this BO, just move it to the
+    * suballocator for the next time we need to allocate.
+    */
+   if (p_atomic_read(&bo->bo->refcnt) == 1 && !suballoc->cached_bo) {
+      suballoc->cached_bo = bo->bo;
+      return;
+   }
+
+   /* Otherwise, drop the refcount on it normally. */
+   tu_bo_finish(suballoc->dev, bo->bo);
+}
+
+void *
+tu_suballoc_bo_map(struct tu_suballoc_bo *bo)
+{
+   return bo->bo->map + (bo->iova - bo->bo->iova);
+}
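Note (illustrative, not part of the patch): a concrete walk-through of the recycling path at the end of the file, assuming a fresh suballocator with default_size = 64 KiB and a single, externally synchronized consumer ("suballoc", "a", and "b" are made up):

   struct tu_suballoc_bo a, b;

   tu_suballoc_bo_alloc(&a, &suballoc, 64 * 1024, 4); /* fills BO #1 exactly */
   tu_suballoc_bo_alloc(&b, &suballoc, 4096, 4);      /* no room: BO #1 is
                                                       * retired, BO #2 allocated */

   tu_suballoc_bo_free(&suballoc, &a); /* last ref to BO #1: it becomes cached_bo */
   tu_suballoc_bo_free(&suballoc, &b); /* BO #2 still held by suballoc->bo, so
                                        * only the refcount drops */

   /* The next time suballoc->bo fills up, BO #1 is reused as the new current
    * BO (if the needed size fits) instead of allocating a fresh one. */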