tu: Add a "scratch bo" allocation mechanism

This is simpler than a full-blown memory reuse mechanism, but is good
enough to make sure that repeatedly doing a copy that requires the
linear staging buffer workaround won't use excessive memory or be slowed
down due to repeated allocations.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5007>
Connor Abbott 2020-05-11 18:46:04 +02:00 committed by Marge Bot
parent 7ce527a4fe
commit ed79f805fa
2 changed files with 74 additions and 0 deletions

src/freedreno/vulkan/tu_device.c

@@ -39,6 +39,7 @@
#include "compiler/glsl_types.h"
#include "util/debug.h"
#include "util/disk_cache.h"
#include "util/u_atomic.h"
#include "vk_format.h"
#include "vk_util.h"
@@ -1256,6 +1257,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   device->mem_cache = tu_pipeline_cache_from_handle(pc);

   for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
      mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);

   *pDevice = tu_device_to_handle(device);
   return VK_SUCCESS;
@@ -1302,6 +1306,11 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
      vk_free(&device->alloc, device->queues[i]);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) {
      if (device->scratch_bos[i].initialized)
         tu_bo_finish(device, &device->scratch_bos[i].bo);
   }

   /* the compiler does not use pAllocator */
   ralloc_free(device->compiler);
@@ -1311,6 +1320,51 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
   vk_free(&device->alloc, device);
}

VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo)
{
   unsigned size_log2 = MAX2(util_logbase2_ceil64(size), MIN_SCRATCH_BO_SIZE_LOG2);
   unsigned index = size_log2 - MIN_SCRATCH_BO_SIZE_LOG2;
   assert(index < ARRAY_SIZE(dev->scratch_bos));

   for (unsigned i = index; i < ARRAY_SIZE(dev->scratch_bos); i++) {
      if (p_atomic_read(&dev->scratch_bos[i].initialized)) {
         /* Fast path: just return the already-allocated BO. */
         *bo = &dev->scratch_bos[i].bo;
         return VK_SUCCESS;
      }
   }
   /* Slow path: actually allocate the BO. We take a lock so that two
    * threads racing here don't both allocate the same BO, and we use a
    * sleeping mutex rather than spinning because allocation is slow and
    * waiting threads shouldn't burn CPU until it finishes.
    */
   mtx_lock(&dev->scratch_bos[index].construct_mtx);

   /* Another thread may have allocated it already while we were waiting on
    * the lock. We need to check this in order to avoid double-allocating.
    */
   if (dev->scratch_bos[index].initialized) {
      mtx_unlock(&dev->scratch_bos[index].construct_mtx);
      *bo = &dev->scratch_bos[index].bo;
      return VK_SUCCESS;
   }

   unsigned bo_size = 1ull << size_log2;
   VkResult result = tu_bo_init_new(dev, &dev->scratch_bos[index].bo, bo_size);
   if (result != VK_SUCCESS) {
      mtx_unlock(&dev->scratch_bos[index].construct_mtx);
      return result;
   }

   p_atomic_set(&dev->scratch_bos[index].initialized, true);
   mtx_unlock(&dev->scratch_bos[index].construct_mtx);

   *bo = &dev->scratch_bos[index].bo;
   return VK_SUCCESS;
}

VkResult
tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount,
                                    VkLayerProperties *pProperties)
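
The function above is an instance of double-checked locking: a lock-free fast path reads an atomic flag, and the slow path re-checks the flag under a per-bucket mutex so each BO is constructed exactly once. Below is a minimal self-contained sketch of the same pattern, using C11 atomics and threads in place of mesa's p_atomic_*/mtx wrappers; all names in it are illustrative and not part of this commit.

#include <stdatomic.h>
#include <stdbool.h>
#include <threads.h>

/* Stand-in for a lazily constructed resource such as struct tu_bo. */
struct lazy_slot {
   mtx_t construct_mtx;
   atomic_bool initialized;
   int resource;
};

static int *
lazy_get(struct lazy_slot *slot)
{
   /* Fast path: lock-free check. Safe because "initialized" is only set
    * to true after the resource is fully constructed. */
   if (atomic_load(&slot->initialized))
      return &slot->resource;

   mtx_lock(&slot->construct_mtx);

   /* Re-check under the lock: another thread may have constructed the
    * resource while we were waiting, and it must not be built twice. */
   if (!atomic_load(&slot->initialized)) {
      slot->resource = 42; /* construct the resource here */
      atomic_store(&slot->initialized, true);
   }

   mtx_unlock(&slot->construct_mtx);
   return &slot->resource;
}

Publishing the flag only after construction completes is what makes the unlocked fast path safe.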

src/freedreno/vulkan/tu_private.h

@@ -509,6 +509,17 @@ struct tu_device
   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_bo border_color;

   struct list_head shader_slabs;
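
The array above holds one slot per power-of-two size class, from one page (2^12 bytes) up to the 48-bit address-space limit. For intuition, here is a self-contained sketch of the bucket math that tu_get_scratch_bo performs, with logbase2_ceil64 standing in for mesa's util_logbase2_ceil64:

#include <stdint.h>
#include <stdio.h>

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

/* Illustrative stand-in for mesa's util_logbase2_ceil64(). */
static unsigned
logbase2_ceil64(uint64_t n)
{
   unsigned log2 = 0;
   while ((1ull << log2) < n)
      log2++;
   return log2;
}

int
main(void)
{
   uint64_t sizes[] = { 1, 4096, 5000, 1ull << 20 };
   for (unsigned i = 0; i < 4; i++) {
      unsigned size_log2 = logbase2_ceil64(sizes[i]);
      if (size_log2 < MIN_SCRATCH_BO_SIZE_LOG2)
         size_log2 = MIN_SCRATCH_BO_SIZE_LOG2;
      /* e.g. a 5000-byte request rounds up to the 8 KiB bucket (index 1) */
      printf("size %8llu -> bucket %2u (%llu bytes)\n",
             (unsigned long long)sizes[i],
             size_log2 - MIN_SCRATCH_BO_SIZE_LOG2,
             1ull << size_log2);
   }
   return 0;
}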
@@ -531,6 +542,15 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);

struct tu_cs_entry
{
   /* No ownership */
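
To make the contract in the comment above concrete, a hypothetical caller might look like the sketch below. fill_scratch_with_zeros is invented for illustration and assumes tu_bo exposes map and iova fields as declared in this header; tu_get_scratch_bo and tu_bo_map are the entry points this commit adds.

#include <string.h>

/* Hypothetical sketch of a caller inside a command-buffer path. The
 * scratch BO is shared by every caller, so its contents must be written,
 * consumed, and discarded before anything else asks for scratch memory.
 */
static VkResult
fill_scratch_with_zeros(struct tu_device *dev, uint64_t size)
{
   struct tu_bo *scratch;
   VkResult result = tu_get_scratch_bo(dev, size, &scratch);
   if (result != VK_SUCCESS)
      return result;

   /* Map the BO so the CPU can write the short-lived data. */
   result = tu_bo_map(dev, scratch);
   if (result != VK_SUCCESS)
      return result;

   memset(scratch->map, 0, size);

   /* ...emit GPU work that reads scratch->iova here, then treat the
    * contents as discarded... */
   return VK_SUCCESS;
}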