turnip: Move autotune buffers to suballoc.
Now the ANGLE trex_200 trace replay does a single BO allocation at startup for autotune results instead of one per frame (~350 for the whole replay).

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15038>
This commit is contained in:
parent
7c636acd53
commit
835704e669
|
@ -57,6 +57,9 @@
|
|||
* time, so in most cases there will be no locking.
|
||||
*/
|
||||
|
||||
void
|
||||
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);
|
||||
|
||||
#define TU_AUTOTUNE_DEBUG_LOG 0
|
||||
/* Dump history entries on autotuner finish,
|
||||
* could be used to gather data from traces.
|
||||
|
@ -68,7 +71,6 @@
|
|||
/* For how many submissions we store renderpass stats. */
|
||||
#define MAX_HISTORY_LIFETIME 128
|
||||
|
||||
#define TU_AUTOTUNE_RP_BO_SIZE 4096
|
||||
|
||||
/**
|
||||
* Tracks results for a given renderpass key
|
||||
|
@ -88,62 +90,12 @@ struct tu_renderpass_history {
|
|||
uint32_t avg_samples;
|
||||
};
|
||||
|
||||
/* Reference-counted container for the BOs that hold renderpass sample
 * results. Created by tu_autotune_results_buffer_create() and released
 * through tu_autotune_results_buffer_unref().
 */
struct tu_autotune_results_buffer
|
||||
{
|
||||
/* Reference count; set to 1 on creation and changed with
 * p_atomic_inc / p_atomic_dec_zero.
 */
int32_t ref_cnt;
|
||||
|
||||
/* Device handed to tu_bo_init_new / tu_bo_map / tu_bo_finish for the BOs below. */
struct tu_device *device;
|
||||
|
||||
/* TODO: It would be better to suballocate the space from
|
||||
* a memory pool which would create less BOs and waste less space.
|
||||
*/
|
||||
/* Array of num_bos BO pointers, grown with reralloc and released with ralloc_free. */
struct tu_bo **bos;
|
||||
uint32_t num_bos;
|
||||
/* Count of results recorded so far; used to derive the slot offset
 * inside the newest BO.
 */
uint32_t results_written;
|
||||
};
|
||||
|
||||
/* Allocates a results buffer with malloc and initialises it with a single
 * reference, no BOs and zero results written. `dev` is stored for later BO
 * operations.
 *
 * NOTE(review): the malloc result is dereferenced without a NULL check.
 */
static struct tu_autotune_results_buffer*
|
||||
tu_autotune_results_buffer_create(struct tu_device *dev)
|
||||
{
|
||||
struct tu_autotune_results_buffer* buffer =
|
||||
malloc(sizeof(struct tu_autotune_results_buffer));
|
||||
|
||||
buffer->ref_cnt = 1;
|
||||
buffer->device = dev;
|
||||
buffer->results_written = 0;
|
||||
buffer->num_bos = 0;
|
||||
buffer->bos = NULL;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/* Takes an additional reference on `buffer`. The buffer must be non-NULL and
 * already hold at least one reference (checked with assert).
 */
void
|
||||
tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer)
|
||||
{
|
||||
assert(buffer && buffer->ref_cnt >= 1);
|
||||
p_atomic_inc(&buffer->ref_cnt);
|
||||
}
|
||||
|
||||
/* Drops one reference on `buffer`. When the count reaches zero, every BO is
 * released with tu_bo_finish, the BO pointer array is released with
 * ralloc_free and the struct itself with free.
 */
void
|
||||
tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer)
|
||||
{
|
||||
assert(buffer && buffer->ref_cnt >= 1);
|
||||
if (p_atomic_dec_zero(&buffer->ref_cnt)) {
|
||||
/* NOTE(review): `i` is a signed int compared against the uint32_t num_bos. */
for (int i = 0; i < buffer->num_bos; i++)
|
||||
tu_bo_finish(buffer->device, buffer->bos[i]);
|
||||
|
||||
ralloc_free(buffer->bos);
|
||||
free(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
/* Holds per-submission cs which writes the fence. */
|
||||
struct tu_submission_data {
|
||||
/* List linkage; unlinked with list_del in free_submission_data(). */
struct list_head node;
|
||||
uint32_t fence;
|
||||
|
||||
/* Command stream finished with tu_cs_finish in free_submission_data(). */
struct tu_cs fence_cs;
|
||||
/* Array of buffers_count results buffers; free_submission_data() unrefs
 * each entry and then frees the array.
 */
struct tu_autotune_results_buffer **buffers;
|
||||
uint32_t buffers_count;
|
||||
};
|
||||
|
||||
|
@ -175,11 +127,7 @@ free_submission_data(struct tu_submission_data *data)
|
|||
{
|
||||
list_del(&data->node);
|
||||
tu_cs_finish(&data->fence_cs);
|
||||
for (uint32_t i = 0; i < data->buffers_count; i++) {
|
||||
tu_autotune_results_buffer_unref(data->buffers[i]);
|
||||
}
|
||||
|
||||
free(data->buffers);
|
||||
free(data);
|
||||
}
|
||||
|
||||
|
@ -220,16 +168,17 @@ hash_renderpass_instance(const struct tu_render_pass *pass,
|
|||
}
|
||||
|
||||
static void
|
||||
free_result(struct tu_renderpass_result *result)
|
||||
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
|
||||
{
|
||||
tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
|
||||
list_del(&result->node);
|
||||
free(result);
|
||||
}
|
||||
|
||||
static void
|
||||
free_history(struct tu_renderpass_history *history)
|
||||
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
|
||||
{
|
||||
tu_autotune_free_results(&history->results);
|
||||
tu_autotune_free_results_locked(dev, &history->results);
|
||||
free(history);
|
||||
}
|
||||
|
||||
|
@ -266,7 +215,7 @@ create_history_result(struct tu_autotune *at, uint64_t rp_key)
|
|||
}
|
||||
|
||||
static void
|
||||
history_add_result(struct tu_renderpass_history *history,
|
||||
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
|
||||
struct tu_renderpass_result *result)
|
||||
{
|
||||
list_delinit(&result->node);
|
||||
|
@ -280,7 +229,9 @@ history_add_result(struct tu_renderpass_history *history,
|
|||
*/
|
||||
struct tu_renderpass_result *old_result =
|
||||
list_last_entry(&history->results, struct tu_renderpass_result, node);
|
||||
free_result(old_result);
|
||||
mtx_lock(&dev->autotune_mutex);
|
||||
free_result(dev, old_result);
|
||||
mtx_unlock(&dev->autotune_mutex);
|
||||
}
|
||||
|
||||
/* Do calculations here to avoid locking history in tu_autotune_use_bypass */
|
||||
|
@ -297,7 +248,8 @@ history_add_result(struct tu_renderpass_history *history,
|
|||
static void
|
||||
process_results(struct tu_autotune *at)
|
||||
{
|
||||
struct tu6_global *global = at->device->global_bo->map;
|
||||
struct tu_device *dev = at->device;
|
||||
struct tu6_global *global = dev->global_bo->map;
|
||||
uint32_t current_fence = global->autotune_fence;
|
||||
|
||||
list_for_each_entry_safe(struct tu_renderpass_result, result,
|
||||
|
@ -309,7 +261,7 @@ process_results(struct tu_autotune *at)
|
|||
result->samples_passed =
|
||||
result->samples->samples_end - result->samples->samples_start;
|
||||
|
||||
history_add_result(history, result);
|
||||
history_add_result(dev, history, result);
|
||||
}
|
||||
|
||||
list_for_each_entry_safe(struct tu_submission_data, submission_data,
|
||||
|
@ -338,6 +290,7 @@ queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
|
|||
/* TODO: copying each result isn't nice */
|
||||
struct tu_renderpass_result *copy = malloc(sizeof(*result));
|
||||
*copy = *result;
|
||||
tu_bo_get_ref(copy->bo.bo);
|
||||
list_addtail(&copy->node, &at->pending_results);
|
||||
}
|
||||
}
|
||||
|
@ -393,19 +346,13 @@ tu_autotune_on_submit(struct tu_device *dev,
|
|||
struct tu_submission_data *submission_data =
|
||||
create_submission_data(dev, at);
|
||||
submission_data->buffers_count = result_buffers;
|
||||
submission_data->buffers =
|
||||
malloc(sizeof(struct tu_autotune_results_buffer *) * result_buffers);
|
||||
|
||||
uint32_t buffer_idx = 0;
|
||||
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
|
||||
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
|
||||
if (list_is_empty(&cmdbuf->renderpass_autotune_results))
|
||||
continue;
|
||||
|
||||
queue_pending_results(at, cmdbuf);
|
||||
|
||||
submission_data->buffers[buffer_idx++] = cmdbuf->autotune_buffer;
|
||||
tu_autotune_results_buffer_ref(cmdbuf->autotune_buffer);
|
||||
}
|
||||
|
||||
#if TU_AUTOTUNE_DEBUG_LOG != 0
|
||||
|
@ -430,7 +377,9 @@ tu_autotune_on_submit(struct tu_device *dev,
|
|||
_mesa_hash_table_remove_key(at->ht, &history->key);
|
||||
u_rwlock_wrunlock(&at->ht_lock);
|
||||
|
||||
free_history(history);
|
||||
mtx_lock(&dev->autotune_mutex);
|
||||
free_history(dev, history);
|
||||
mtx_unlock(&dev->autotune_mutex);
|
||||
}
|
||||
|
||||
return &submission_data->fence_cs;
|
||||
|
@ -480,12 +429,14 @@ tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
|
|||
}
|
||||
#endif
|
||||
|
||||
tu_autotune_free_results(&at->pending_results);
|
||||
tu_autotune_free_results(dev, &at->pending_results);
|
||||
|
||||
mtx_lock(&dev->autotune_mutex);
|
||||
hash_table_foreach(at->ht, entry) {
|
||||
struct tu_renderpass_history *history = entry->data;
|
||||
free_history(history);
|
||||
free_history(dev, history);
|
||||
}
|
||||
mtx_unlock(&dev->autotune_mutex);
|
||||
|
||||
list_for_each_entry_safe(struct tu_submission_data, submission_data,
|
||||
&at->pending_submission_data, node) {
|
||||
|
@ -510,14 +461,22 @@ tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
|
|||
}
|
||||
|
||||
void
|
||||
tu_autotune_free_results(struct list_head *results)
|
||||
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
|
||||
{
|
||||
list_for_each_entry_safe(struct tu_renderpass_result, result,
|
||||
results, node) {
|
||||
free_result(result);
|
||||
free_result(dev, result);
|
||||
}
|
||||
}
|
||||
|
||||
/* Locking wrapper around tu_autotune_free_results_locked(): acquires
 * dev->autotune_mutex, frees the results on the `results` list, then
 * releases the mutex.
 */
void
|
||||
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
|
||||
{
|
||||
mtx_lock(&dev->autotune_mutex);
|
||||
tu_autotune_free_results_locked(dev, results);
|
||||
mtx_unlock(&dev->autotune_mutex);
|
||||
}
|
||||
|
||||
static bool
|
||||
fallback_use_bypass(const struct tu_render_pass *pass,
|
||||
const struct tu_framebuffer *framebuffer,
|
||||
|
@ -624,32 +583,6 @@ tu_autotune_use_bypass(struct tu_autotune *at,
|
|||
return fallback_use_bypass(pass, framebuffer, cmd_buffer);
|
||||
}
|
||||
|
||||
/* Returns the byte offset, within a TU_AUTOTUNE_RP_BO_SIZE-sized BO, of the
 * slot for the next result:
 * (results_written mod results-per-BO) * sizeof(struct tu_renderpass_samples).
 */
static uint32_t
|
||||
get_offset_for_renderpass(struct tu_autotune_results_buffer *buffer)
|
||||
{
|
||||
/* How many tu_renderpass_samples entries fit in one BO. */
uint32_t results_per_bo =
|
||||
TU_AUTOTUNE_RP_BO_SIZE / sizeof(struct tu_renderpass_samples);
|
||||
return (buffer->results_written % results_per_bo) *
|
||||
sizeof(struct tu_renderpass_samples);
|
||||
}
|
||||
|
||||
/* Returns the last BO in buffer->bos. If the current slot offset is 0 (which
 * includes the very first result), the array is first grown by one entry and
 * a new BO of TU_AUTOTUNE_RP_BO_SIZE bytes is created and mapped.
 *
 * NOTE(review): the results of reralloc, tu_bo_init_new and tu_bo_map are
 * not checked here.
 */
static struct tu_bo *
|
||||
get_bo_for_renderpass(struct tu_autotune_results_buffer *buffer)
|
||||
{
|
||||
/* Offset 0 means the previous BO (if any) is full. */
if (get_offset_for_renderpass(buffer) == 0) {
|
||||
buffer->num_bos++;
|
||||
buffer->bos =
|
||||
reralloc(NULL, buffer->bos, struct tu_bo *, buffer->num_bos);
|
||||
struct tu_bo **new_bo = &buffer->bos[buffer->num_bos - 1];
|
||||
|
||||
tu_bo_init_new(buffer->device, new_bo, TU_AUTOTUNE_RP_BO_SIZE,
|
||||
TU_BO_ALLOC_NO_FLAGS);
|
||||
tu_bo_map(buffer->device, *new_bo);
|
||||
}
|
||||
|
||||
return buffer->bos[buffer->num_bos - 1];
|
||||
}
|
||||
|
||||
void
|
||||
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
|
||||
struct tu_cs *cs,
|
||||
|
@ -658,21 +591,21 @@ tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
|
|||
if (!autotune_result)
|
||||
return;
|
||||
|
||||
/* Lazily allocate memory for renderpass results.
|
||||
* Secondary command buffers do not support renderpasses.
|
||||
*/
|
||||
assert(cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
|
||||
if (!cmd->autotune_buffer) {
|
||||
cmd->autotune_buffer = tu_autotune_results_buffer_create(cmd->device);
|
||||
struct tu_device *dev = cmd->device;
|
||||
|
||||
static const uint32_t size = sizeof(struct tu_renderpass_samples);
|
||||
|
||||
mtx_lock(&dev->autotune_mutex);
|
||||
VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
|
||||
mtx_unlock(&dev->autotune_mutex);
|
||||
if (ret != VK_SUCCESS) {
|
||||
autotune_result->bo.iova = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
|
||||
struct tu_bo *bo = get_bo_for_renderpass(cmd->autotune_buffer);
|
||||
uint64_t result_iova = autotune_result->bo.iova;
|
||||
|
||||
uint64_t result_iova = bo->iova + bo_offset;
|
||||
|
||||
autotune_result->samples =
|
||||
(struct tu_renderpass_samples *) (bo->map + bo_offset);
|
||||
autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);
|
||||
|
||||
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
|
||||
|
||||
|
@ -689,11 +622,10 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
|
|||
if (!autotune_result)
|
||||
return;
|
||||
|
||||
uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
|
||||
struct tu_bo *bo = cmd->autotune_buffer->bos[cmd->autotune_buffer->num_bos - 1];
|
||||
cmd->autotune_buffer->results_written += 1;
|
||||
if (!autotune_result->bo.iova)
|
||||
return;
|
||||
|
||||
uint64_t result_iova = bo->iova + bo_offset +
|
||||
uint64_t result_iova = autotune_result->bo.iova +
|
||||
offsetof(struct tu_renderpass_samples, samples_end);
|
||||
|
||||
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
|
||||
|
|
|
@ -32,6 +32,7 @@ struct tu_device;
|
|||
struct tu_cmd_buffer;
|
||||
|
||||
struct tu_renderpass_history;
|
||||
struct tu_renderpass_result;
|
||||
|
||||
/**
|
||||
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
|
||||
|
@ -111,32 +112,13 @@ struct tu_renderpass_samples {
|
|||
uint64_t __pad1;
|
||||
};
|
||||
|
||||
/**
|
||||
* Tracks the results from an individual renderpass. Initially created
|
||||
* per renderpass, and appended to the tail of at->pending_results. At a later
|
||||
* time, when the GPU has finished writing the results, we fill samples_passed.
|
||||
*/
|
||||
struct tu_renderpass_result {
|
||||
/* Points into GPU memory */
|
||||
struct tu_renderpass_samples* samples;
|
||||
|
||||
/*
|
||||
* Below here, only used internally within autotune
|
||||
*/
|
||||
uint64_t rp_key;
|
||||
struct tu_renderpass_history *history;
|
||||
/* List linkage for the result lists. */
struct list_head node;
|
||||
uint32_t fence;
|
||||
/* Filled in later as samples->samples_end - samples->samples_start. */
uint64_t samples_passed;
|
||||
};
|
||||
|
||||
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
|
||||
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
|
||||
|
||||
bool tu_autotune_use_bypass(struct tu_autotune *at,
|
||||
struct tu_cmd_buffer *cmd_buffer,
|
||||
struct tu_renderpass_result **autotune_result);
|
||||
void tu_autotune_free_results(struct list_head *results);
|
||||
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
|
||||
|
||||
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
|
||||
uint32_t cmd_buffer_count);
|
||||
|
@ -152,9 +134,6 @@ struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
|
|||
|
||||
struct tu_autotune_results_buffer;
|
||||
|
||||
void tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer);
|
||||
void tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer);
|
||||
|
||||
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
|
||||
struct tu_cs *cs,
|
||||
struct tu_renderpass_result *autotune_result);
|
||||
|
|
|
@ -1514,9 +1514,7 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
|
|||
|
||||
u_trace_fini(&cmd_buffer->trace);
|
||||
|
||||
if (cmd_buffer->autotune_buffer)
|
||||
tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
|
||||
tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
|
||||
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
|
||||
|
||||
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
|
||||
if (cmd_buffer->descriptors[i].push_set.layout)
|
||||
|
@ -1542,16 +1540,7 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
|
|||
tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
|
||||
tu_cs_reset(&cmd_buffer->sub_cs);
|
||||
|
||||
/* We can't just reset the autotune_buffer's contents, because it is also
|
||||
* referenced by the submission_data if the command buffer was submitted
|
||||
* and we may be accessing it after cmdbuf reset/free.
|
||||
*/
|
||||
if (cmd_buffer->autotune_buffer) {
|
||||
tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
|
||||
cmd_buffer->autotune_buffer = NULL;
|
||||
}
|
||||
|
||||
tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
|
||||
tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
|
||||
|
||||
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
|
||||
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
|
||||
|
|
|
@ -1729,6 +1729,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
|||
|
||||
mtx_init(&device->bo_mutex, mtx_plain);
|
||||
mtx_init(&device->pipeline_mutex, mtx_plain);
|
||||
mtx_init(&device->autotune_mutex, mtx_plain);
|
||||
u_rwlock_init(&device->dma_bo_lock);
|
||||
pthread_mutex_init(&device->submit_mutex, NULL);
|
||||
|
||||
|
@ -1789,6 +1790,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
|||
|
||||
tu_bo_suballocator_init(&device->pipeline_suballoc, device,
|
||||
128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
|
||||
tu_bo_suballocator_init(&device->autotune_suballoc, device,
|
||||
128 * 1024, 0);
|
||||
|
||||
result = tu_bo_init_new(device, &device->global_bo, global_size,
|
||||
TU_BO_ALLOC_ALLOW_DUMP);
|
||||
|
@ -1992,6 +1995,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
|
|||
tu_autotune_fini(&device->autotune, device);
|
||||
|
||||
tu_bo_suballocator_finish(&device->pipeline_suballoc);
|
||||
tu_bo_suballocator_finish(&device->autotune_suballoc);
|
||||
|
||||
util_sparse_array_finish(&device->bo_map);
|
||||
u_rwlock_destroy(&device->dma_bo_lock);
|
||||
|
|
|
@ -438,6 +438,27 @@ enum global_shader {
|
|||
GLOBAL_SH_COUNT,
|
||||
};
|
||||
|
||||
/**
|
||||
* Tracks the results from an individual renderpass. Initially created
|
||||
* per renderpass, and appended to the tail of at->pending_results. At a later
|
||||
* time, when the GPU has finished writing the results, we fill samples_passed.
|
||||
*/
|
||||
struct tu_renderpass_result {
|
||||
/* Points into GPU memory */
|
||||
struct tu_renderpass_samples* samples;
|
||||
|
||||
/* Suballocation taken from dev->autotune_suballoc that backs `samples`;
 * released with tu_suballoc_bo_free. bo.iova is set to 0 when the
 * allocation fails, and a zero iova makes tu_autotune_end_renderpass
 * return early.
 */
struct tu_suballoc_bo bo;
|
||||
|
||||
/*
|
||||
* Below here, only used internally within autotune
|
||||
*/
|
||||
uint64_t rp_key;
|
||||
struct tu_renderpass_history *history;
|
||||
/* List linkage for the result lists. */
struct list_head node;
|
||||
uint32_t fence;
|
||||
/* Filled in later as samples->samples_end - samples->samples_start. */
uint64_t samples_passed;
|
||||
};
|
||||
|
||||
#define TU_BORDER_COLOR_COUNT 4096
|
||||
#define TU_BORDER_COLOR_BUILTIN 6
|
||||
|
||||
|
@ -514,6 +535,12 @@ struct tu_device
|
|||
struct tu_suballocator pipeline_suballoc;
|
||||
mtx_t pipeline_mutex;
|
||||
|
||||
/* Device-global BO suballocator for reducing BO management for small
|
||||
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
|
||||
*/
|
||||
struct tu_suballocator autotune_suballoc;
|
||||
mtx_t autotune_mutex;
|
||||
|
||||
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
|
||||
#define TU_TESS_FACTOR_SIZE (8 * 1024)
|
||||
#define TU_TESS_PARAM_SIZE (128 * 1024)
|
||||
|
|
Loading…
Reference in New Issue