/* mesa/src/freedreno/drm/freedreno_ringbuffer_sp.c */

/*
 * Copyright (C) 2018 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include <assert.h>
#include <inttypes.h>
#include <pthread.h>

#include "util/hash_table.h"
#include "util/os_file.h"
#include "util/slab.h"

#include "freedreno_ringbuffer_sp.h"

/* A "softpin" implementation of submit/ringbuffer, which lowers CPU overhead
 * by avoiding the additional tracking necessary to build cmds/relocs tables
 * (but still builds a bos table)
 */

#define INIT_SIZE 0x1000

#define SUBALLOC_SIZE (32 * 1024)

/* In the pipe->flush() path, we don't have a util_queue_fence we can wait on,
 * instead use a condition-variable. Note that pipe->flush() is not expected
 * to be a common/hot path.
 */
static pthread_cond_t flush_cnd = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t flush_mtx = PTHREAD_MUTEX_INITIALIZER;

static void finalize_current_cmd(struct fd_ringbuffer *ring);
static struct fd_ringbuffer *
fd_ringbuffer_sp_init(struct fd_ringbuffer_sp *fd_ring, uint32_t size,
                      enum fd_ringbuffer_flags flags);

/* add (if needed) bo to submit and return index: */
uint32_t
fd_submit_append_bo(struct fd_submit_sp *submit, struct fd_bo *bo)
{
   uint32_t idx;

   /* NOTE: it is legal to use the same bo on different threads for
    * different submits. But it is not legal to use the same submit
    * from different threads.
    */
   idx = READ_ONCE(bo->idx);

   if (unlikely((idx >= submit->nr_bos) || (submit->bos[idx] != bo))) {
      uint32_t hash = _mesa_hash_pointer(bo);
      struct hash_entry *entry;

      entry = _mesa_hash_table_search_pre_hashed(submit->bo_table, hash, bo);
      if (entry) {
         /* found */
         idx = (uint32_t)(uintptr_t)entry->data;
      } else {
         idx = APPEND(submit, bos, fd_bo_ref(bo));

         _mesa_hash_table_insert_pre_hashed(submit->bo_table, hash, bo,
                                            (void *)(uintptr_t)idx);
      }
      bo->idx = idx;
   }

   return idx;
}
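
/* Suballocate a ringbuffer for FD_RINGBUFFER_STREAMING from the submit's
 * current suballoc BO if there is room, otherwise replace it with a fresh
 * SUBALLOC_SIZE BO.  The submit keeps a ref to the last suballoc ring so
 * consecutive streaming rings can pack into a single BO.
 */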
static void
fd_submit_suballoc_ring_bo(struct fd_submit *submit,
                           struct fd_ringbuffer_sp *fd_ring, uint32_t size)
{
   struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit);
   unsigned suballoc_offset = 0;
   struct fd_bo *suballoc_bo = NULL;

   if (fd_submit->suballoc_ring) {
      struct fd_ringbuffer_sp *suballoc_ring =
         to_fd_ringbuffer_sp(fd_submit->suballoc_ring);

      suballoc_bo = suballoc_ring->ring_bo;
      suballoc_offset =
         fd_ringbuffer_size(fd_submit->suballoc_ring) + suballoc_ring->offset;

      suballoc_offset = align(suballoc_offset, 0x10);

      if ((size + suballoc_offset) > suballoc_bo->size) {
         suballoc_bo = NULL;
      }
   }

   if (!suballoc_bo) {
      // TODO possibly larger size for streaming bo?
      fd_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, SUBALLOC_SIZE);
      fd_ring->offset = 0;
   } else {
      fd_ring->ring_bo = fd_bo_ref(suballoc_bo);
      fd_ring->offset = suballoc_offset;
   }

   struct fd_ringbuffer *old_suballoc_ring = fd_submit->suballoc_ring;

   fd_submit->suballoc_ring = fd_ringbuffer_ref(&fd_ring->base);

   if (old_suballoc_ring)
      fd_ringbuffer_del(old_suballoc_ring);
}
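
/* Allocate a new ringbuffer for the submit.  Streaming rings are suballocated
 * (see above); growable rings start at INIT_SIZE and get a new BO from
 * fd_ringbuffer_sp_grow() when they fill up; everything else gets its own BO
 * of the requested size.
 */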
static struct fd_ringbuffer *
fd_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size,
                            enum fd_ringbuffer_flags flags)
{
   struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit);
   struct fd_ringbuffer_sp *fd_ring;

   fd_ring = slab_alloc(&fd_submit->ring_pool);

   fd_ring->u.submit = submit;

   /* NOTE: needs to be before _suballoc_ring_bo() since it could
    * increment the refcnt of the current ring
    */
   fd_ring->base.refcnt = 1;

   if (flags & FD_RINGBUFFER_STREAMING) {
      fd_submit_suballoc_ring_bo(submit, fd_ring, size);
   } else {
      if (flags & FD_RINGBUFFER_GROWABLE)
         size = INIT_SIZE;

      fd_ring->offset = 0;
      fd_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, size);
   }

   if (!fd_ringbuffer_sp_init(fd_ring, size, flags))
      return NULL;

   return &fd_ring->base;
}

/**
 * Prepare submit for flush, always done synchronously.
 *
 * 1) Finalize primary ringbuffer, at this point no more cmdstream may
 *    be written into it, since from the PoV of the upper level driver
 *    the submit is flushed, even if deferred
 * 2) Add cmdstream bos to bos table
 * 3) Update bo fences
 */
static bool
fd_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd,
                        struct fd_submit_fence *out_fence)
{
   struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit);
   bool has_shared = false;

   finalize_current_cmd(submit->primary);

   struct fd_ringbuffer_sp *primary =
      to_fd_ringbuffer_sp(submit->primary);

   for (unsigned i = 0; i < primary->u.nr_cmds; i++)
      fd_submit_append_bo(fd_submit, primary->u.cmds[i].ring_bo);

   simple_mtx_lock(&table_lock);
   for (unsigned i = 0; i < fd_submit->nr_bos; i++) {
      fd_bo_add_fence(fd_submit->bos[i], submit->pipe, submit->fence);
      has_shared |= fd_submit->bos[i]->shared;
   }
   simple_mtx_unlock(&table_lock);

   fd_submit->out_fence = out_fence;
   fd_submit->in_fence_fd = (in_fence_fd == -1) ?
         -1 : os_dupfd_cloexec(in_fence_fd);

   return has_shared;
}
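
/* Runs on the submit_queue thread: hands the accumulated submit list to the
 * backend's flush_submit_list() callback, then publishes the new
 * last_submit_fence under flush_mtx so fd_pipe_sp_flush() waiters can wake.
 */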
static void
fd_submit_sp_flush_execute(void *job, void *gdata, int thread_index)
{
   struct fd_submit *submit = job;
   struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit);
   struct fd_pipe *pipe = submit->pipe;

   fd_submit->flush_submit_list(&fd_submit->submit_list);

   pthread_mutex_lock(&flush_mtx);
   assert(fd_fence_before(pipe->last_submit_fence, fd_submit->base.fence));
   pipe->last_submit_fence = fd_submit->base.fence;
   pthread_cond_broadcast(&flush_cnd);
   pthread_mutex_unlock(&flush_mtx);

   DEBUG_MSG("finish: %u", submit->fence);
}

static void
fd_submit_sp_flush_cleanup(void *job, void *gdata, int thread_index)
{
   struct fd_submit *submit = job;
   fd_submit_del(submit);
}
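
/* Move the deferred submit list onto the last submit in that list and queue
 * it for the device's submit_queue thread.  The queue fence comes from the
 * caller's out_fence when one was requested, otherwise from the submit's own
 * util_queue_fence.
 */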
static int
enqueue_submit_list(struct list_head *submit_list)
{
   struct fd_submit *submit = last_submit(submit_list);
   struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit);

   list_replace(submit_list, &fd_submit->submit_list);
   list_inithead(submit_list);

   struct util_queue_fence *fence;
   if (fd_submit->out_fence) {
      fence = &fd_submit->out_fence->ready;
   } else {
      util_queue_fence_init(&fd_submit->fence);
      fence = &fd_submit->fence;
   }

   DEBUG_MSG("enqueue: %u", submit->fence);

   util_queue_add_job(&submit->pipe->dev->submit_queue,
                      submit, fence,
                      fd_submit_sp_flush_execute,
                      fd_submit_sp_flush_cleanup,
                      0);

   return 0;
}
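
/* Heuristic for whether a submit is worth deferring (and potentially merging
 * with later submits) rather than being flushed to the kernel immediately.
 */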
static bool
should_defer(struct fd_submit *submit)
{
   struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit);

   /* if too many bo's, it may not be worth the CPU cost of submit merging: */
   if (fd_submit->nr_bos > 30)
      return false;

   /* On the kernel side, with 32K ringbuffer, we have an upper limit of 2k
    * cmds before we exceed the size of the ringbuffer, which results in
    * deadlock writing into the RB (ie. kernel doesn't finish writing into
    * the RB so it doesn't kick the GPU to start consuming from the RB)
    */
   if (submit->pipe->dev->deferred_cmds > 128)
      return false;

   return true;
}
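
/* The submit->flush() entry point.  The submit is appended to the device's
 * deferred list; if nothing forces an immediate flush (an in/out fence,
 * shared/exported bos, or should_defer() saying no) it simply stays deferred,
 * otherwise the whole deferred list is enqueued for the submit_queue thread.
 * Deferred submits from a different fd_pipe are flushed first, since submits
 * from different submitqueues cannot be merged.
 */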
static int
fd_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
                   struct fd_submit_fence *out_fence)
{
   struct fd_device *dev = submit->pipe->dev;
   struct fd_pipe *pipe = submit->pipe;

   /* Acquire lock before flush_prep() because it is possible to race between
    * this and pipe->flush():
    */
   simple_mtx_lock(&dev->submit_lock);

   /* If there are deferred submits from another fd_pipe, flush them now,
    * since we can't merge submits from different submitqueue's (ie. they
    * could have different priority, etc)
    */
   if (!list_is_empty(&dev->deferred_submits) &&
       (last_submit(&dev->deferred_submits)->pipe != submit->pipe)) {
      struct list_head submit_list;

      list_replace(&dev->deferred_submits, &submit_list);
      list_inithead(&dev->deferred_submits);
      dev->deferred_cmds = 0;

      enqueue_submit_list(&submit_list);
   }

   list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits);

   bool has_shared = fd_submit_sp_flush_prep(submit, in_fence_fd, out_fence);

   assert(fd_fence_before(pipe->last_enqueue_fence, submit->fence));
   pipe->last_enqueue_fence = submit->fence;

   /* If we don't need an out-fence, we can defer the submit.
    *
    * TODO we could defer submits with in-fence as well.. if we took our own
    * reference to the fd, and merged all the in-fence-fd's when we flush the
    * deferred submits
    */
   if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) {
      DEBUG_MSG("defer: %u", submit->fence);
      dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary);
      assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
      simple_mtx_unlock(&dev->submit_lock);

      return 0;
   }

   struct list_head submit_list;

   list_replace(&dev->deferred_submits, &submit_list);
   list_inithead(&dev->deferred_submits);
   dev->deferred_cmds = 0;

   simple_mtx_unlock(&dev->submit_lock);

   return enqueue_submit_list(&submit_list);
}
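
/* Flush deferred submits on this pipe up to (and including) the requested
 * fence, then block until the submit_queue thread has actually handed them
 * to the kernel (signalled via flush_cnd/last_submit_fence).
 */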
void
fd_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
{
   struct fd_device *dev = pipe->dev;
   struct list_head submit_list;

   DEBUG_MSG("flush: %u", fence);

   list_inithead(&submit_list);

   simple_mtx_lock(&dev->submit_lock);

   assert(!fd_fence_after(fence, pipe->last_enqueue_fence));

   foreach_submit_safe (deferred_submit, &dev->deferred_submits) {
      /* We should never have submits from multiple pipes in the deferred
       * list. If we did, we couldn't compare their fence to our fence,
       * since each fd_pipe is an independent timeline.
       */
      if (deferred_submit->pipe != pipe)
         break;

      if (fd_fence_after(deferred_submit->fence, fence))
         break;

      list_del(&deferred_submit->node);
      list_addtail(&deferred_submit->node, &submit_list);
      dev->deferred_cmds -= fd_ringbuffer_cmd_count(deferred_submit->primary);
   }

   assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));

   simple_mtx_unlock(&dev->submit_lock);

   if (list_is_empty(&submit_list))
      goto flush_sync;

   enqueue_submit_list(&submit_list);

flush_sync:
   /* Once we are sure that we've enqueued at least up to the requested
    * submit, we need to be sure that submitq has caught up and flushed
    * them to the kernel
    */
   pthread_mutex_lock(&flush_mtx);
   while (fd_fence_before(pipe->last_submit_fence, fence)) {
      pthread_cond_wait(&flush_cnd, &flush_mtx);
   }
   pthread_mutex_unlock(&flush_mtx);
}

static void
fd_submit_sp_destroy(struct fd_submit *submit)
{
   struct fd_submit_sp *fd_submit = to_fd_submit_sp(submit);

   if (fd_submit->suballoc_ring)
      fd_ringbuffer_del(fd_submit->suballoc_ring);

   _mesa_hash_table_destroy(fd_submit->bo_table, NULL);

   // TODO it would be nice to have a way to assert() if all
   // rb's haven't been free'd back to the slab, because that is
   // an indication that we are leaking bo's
   slab_destroy_child(&fd_submit->ring_pool);

   for (unsigned i = 0; i < fd_submit->nr_bos; i++)
      fd_bo_del(fd_submit->bos[i]);

   free(fd_submit->bos);
   free(fd_submit);
}

static const struct fd_submit_funcs submit_funcs = {
   .new_ringbuffer = fd_submit_sp_new_ringbuffer,
   .flush = fd_submit_sp_flush,
   .destroy = fd_submit_sp_destroy,
};
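
/* Create a new softpin submit for the given pipe.  The caller provides the
 * flush_submit_list() callback that does the actual kernel submission on the
 * submit_queue thread.  A backend would typically wrap this; a minimal
 * sketch, with a hypothetical backend name:
 *
 *    struct fd_submit *
 *    msm_submit_sp_new(struct fd_pipe *pipe)
 *    {
 *       // flush_submit_list does the backend-specific ioctl work
 *       return fd_submit_sp_new(pipe, msm_flush_submit_list);
 *    }
 */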
struct fd_submit *
fd_submit_sp_new(struct fd_pipe *pipe, flush_submit_list_fn flush_submit_list)
{
   struct fd_submit_sp *fd_submit = calloc(1, sizeof(*fd_submit));
   struct fd_submit *submit;

   fd_submit->bo_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                                 _mesa_key_pointer_equal);

   slab_create_child(&fd_submit->ring_pool, &pipe->ring_pool);

   fd_submit->flush_submit_list = flush_submit_list;

   submit = &fd_submit->base;
   submit->funcs = &submit_funcs;

   return submit;
}

void
fd_pipe_sp_ringpool_init(struct fd_pipe *pipe)
{
   // TODO tune size:
   slab_create_parent(&pipe->ring_pool, sizeof(struct fd_ringbuffer_sp), 16);
}

void
fd_pipe_sp_ringpool_fini(struct fd_pipe *pipe)
{
   if (pipe->ring_pool.num_elements)
      slab_destroy_parent(&pipe->ring_pool);
}
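
/* Record the ring's current write position as a finished cmd entry
 * (ring_bo + size in bytes), so it can be added to the submit's bos table
 * at flush time.
 */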
static void
finalize_current_cmd(struct fd_ringbuffer *ring)
{
   assert(!(ring->flags & _FD_RINGBUFFER_OBJECT));

   struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring);
   APPEND(&fd_ring->u, cmds,
          (struct fd_cmd_sp){
             .ring_bo = fd_bo_ref(fd_ring->ring_bo),
             .size = offset_bytes(ring->cur, ring->start),
          });
}
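
/* Grow a FD_RINGBUFFER_GROWABLE ring: finalize the cmd in the current BO and
 * continue emitting into a new BO of the requested size.
 */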
static void
fd_ringbuffer_sp_grow(struct fd_ringbuffer *ring, uint32_t size)
{
   struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring);
   struct fd_pipe *pipe = fd_ring->u.submit->pipe;

   assert(ring->flags & FD_RINGBUFFER_GROWABLE);

   finalize_current_cmd(ring);

   fd_bo_del(fd_ring->ring_bo);
   fd_ring->ring_bo = fd_bo_new_ring(pipe->dev, size);

   ring->start = fd_bo_map(fd_ring->ring_bo);
   ring->end = &(ring->start[size / 4]);
   ring->cur = ring->start;
   ring->size = size;
}

static inline bool
fd_ringbuffer_references_bo(struct fd_ringbuffer *ring, struct fd_bo *bo)
{
   struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring);

   for (int i = 0; i < fd_ring->u.nr_reloc_bos; i++) {
      if (fd_ring->u.reloc_bos[i] == bo)
         return true;
   }
   return false;
}
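
/* The emit_reloc/emit_reloc_ring implementations are generated twice from
 * freedreno_ringbuffer_sp_reloc.h, once for 64b and once for 32b gpu
 * addresses, and selected at runtime in fd_ringbuffer_sp_init() below.
 */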
#define PTRSZ 64
#include "freedreno_ringbuffer_sp_reloc.h"
#undef PTRSZ
#define PTRSZ 32
#include "freedreno_ringbuffer_sp_reloc.h"
#undef PTRSZ

static uint32_t
fd_ringbuffer_sp_cmd_count(struct fd_ringbuffer *ring)
{
   if (ring->flags & FD_RINGBUFFER_GROWABLE)
      return to_fd_ringbuffer_sp(ring)->u.nr_cmds + 1;
   return 1;
}
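
/* Returns false once the submit's bo table has grown past MAX_ARRAY_SIZE/2,
 * signalling to the caller that it is time to flush.  Only installed in the
 * non-object (submit owned) ring funcs tables.
 */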
static bool
fd_ringbuffer_sp_check_size(struct fd_ringbuffer *ring)
{
   assert(!(ring->flags & _FD_RINGBUFFER_OBJECT));
   struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring);
   struct fd_submit *submit = fd_ring->u.submit;

   if (to_fd_submit_sp(submit)->nr_bos > MAX_ARRAY_SIZE/2) {
      return false;
   }

   return true;
}

static void
fd_ringbuffer_sp_destroy(struct fd_ringbuffer *ring)
{
   struct fd_ringbuffer_sp *fd_ring = to_fd_ringbuffer_sp(ring);

   fd_bo_del(fd_ring->ring_bo);

   if (ring->flags & _FD_RINGBUFFER_OBJECT) {
      for (unsigned i = 0; i < fd_ring->u.nr_reloc_bos; i++) {
         fd_bo_del(fd_ring->u.reloc_bos[i]);
      }
      free(fd_ring->u.reloc_bos);
      free(fd_ring);
   } else {
      struct fd_submit *submit = fd_ring->u.submit;

      for (unsigned i = 0; i < fd_ring->u.nr_cmds; i++) {
         fd_bo_del(fd_ring->u.cmds[i].ring_bo);
      }
      free(fd_ring->u.cmds);

      slab_free(&to_fd_submit_sp(submit)->ring_pool, fd_ring);
   }
}

static const struct fd_ringbuffer_funcs ring_funcs_nonobj_32 = {
   .grow = fd_ringbuffer_sp_grow,
   .emit_reloc = fd_ringbuffer_sp_emit_reloc_nonobj_32,
   .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_32,
   .cmd_count = fd_ringbuffer_sp_cmd_count,
   .check_size = fd_ringbuffer_sp_check_size,
   .destroy = fd_ringbuffer_sp_destroy,
};

static const struct fd_ringbuffer_funcs ring_funcs_obj_32 = {
   .grow = fd_ringbuffer_sp_grow,
   .emit_reloc = fd_ringbuffer_sp_emit_reloc_obj_32,
   .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_32,
   .cmd_count = fd_ringbuffer_sp_cmd_count,
   .destroy = fd_ringbuffer_sp_destroy,
};

static const struct fd_ringbuffer_funcs ring_funcs_nonobj_64 = {
   .grow = fd_ringbuffer_sp_grow,
   .emit_reloc = fd_ringbuffer_sp_emit_reloc_nonobj_64,
   .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_64,
   .cmd_count = fd_ringbuffer_sp_cmd_count,
   .check_size = fd_ringbuffer_sp_check_size,
   .destroy = fd_ringbuffer_sp_destroy,
};

static const struct fd_ringbuffer_funcs ring_funcs_obj_64 = {
   .grow = fd_ringbuffer_sp_grow,
   .emit_reloc = fd_ringbuffer_sp_emit_reloc_obj_64,
   .emit_reloc_ring = fd_ringbuffer_sp_emit_reloc_ring_64,
   .cmd_count = fd_ringbuffer_sp_cmd_count,
   .destroy = fd_ringbuffer_sp_destroy,
};
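
/* Common init for both submit-owned rings and long-lived state-object rings:
 * map the (possibly suballocated) BO, set up the write pointers, and pick the
 * 32b vs 64b and obj vs nonobj funcs table.
 */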
static inline struct fd_ringbuffer *
fd_ringbuffer_sp_init(struct fd_ringbuffer_sp *fd_ring, uint32_t size,
                      enum fd_ringbuffer_flags flags)
{
   struct fd_ringbuffer *ring = &fd_ring->base;

   assert(fd_ring->ring_bo);

   uint8_t *base = fd_bo_map(fd_ring->ring_bo);
   ring->start = (void *)(base + fd_ring->offset);
   ring->end = &(ring->start[size / 4]);
   ring->cur = ring->start;

   ring->size = size;
   ring->flags = flags;

   if (flags & _FD_RINGBUFFER_OBJECT) {
      if (fd_dev_64b(&fd_ring->u.pipe->dev_id)) {
         ring->funcs = &ring_funcs_obj_64;
      } else {
         ring->funcs = &ring_funcs_obj_32;
      }
   } else {
      if (fd_dev_64b(&fd_ring->u.submit->pipe->dev_id)) {
         ring->funcs = &ring_funcs_nonobj_64;
      } else {
         ring->funcs = &ring_funcs_nonobj_32;
      }
   }

   // TODO initializing these could probably be conditional on flags
   // since unneeded for FD_RINGBUFFER_STAGING case..
   fd_ring->u.cmds = NULL;
   fd_ring->u.nr_cmds = fd_ring->u.max_cmds = 0;

   fd_ring->u.reloc_bos = NULL;
   fd_ring->u.nr_reloc_bos = fd_ring->u.max_reloc_bos = 0;

   return ring;
}
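
/* Allocate a long-lived state-object ringbuffer, suballocated from the
 * device-wide suballoc_bo.
 */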
struct fd_ringbuffer *
fd_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size)
{
   struct fd_device *dev = pipe->dev;
   struct fd_ringbuffer_sp *fd_ring = malloc(sizeof(*fd_ring));

   /* Lock access to the fd_device->suballoc_* since ringbuffer object
    * allocation can happen both on the frontend (most CSOs) and the driver
    * thread (a6xx cached tex state, for example)
    */
   simple_mtx_lock(&dev->suballoc_lock);

   /* Maximum known alignment requirement is a6xx's TEX_CONST at 16 dwords */
   fd_ring->offset = align(dev->suballoc_offset, 64);
   if (!dev->suballoc_bo ||
       fd_ring->offset + size > fd_bo_size(dev->suballoc_bo)) {
      if (dev->suballoc_bo)
         fd_bo_del(dev->suballoc_bo);
      dev->suballoc_bo =
         fd_bo_new_ring(dev, MAX2(SUBALLOC_SIZE, align(size, 4096)));
      fd_ring->offset = 0;
   }

   fd_ring->u.pipe = pipe;
   fd_ring->ring_bo = fd_bo_ref(dev->suballoc_bo);
   fd_ring->base.refcnt = 1;

   dev->suballoc_offset = fd_ring->offset + size;

   simple_mtx_unlock(&dev->suballoc_lock);

   return fd_ringbuffer_sp_init(fd_ring, size, _FD_RINGBUFFER_OBJECT);
}