/* mesa/src/virtio/vulkan/vn_ring.c */

/*
* Copyright 2021 Google LLC
* SPDX-License-Identifier: MIT
*/
#include "vn_ring.h"
#include <errno.h>
#include <sys/resource.h>
#include "venus-protocol/vn_protocol_driver_transport.h"
#include "vn_cs.h"
#include "vn_instance.h"
#include "vn_renderer.h"
#define VN_RING_IDLE_TIMEOUT_NS (1ull * 1000 * 1000)
static_assert(ATOMIC_INT_LOCK_FREE == 2 && sizeof(atomic_uint) == 4,
"vn_ring_shared requires lock-free 32-bit atomic_uint");
/* Pointers into the ring control structure in a BO shared with the renderer:
 * the renderer advances head and sets status bits, the driver advances tail
 * and clears status bits; buffer is the command ring itself and extra is an
 * optional region placed after the ring buffer.
 */
struct vn_ring_shared {
const volatile atomic_uint *head;
volatile atomic_uint *tail;
volatile atomic_uint *status;
void *buffer;
void *extra;
};
struct vn_ring {
uint64_t id;
struct vn_instance *instance;
struct vn_renderer_shmem *shmem;
uint32_t buffer_size;
uint32_t buffer_mask;
struct vn_ring_shared shared;
uint32_t cur;
/* This mutex ensures that:
 * - ring submissions are atomic
 * - setting the reply shmem and its paired ring submission are not
 *   interleaved with other submissions
 */
mtx_t mutex;
/* size limit for direct cmd submission via the ring shmem, computed as
 * (buffer_size >> direct_order) in vn_ring_create
 */
uint32_t direct_size;
/* used for indirect submission of large commands (non-VkCommandBuffer) */
struct vn_cs_encoder upload;
struct list_head submits;
struct list_head free_submits;
/* to synchronize renderer/ring */
mtx_t roundtrip_mutex;
uint64_t roundtrip_next;
int64_t last_notify;
int64_t next_notify;
};
struct vn_ring_submit {
uint32_t seqno;
struct list_head head;
/* BOs to keep alive (TODO make sure shmems are pinned) */
uint32_t shmem_count;
struct vn_renderer_shmem *shmems[];
};
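/* Per-submission scratch holder: cs points at the encoder actually written
 * to the ring, and indirect provides inline storage (spilling to malloc when
 * needed) for an encoded vkExecuteCommandStreamsMESA command when the
 * submission takes the indirect path.
 */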
struct vn_ring_submission {
const struct vn_cs_encoder *cs;
struct vn_ring_submit *submit;
struct {
struct vn_cs_encoder cs;
struct vn_cs_encoder_buffer buffer;
uint32_t data[64];
} indirect;
};
static uint32_t
vn_ring_load_head(const struct vn_ring *ring)
{
/* the renderer is expected to store the head with memory_order_release,
* forming a release-acquire ordering
*/
return atomic_load_explicit(ring->shared.head, memory_order_acquire);
}
static void
vn_ring_store_tail(struct vn_ring *ring)
{
/* the renderer is expected to load the tail with memory_order_acquire,
* forming a release-acquire ordering
*/
atomic_store_explicit(ring->shared.tail, ring->cur,
                      memory_order_release);
}
uint32_t
vn_ring_load_status(const struct vn_ring *ring)
{
/* must be called and ordered after vn_ring_store_tail for idle status */
return atomic_load_explicit(ring->shared.status, memory_order_seq_cst);
}
void
vn_ring_unset_status_bits(struct vn_ring *ring, uint32_t mask)
{
atomic_fetch_and_explicit(ring->shared.status, ~mask,
memory_order_seq_cst);
}
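/* Copy size bytes into the ring at ring->cur, splitting the copy in two when
 * the write wraps past the end of the power-of-two sized buffer.
 */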
static void
vn_ring_write_buffer(struct vn_ring *ring, const void *data, uint32_t size)
{
assert(ring->cur + size - vn_ring_load_head(ring) <= ring->buffer_size);
const uint32_t offset = ring->cur & ring->buffer_mask;
if (offset + size <= ring->buffer_size) {
memcpy(ring->shared.buffer + offset, data, size);
} else {
const uint32_t s = ring->buffer_size - offset;
memcpy(ring->shared.buffer + offset, data, s);
memcpy(ring->shared.buffer, data + s, size - s);
}
ring->cur += size;
}
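/* Compare two 32-bit ring seqnos with wraparound in mind: a is considered to
 * have reached b when ring->cur (the most recently produced seqno) does not
 * fall within the half-open modular interval [b, a).
 */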
static bool
vn_ring_ge_seqno(const struct vn_ring *ring, uint32_t a, uint32_t b)
{
/* This can return a false negative when not called often enough (e.g., when
 * called once every couple of hours), but subsequent calls with larger
 * values of a will correct it.
 *
 * TODO use real seqnos?
 */
if (a >= b)
return ring->cur >= a || ring->cur < b;
else
return ring->cur >= a && ring->cur < b;
}
static void
vn_ring_retire_submits(struct vn_ring *ring, uint32_t seqno)
{
struct vn_renderer *renderer = ring->instance->renderer;
list_for_each_entry_safe(struct vn_ring_submit, submit, &ring->submits,
head) {
if (!vn_ring_ge_seqno(ring, seqno, submit->seqno))
break;
for (uint32_t i = 0; i < submit->shmem_count; i++)
vn_renderer_shmem_unref(renderer, submit->shmems[i]);
list_move_to(&submit->head, &ring->free_submits);
}
}
bool
vn_ring_get_seqno_status(struct vn_ring *ring, uint32_t seqno)
{
return vn_ring_ge_seqno(ring, vn_ring_load_head(ring), seqno);
}
static void
vn_ring_wait_seqno(struct vn_ring *ring, uint32_t seqno)
{
/* A renderer wait incurs several hops and the renderer might poll
* repeatedly anyway. Let's just poll here.
*/
const enum vn_relax_reason reason = ring == ring->instance->ring.ring
? VN_RELAX_REASON_RING_SEQNO
: VN_RELAX_REASON_TLS_RING_SEQNO;
struct vn_relax_state relax_state = vn_relax_init(ring->instance, reason);
do {
if (vn_ring_get_seqno_status(ring, seqno)) {
vn_relax_fini(&relax_state);
return;
}
vn_relax(&relax_state);
} while (true);
}
void
vn_ring_wait_all(struct vn_ring *ring)
{
/* load from tail rather than ring->cur for atomicity */
const uint32_t pending_seqno =
atomic_load_explicit(ring->shared.tail, memory_order_relaxed);
vn_ring_wait_seqno(ring, pending_seqno);
}
static bool
vn_ring_has_space(const struct vn_ring *ring,
uint32_t size,
uint32_t *out_head)
{
const uint32_t head = vn_ring_load_head(ring);
if (likely(ring->cur + size - head <= ring->buffer_size)) {
*out_head = head;
return true;
}
return false;
}
static uint32_t
vn_ring_wait_space(struct vn_ring *ring, uint32_t size)
{
assert(size <= ring->buffer_size);
uint32_t head;
if (likely(vn_ring_has_space(ring, size, &head)))
return head;
{
VN_TRACE_FUNC();
/* see the reasoning in vn_ring_wait_seqno */
struct vn_relax_state relax_state =
vn_relax_init(ring->instance, VN_RELAX_REASON_RING_SPACE);
do {
vn_relax(&relax_state);
if (vn_ring_has_space(ring, size, &head)) {
vn_relax_fini(&relax_state);
return head;
}
} while (true);
}
}
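/* Compute the shmem layout for a ring: the head, tail, and status words each
 * get their own 64-byte-aligned slot (presumably to keep the renderer- and
 * driver-written words on separate cache lines), followed by the
 * power-of-two command buffer and an optional extra region.
 */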
void
vn_ring_get_layout(size_t buf_size,
size_t extra_size,
struct vn_ring_layout *layout)
{
/* this can be changed/extended quite freely */
struct layout {
alignas(64) uint32_t head;
alignas(64) uint32_t tail;
alignas(64) uint32_t status;
alignas(64) uint8_t buffer[];
};
assert(buf_size && util_is_power_of_two_or_zero(buf_size));
layout->head_offset = offsetof(struct layout, head);
layout->tail_offset = offsetof(struct layout, tail);
layout->status_offset = offsetof(struct layout, status);
layout->buffer_offset = offsetof(struct layout, buffer);
layout->buffer_size = buf_size;
layout->extra_offset = layout->buffer_offset + layout->buffer_size;
layout->extra_size = extra_size;
layout->shmem_size = layout->extra_offset + layout->extra_size;
}
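/* Create a ring: allocate and zero the shared shmem, set up the shared
 * pointers from the layout, and ask the renderer to instantiate the ring via
 * vkCreateRingMESA, optionally forwarding the process priority when the
 * protocol version supports VkRingPriorityInfoMESA.
 */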
struct vn_ring *
vn_ring_create(struct vn_instance *instance,
const struct vn_ring_layout *layout,
uint8_t direct_order)
{
VN_TRACE_FUNC();
const VkAllocationCallbacks *alloc = &instance->base.base.alloc;
struct vn_ring *ring = vk_zalloc(alloc, sizeof(*ring), VN_DEFAULT_ALIGN,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (!ring)
return NULL;
ring->id = (uintptr_t)ring;
ring->instance = instance;
ring->shmem =
vn_renderer_shmem_create(instance->renderer, layout->shmem_size);
if (!ring->shmem) {
if (VN_DEBUG(INIT))
vn_log(instance, "failed to allocate/map ring shmem");
vk_free(alloc, ring);
return NULL;
}
void *shared = ring->shmem->mmap_ptr;
memset(shared, 0, layout->shmem_size);
assert(layout->buffer_size &&
util_is_power_of_two_or_zero(layout->buffer_size));
ring->buffer_size = layout->buffer_size;
ring->buffer_mask = ring->buffer_size - 1;
ring->shared.head = shared + layout->head_offset;
ring->shared.tail = shared + layout->tail_offset;
ring->shared.status = shared + layout->status_offset;
ring->shared.buffer = shared + layout->buffer_offset;
ring->shared.extra = shared + layout->extra_offset;
mtx_init(&ring->mutex, mtx_plain);
ring->direct_size = layout->buffer_size >> direct_order;
assert(ring->direct_size);
vn_cs_encoder_init(&ring->upload, instance,
VN_CS_ENCODER_STORAGE_SHMEM_ARRAY, 1 * 1024 * 1024);
list_inithead(&ring->submits);
list_inithead(&ring->free_submits);
mtx_init(&ring->roundtrip_mutex, mtx_plain);
ring->roundtrip_next = 1;
/* VkRingPriorityInfoMESA support requires
* VK_MESA_VENUS_PROTOCOL_SPEC_VERSION >= 2 */
int prio = 0;
bool ring_priority = false;
if (instance->renderer->info.vk_mesa_venus_protocol_spec_version >= 2) {
errno = 0;
prio = getpriority(PRIO_PROCESS, 0);
ring_priority = !(prio == -1 && errno);
}
const struct VkRingPriorityInfoMESA priority_info = {
.sType = VK_STRUCTURE_TYPE_RING_PRIORITY_INFO_MESA,
.priority = prio,
};
const struct VkRingMonitorInfoMESA monitor_info = {
.sType = VK_STRUCTURE_TYPE_RING_MONITOR_INFO_MESA,
.pNext = ring_priority ? &priority_info : NULL,
.maxReportingPeriodMicroseconds = VN_WATCHDOG_REPORT_PERIOD_US,
};
const struct VkRingCreateInfoMESA info = {
.sType = VK_STRUCTURE_TYPE_RING_CREATE_INFO_MESA,
.pNext = &monitor_info,
.resourceId = ring->shmem->res_id,
.size = layout->shmem_size,
.idleTimeout = VN_RING_IDLE_TIMEOUT_NS,
.headOffset = layout->head_offset,
.tailOffset = layout->tail_offset,
.statusOffset = layout->status_offset,
.bufferOffset = layout->buffer_offset,
.bufferSize = layout->buffer_size,
.extraOffset = layout->extra_offset,
.extraSize = layout->extra_size,
};
uint32_t create_ring_data[64];
struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL(
create_ring_data, sizeof(create_ring_data));
vn_encode_vkCreateRingMESA(&local_enc, 0, ring->id, &info);
vn_renderer_submit_simple(instance->renderer, create_ring_data,
vn_cs_encoder_get_len(&local_enc));
return ring;
}
void
vn_ring_destroy(struct vn_ring *ring)
{
VN_TRACE_FUNC();
const VkAllocationCallbacks *alloc = &ring->instance->base.base.alloc;
uint32_t destroy_ring_data[4];
struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL(
destroy_ring_data, sizeof(destroy_ring_data));
vn_encode_vkDestroyRingMESA(&local_enc, 0, ring->id);
vn_renderer_submit_simple(ring->instance->renderer, destroy_ring_data,
vn_cs_encoder_get_len(&local_enc));
mtx_destroy(&ring->roundtrip_mutex);
vn_ring_retire_submits(ring, ring->cur);
assert(list_is_empty(&ring->submits));
list_for_each_entry_safe(struct vn_ring_submit, submit,
&ring->free_submits, head)
free(submit);
vn_cs_encoder_fini(&ring->upload);
vn_renderer_shmem_unref(ring->instance->renderer, ring->shmem);
mtx_destroy(&ring->mutex);
vk_free(alloc, ring);
}
uint64_t
vn_ring_get_id(struct vn_ring *ring)
{
return ring->id;
}
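/* Get a submit tracker, reusing one from the free list when it is known to
 * be large enough (every allocated entry can hold at least min_shmem_count
 * shmems); otherwise allocate one sized for shmem_count.
 */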
static struct vn_ring_submit *
vn_ring_get_submit(struct vn_ring *ring, uint32_t shmem_count)
{
const uint32_t min_shmem_count = 2;
struct vn_ring_submit *submit;
/* TODO this could be simplified if we could omit shmem_count */
if (shmem_count <= min_shmem_count &&
!list_is_empty(&ring->free_submits)) {
submit =
list_first_entry(&ring->free_submits, struct vn_ring_submit, head);
list_del(&submit->head);
} else {
const size_t submit_size = offsetof(
struct vn_ring_submit, shmems[MAX2(shmem_count, min_shmem_count)]);
submit = malloc(submit_size);
}
return submit;
}
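/* Copy the cs into the ring buffer (waiting for space per encoder buffer),
 * publish the new tail, check the ring status, retire submits the renderer
 * has already consumed, and queue this submit for later retirement. Returns
 * whether the renderer should be notified because the ring reported itself
 * idle and the notification throttle has expired.
 */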
static bool
vn_ring_submit_internal(struct vn_ring *ring,
struct vn_ring_submit *submit,
const struct vn_cs_encoder *cs,
uint32_t *seqno)
{
/* write cs to the ring */
assert(!vn_cs_encoder_is_empty(cs));
/* avoid -Wmaybe-uninitialized */
uint32_t cur_seqno = 0;
for (uint32_t i = 0; i < cs->buffer_count; i++) {
const struct vn_cs_encoder_buffer *buf = &cs->buffers[i];
cur_seqno = vn_ring_wait_space(ring, buf->committed_size);
vn_ring_write_buffer(ring, buf->base, buf->committed_size);
}
vn_ring_store_tail(ring);
const VkRingStatusFlagsMESA status = vn_ring_load_status(ring);
if (status & VK_RING_STATUS_FATAL_BIT_MESA) {
vn_log(NULL, "vn_ring_submit abort on fatal");
abort();
}
vn_ring_retire_submits(ring, cur_seqno);
submit->seqno = ring->cur;
list_addtail(&submit->head, &ring->submits);
*seqno = submit->seqno;
/* Notify the renderer to wake up an idle ring if at least
 * VN_RING_IDLE_TIMEOUT_NS has passed since the last notification, to avoid
 * excessive wake-up calls (these are non-trivial since they are submitted
 * via the virtio-gpu kernel driver).
 */
if (status & VK_RING_STATUS_IDLE_BIT_MESA) {
const int64_t now = os_time_get_nano();
if (os_time_timeout(ring->last_notify, ring->next_notify, now)) {
ring->last_notify = now;
ring->next_notify = now + VN_RING_IDLE_TIMEOUT_NS;
return true;
}
}
return false;
}
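/* For a direct submission the caller's cs is written into the ring as-is.
 * Otherwise, encode a vkExecuteCommandStreamsMESA command that references
 * the cs buffers by shmem resource id, using the inline indirect.data
 * storage when the encoded command fits and heap allocation when it does
 * not.
 */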
static const struct vn_cs_encoder *
vn_ring_submission_get_cs(struct vn_ring_submission *submit,
const struct vn_cs_encoder *cs,
bool direct)
{
if (direct)
return cs;
STACK_ARRAY(VkCommandStreamDescriptionMESA, descs, cs->buffer_count);
uint32_t desc_count = 0;
for (uint32_t i = 0; i < cs->buffer_count; i++) {
const struct vn_cs_encoder_buffer *buf = &cs->buffers[i];
if (buf->committed_size) {
descs[desc_count++] = (VkCommandStreamDescriptionMESA){
.resourceId = buf->shmem->res_id,
.offset = buf->offset,
.size = buf->committed_size,
};
}
}
const size_t exec_size = vn_sizeof_vkExecuteCommandStreamsMESA(
desc_count, descs, NULL, 0, NULL, 0);
void *exec_data = submit->indirect.data;
if (exec_size > sizeof(submit->indirect.data)) {
exec_data = malloc(exec_size);
if (!exec_data) {
STACK_ARRAY_FINISH(descs);
return NULL;
}
}
submit->indirect.buffer = VN_CS_ENCODER_BUFFER_INITIALIZER(exec_data);
submit->indirect.cs =
VN_CS_ENCODER_INITIALIZER(&submit->indirect.buffer, exec_size);
vn_encode_vkExecuteCommandStreamsMESA(&submit->indirect.cs, 0, desc_count,
descs, NULL, 0, NULL, 0);
vn_cs_encoder_commit(&submit->indirect.cs);
STACK_ARRAY_FINISH(descs);
return &submit->indirect.cs;
}
static struct vn_ring_submit *
vn_ring_submission_get_ring_submit(struct vn_ring *ring,
const struct vn_cs_encoder *cs,
struct vn_renderer_shmem *extra_shmem,
bool direct)
{
struct vn_renderer *renderer = ring->instance->renderer;
const uint32_t shmem_count =
(direct ? 0 : cs->buffer_count) + (extra_shmem ? 1 : 0);
struct vn_ring_submit *submit = vn_ring_get_submit(ring, shmem_count);
if (!submit)
return NULL;
submit->shmem_count = shmem_count;
if (!direct) {
for (uint32_t i = 0; i < cs->buffer_count; i++) {
submit->shmems[i] =
vn_renderer_shmem_ref(renderer, cs->buffers[i].shmem);
}
}
if (extra_shmem) {
submit->shmems[shmem_count - 1] =
vn_renderer_shmem_ref(renderer, extra_shmem);
}
return submit;
}
static inline void
vn_ring_submission_cleanup(struct vn_ring_submission *submit)
{
if (submit->cs == &submit->indirect.cs &&
submit->indirect.buffer.base != submit->indirect.data)
free(submit->indirect.buffer.base);
}
static VkResult
vn_ring_submission_prepare(struct vn_ring *ring,
struct vn_ring_submission *submit,
const struct vn_cs_encoder *cs,
struct vn_renderer_shmem *extra_shmem,
bool direct)
{
submit->cs = vn_ring_submission_get_cs(submit, cs, direct);
if (!submit->cs)
return VK_ERROR_OUT_OF_HOST_MEMORY;
submit->submit =
vn_ring_submission_get_ring_submit(ring, cs, extra_shmem, direct);
if (!submit->submit) {
vn_ring_submission_cleanup(submit);
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
return VK_SUCCESS;
}
static inline bool
vn_ring_submission_can_direct(const struct vn_ring *ring,
const struct vn_cs_encoder *cs)
{
return vn_cs_encoder_get_len(cs) <= ring->direct_size;
}
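/* Copy a pointer-backed cs into the ring's shmem-array upload encoder so
 * that a command too large for direct submission can be submitted indirectly
 * by resource id.
 */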
static struct vn_cs_encoder *
vn_ring_cs_upload_locked(struct vn_ring *ring, const struct vn_cs_encoder *cs)
{
VN_TRACE_FUNC();
assert(cs->storage_type == VN_CS_ENCODER_STORAGE_POINTER &&
cs->buffer_count == 1);
const void *cs_data = cs->buffers[0].base;
const size_t cs_size = cs->total_committed_size;
assert(cs_size == vn_cs_encoder_get_len(cs));
struct vn_cs_encoder *upload = &ring->upload;
vn_cs_encoder_reset(upload);
if (!vn_cs_encoder_reserve(upload, cs_size))
return NULL;
vn_cs_encoder_write(upload, cs_size, cs_data, cs_size);
vn_cs_encoder_commit(upload);
if (vn_cs_encoder_needs_roundtrip(upload))
vn_ring_roundtrip(ring);
return upload;
}
static VkResult
vn_ring_submit_locked(struct vn_ring *ring,
const struct vn_cs_encoder *cs,
struct vn_renderer_shmem *extra_shmem,
uint32_t *ring_seqno)
{
const bool direct = vn_ring_submission_can_direct(ring, cs);
if (!direct && cs->storage_type == VN_CS_ENCODER_STORAGE_POINTER) {
cs = vn_ring_cs_upload_locked(ring, cs);
if (!cs)
return VK_ERROR_OUT_OF_HOST_MEMORY;
assert(cs->storage_type != VN_CS_ENCODER_STORAGE_POINTER);
}
struct vn_ring_submission submit;
VkResult result =
vn_ring_submission_prepare(ring, &submit, cs, extra_shmem, direct);
if (result != VK_SUCCESS)
return result;
uint32_t seqno;
const bool notify =
vn_ring_submit_internal(ring, submit.submit, submit.cs, &seqno);
if (notify) {
uint32_t notify_ring_data[8];
struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL(
notify_ring_data, sizeof(notify_ring_data));
vn_encode_vkNotifyRingMESA(&local_enc, 0, ring->id, seqno, 0);
vn_renderer_submit_simple(ring->instance->renderer, notify_ring_data,
vn_cs_encoder_get_len(&local_enc));
}
vn_ring_submission_cleanup(&submit);
if (ring_seqno)
*ring_seqno = seqno;
return VK_SUCCESS;
}
VkResult
vn_ring_submit_command_simple(struct vn_ring *ring,
const struct vn_cs_encoder *cs)
{
mtx_lock(&ring->mutex);
VkResult result = vn_ring_submit_locked(ring, cs, NULL, NULL);
mtx_unlock(&ring->mutex);
return result;
}
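/* Point the renderer at the reply shmem region for the next command. Must be
 * called with ring->mutex held so the reply stream and the command it pairs
 * with are not interleaved with other submissions.
 */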
static inline void
vn_ring_set_reply_shmem_locked(struct vn_ring *ring,
struct vn_renderer_shmem *shmem,
size_t offset,
size_t size)
{
uint32_t set_reply_command_stream_data[16];
struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL(
set_reply_command_stream_data, sizeof(set_reply_command_stream_data));
const struct VkCommandStreamDescriptionMESA stream = {
.resourceId = shmem->res_id,
.offset = offset,
.size = size,
};
vn_encode_vkSetReplyCommandStreamMESA(&local_enc, 0, &stream);
vn_cs_encoder_commit(&local_enc);
vn_ring_submit_locked(ring, &local_enc, NULL, NULL);
}
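/* Submit a command through the ring and, when reply_size is non-zero, wait
 * for and expose the renderer's reply. A minimal caller sketch (hypothetical
 * and for illustration only; actual callers are typically the generated
 * vn_call_* and vn_async_* wrappers):
 *
 *    struct vn_ring_submit_command submit = { .reply_size = reply_size };
 *    // ... encode the command into submit.command ...
 *    vn_ring_submit_command(ring, &submit);
 *    if (reply_size && submit.reply_shmem) {
 *       // ... decode the result from submit.reply ...
 *       vn_ring_free_command_reply(ring, &submit);
 *    }
 */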
void
vn_ring_submit_command(struct vn_ring *ring,
struct vn_ring_submit_command *submit)
{
assert(!vn_cs_encoder_is_empty(&submit->command));
vn_cs_encoder_commit(&submit->command);
size_t reply_offset = 0;
if (submit->reply_size) {
submit->reply_shmem = vn_instance_reply_shmem_alloc(
ring->instance, submit->reply_size, &reply_offset);
if (!submit->reply_shmem)
return;
if (ring->instance->renderer->info.has_guest_vram &&
!submit->reply_shmem->cache_timestamp)
vn_ring_roundtrip(ring);
}
mtx_lock(&ring->mutex);
if (submit->reply_size) {
vn_ring_set_reply_shmem_locked(ring, submit->reply_shmem, reply_offset,
submit->reply_size);
}
submit->ring_seqno_valid =
VK_SUCCESS == vn_ring_submit_locked(ring, &submit->command,
submit->reply_shmem,
&submit->ring_seqno);
mtx_unlock(&ring->mutex);
if (submit->reply_size) {
if (likely(submit->ring_seqno_valid)) {
void *reply_ptr = submit->reply_shmem->mmap_ptr + reply_offset;
submit->reply =
VN_CS_DECODER_INITIALIZER(reply_ptr, submit->reply_size);
vn_ring_wait_seqno(ring, submit->ring_seqno);
} else {
vn_renderer_shmem_unref(ring->instance->renderer,
submit->reply_shmem);
submit->reply_shmem = NULL;
}
}
}
void
vn_ring_free_command_reply(struct vn_ring *ring,
struct vn_ring_submit_command *submit)
{
assert(submit->reply_shmem);
vn_renderer_shmem_unref(ring->instance->renderer, submit->reply_shmem);
}
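/* Roundtrips order the ring against the renderer's virtqueue: the seqno is
 * submitted directly through the renderer (bypassing the ring), while the
 * wait is an async command placed on the ring, so that later ring commands
 * are processed only after the renderer has reached that virtqueue seqno.
 */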
VkResult
vn_ring_submit_roundtrip(struct vn_ring *ring, uint64_t *roundtrip_seqno)
{
uint32_t local_data[8];
struct vn_cs_encoder local_enc =
VN_CS_ENCODER_INITIALIZER_LOCAL(local_data, sizeof(local_data));
mtx_lock(&ring->roundtrip_mutex);
const uint64_t seqno = ring->roundtrip_next++;
vn_encode_vkSubmitVirtqueueSeqnoMESA(&local_enc, 0, ring->id, seqno);
VkResult result =
vn_renderer_submit_simple(ring->instance->renderer, local_data,
vn_cs_encoder_get_len(&local_enc));
mtx_unlock(&ring->roundtrip_mutex);
*roundtrip_seqno = seqno;
return result;
}
void
vn_ring_wait_roundtrip(struct vn_ring *ring, uint64_t roundtrip_seqno)
{
vn_async_vkWaitVirtqueueSeqnoMESA(ring, roundtrip_seqno);
}