anv: Use the new common device lost tracking

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13427>
This commit is contained in:
Jason Ekstrand 2021-10-19 18:44:01 -05:00
parent dd89ef96d7
commit 955f329fbe
6 changed files with 38 additions and 144 deletions

View File

@ -2018,9 +2018,9 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
(void *)(uintptr_t) query_info->oa_metrics_set_id);
if (ret < 0) {
result = anv_device_set_lost(device,
"i915-perf config failed: %s",
strerror(errno));
result = vk_device_set_lost(&device->vk,
"i915-perf config failed: %s",
strerror(errno));
}
}
@ -2043,13 +2043,13 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
int ret = queue->device->info.no_hw ? 0 :
anv_gem_execbuffer(queue->device, &query_pass_execbuf);
if (ret)
result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
}
int ret = queue->device->info.no_hw ? 0 :
anv_gem_execbuffer(queue->device, &execbuf.execbuf);
if (ret)
result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
struct drm_i915_gem_exec_object2 *objects = execbuf.objects;
for (uint32_t k = 0; k < execbuf.bo_count; k++) {

View File

@ -3010,7 +3010,6 @@ VkResult anv_CreateDevice(
}
device->physical = physical_device;
device->_lost = false;
/* XXX(chadv): Can we dup() physicalDevice->fd here? */
device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC);
@ -3439,74 +3438,6 @@ VkResult anv_EnumerateInstanceLayerProperties(
return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
}
/* Emit the device-lost errors that queues recorded but did not report.
 *
 * The caller must only invoke this once device->_lost is non-zero; that is
 * asserted below.  The device is marked as reported, and then every queue
 * whose `lost` flag is set has its stored file, line and message passed to
 * __vk_errorf() as a VK_ERROR_DEVICE_LOST error.
 */
void
_anv_device_report_lost(struct anv_device *device)
{
assert(p_atomic_read(&device->_lost) > 0);
/* Set before walking the queues so anv_device_is_lost() stops calling us. */
device->lost_reported = true;
for (uint32_t i = 0; i < device->queue_count; i++) {
struct anv_queue *queue = &device->queues[i];
if (queue->lost) {
__vk_errorf(queue, VK_ERROR_DEVICE_LOST,
queue->error_file, queue->error_line,
"%s", queue->error_msg);
}
}
}
/* Mark the device as lost and report the error immediately.
 *
 * file/line identify the call site (supplied by the anv_device_set_lost()
 * macro); msg and the variadic arguments are a printf-style description.
 *
 * Returns VK_ERROR_DEVICE_LOST without reporting anything if the device was
 * already lost; otherwise returns the value produced by __vk_errorv().
 * If the ANV_ABORT_ON_DEVICE_LOSS environment variable is set to a true
 * value (default: false), the process aborts after the error is reported.
 */
VkResult
_anv_device_set_lost(struct anv_device *device,
const char *file, int line,
const char *msg, ...)
{
VkResult err;
va_list ap;
/* Only the first loss is reported; later calls just return the error code.
 * NOTE(review): the read and the increment below are two separate atomic
 * operations, so two threads could both pass this check - confirm that a
 * duplicate report is acceptable.
 */
if (p_atomic_read(&device->_lost) > 0)
return VK_ERROR_DEVICE_LOST;
p_atomic_inc(&device->_lost);
device->lost_reported = true;
va_start(ap, msg);
err = __vk_errorv(device, VK_ERROR_DEVICE_LOST, file, line, msg, ap);
va_end(ap);
if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false))
abort();
return err;
}
/* Mark a queue (and with it the device) as lost, deferring the report.
 *
 * Unlike _anv_device_set_lost(), nothing is reported here: the call site
 * (file/line) and the formatted message are stored on the queue, and
 * _anv_device_report_lost() emits them later.  The message is formatted
 * with vsnprintf() into queue->error_msg, so it is truncated to that
 * buffer's size.
 *
 * Always returns VK_ERROR_DEVICE_LOST.  Only the first call for a given
 * queue records anything.  If the ANV_ABORT_ON_DEVICE_LOSS environment
 * variable is set to a true value (default: false), the process aborts.
 */
VkResult
_anv_queue_set_lost(struct anv_queue *queue,
const char *file, int line,
const char *msg, ...)
{
va_list ap;
/* Keep the details of the first failure only. */
if (queue->lost)
return VK_ERROR_DEVICE_LOST;
queue->lost = true;
queue->error_file = file;
queue->error_line = line;
va_start(ap, msg);
vsnprintf(queue->error_msg, sizeof(queue->error_msg),
msg, ap);
va_end(ap);
/* Make the loss visible device-wide; device->lost_reported is not touched
 * here, so the next anv_device_is_lost() call triggers the report.
 */
p_atomic_inc(&queue->device->_lost);
if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false))
abort();
return VK_ERROR_DEVICE_LOST;
}
VkResult
anv_device_query_status(struct anv_device *device)
{
@ -3514,7 +3445,7 @@ anv_device_query_status(struct anv_device *device)
* for it. However, it doesn't hurt to check and it potentially lets us
* avoid an ioctl.
*/
if (anv_device_is_lost(device))
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
uint32_t active, pending;
@ -3522,13 +3453,13 @@ anv_device_query_status(struct anv_device *device)
&active, &pending);
if (ret == -1) {
/* We don't know the real error. */
return anv_device_set_lost(device, "get_reset_stats failed: %m");
return vk_device_set_lost(&device->vk, "get_reset_stats failed: %m");
}
if (active) {
return anv_device_set_lost(device, "GPU hung on one of our command buffers");
return vk_device_set_lost(&device->vk, "GPU hung on one of our command buffers");
} else if (pending) {
return anv_device_set_lost(device, "GPU hung with commands in-flight");
return vk_device_set_lost(&device->vk, "GPU hung with commands in-flight");
}
return VK_SUCCESS;
@ -3546,7 +3477,7 @@ anv_device_bo_busy(struct anv_device *device, struct anv_bo *bo)
return VK_NOT_READY;
} else if (ret == -1) {
/* We don't know the real error. */
return anv_device_set_lost(device, "gem wait failed: %m");
return vk_device_set_lost(&device->vk, "gem wait failed: %m");
}
/* Query for device status after the busy call. If the BO we're checking
@ -3567,7 +3498,7 @@ anv_device_wait(struct anv_device *device, struct anv_bo *bo,
return VK_TIMEOUT;
} else if (ret == -1) {
/* We don't know the real error. */
return anv_device_set_lost(device, "gem wait failed: %m");
return vk_device_set_lost(&device->vk, "gem wait failed: %m");
}
/* Query for device status after the wait. If the BO we're waiting on got
@ -4198,7 +4129,7 @@ VkResult anv_QueueBindSparse(
VkFence fence)
{
ANV_FROM_HANDLE(anv_queue, queue, _queue);
if (anv_device_is_lost(queue->device))
if (vk_device_is_lost(&queue->device->vk))
return VK_ERROR_DEVICE_LOST;
return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
@ -4254,7 +4185,7 @@ VkResult anv_GetEventStatus(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_event, event, _event);
if (anv_device_is_lost(device))
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
return *(uint64_t *)event->state.map;
@ -4609,8 +4540,8 @@ VkResult anv_GetCalibratedTimestampsEXT(
&pTimestamps[d]);
if (ret != 0) {
return anv_device_set_lost(device, "Failed to read the TIMESTAMP "
"register: %m");
return vk_device_set_lost(&device->vk, "Failed to read the "
"TIMESTAMP register: %m");
}
uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
max_clock_period = MAX2(max_clock_period, device_period);

View File

@ -285,7 +285,7 @@ VkResult anv_QueueSetPerformanceConfigurationINTEL(
int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
(void *)(uintptr_t) config->config_id);
if (ret < 0)
return anv_device_set_lost(device, "i915-perf config failed: %m");
return vk_device_set_lost(&device->vk, "i915-perf config failed: %m");
}
}

View File

@ -1074,15 +1074,6 @@ struct anv_queue {
uint32_t exec_flags;
/* Set once from the device api calls. */
bool lost_signaled;
/* Only set once atomically by the queue */
int lost;
int error_line;
const char * error_file;
char error_msg[80];
/*
* This mutex protects the variables below.
*/
@ -1241,8 +1232,6 @@ struct anv_device {
pthread_mutex_t mutex;
pthread_cond_t queue_submit;
int _lost;
int lost_reported;
struct intel_batch_decode_ctx decoder_ctx;
/*
@ -1322,31 +1311,6 @@ anv_mocs(const struct anv_device *device,
void anv_device_init_blorp(struct anv_device *device);
void anv_device_finish_blorp(struct anv_device *device);
void _anv_device_report_lost(struct anv_device *device);
VkResult _anv_device_set_lost(struct anv_device *device,
const char *file, int line,
const char *msg, ...)
anv_printflike(4, 5);
VkResult _anv_queue_set_lost(struct anv_queue *queue,
const char *file, int line,
const char *msg, ...)
anv_printflike(4, 5);
/* Mark the device lost, capturing the caller's __FILE__/__LINE__. */
#define anv_device_set_lost(dev, ...) \
_anv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__)
/* Mark a queue lost, capturing the caller's __FILE__/__LINE__.  When the
 * device has has_thread_submit set, the loss is recorded on the queue via
 * _anv_queue_set_lost(); otherwise it is reported directly on the device
 * via _anv_device_set_lost().
 *
 * NOTE(review): the expansion is a bare ternary that is not wrapped in
 * parentheses, and the last line uses `queue` without parentheses, so the
 * macro is only safe with a simple argument and in assignment/return
 * contexts - confirm all call sites.
 */
#define anv_queue_set_lost(queue, ...) \
(queue)->device->has_thread_submit ? \
_anv_queue_set_lost(queue, __FILE__, __LINE__, __VA_ARGS__) : \
_anv_device_set_lost(queue->device, __FILE__, __LINE__, __VA_ARGS__)
/* Return whether the device has been marked lost.
 *
 * Reads device->_lost atomically.  If the device is lost but the loss has
 * not been reported yet (device->lost_reported is unset), it first calls
 * _anv_device_report_lost() to emit the pending per-queue errors.
 */
static inline bool
anv_device_is_lost(struct anv_device *device)
{
int lost = p_atomic_read(&device->_lost);
if (unlikely(lost && !device->lost_reported))
_anv_device_report_lost(device);
/* The non-zero counter converts to true through the bool return type. */
return lost;
}
VkResult anv_device_query_status(struct anv_device *device);

View File

@ -389,7 +389,7 @@ anv_queue_task(void *_queue)
* wakeup the second queue thread first, this would make that execbuf
* fail because the dma-fence it depends on hasn't materialized yet.
*/
if (!queue->lost && submit->wait_timeline_count > 0) {
if (!vk_queue_is_lost(&queue->vk) && submit->wait_timeline_count > 0) {
int ret = queue->device->info.no_hw ? 0 :
anv_gem_syncobj_timeline_wait(
queue->device, submit->wait_timeline_syncobjs,
@ -397,13 +397,13 @@ anv_queue_task(void *_queue)
anv_get_absolute_timeout(UINT64_MAX) /* wait forever */,
true /* wait for all */, true /* wait for materialize */);
if (ret) {
result = anv_queue_set_lost(queue, "timeline timeout: %s",
strerror(errno));
result = vk_queue_set_lost(&queue->vk, "timeline timeout: %s",
strerror(errno));
}
}
/* Now submit */
if (!queue->lost) {
if (!vk_queue_is_lost(&queue->vk)) {
pthread_mutex_lock(&queue->device->mutex);
result = anv_queue_execbuf_locked(queue, submit);
pthread_mutex_unlock(&queue->device->mutex);
@ -459,7 +459,7 @@ anv_queue_submit_post(struct anv_queue *queue,
int ret = pthread_cond_wait(&queue->device->queue_submit,
&queue->device->mutex);
if (ret != 0) {
result = anv_device_set_lost(queue->device, "wait timeout");
result = vk_device_set_lost(&queue->device->vk, "wait timeout");
break;
}
@ -491,7 +491,6 @@ anv_queue_init(struct anv_device *device, struct anv_queue *queue,
queue->family = &pdevice->queue.families[queue->vk.queue_family_index];
queue->exec_flags = exec_flags;
queue->lost = false;
queue->quit = false;
list_inithead(&queue->queued_submits);
@ -800,7 +799,7 @@ anv_queue_submit_simple_batch(struct anv_queue *queue,
if (has_syncobj_wait) {
if (anv_gem_syncobj_wait(device, &syncobj, 1,
anv_get_absolute_timeout(INT64_MAX), true))
result = anv_device_set_lost(device, "anv_gem_syncobj_wait failed: %m");
result = vk_device_set_lost(&device->vk, "anv_gem_syncobj_wait failed: %m");
anv_gem_syncobj_destroy(device, syncobj);
} else {
result = anv_device_wait(device, sync_bo,
@ -1004,8 +1003,8 @@ anv_queue_submit_add_in_semaphore(struct anv_queue *queue,
true /* wait_all */,
true /* wait_materialize */);
if (ret != 0) {
return anv_queue_set_lost(queue,
"unable to wait on syncobj to materialize");
return vk_queue_set_lost(&queue->vk,
"unable to wait on syncobj to materialize");
}
}
@ -1459,7 +1458,7 @@ out:
* vk_device_set_lost() would have been called already by a callee of
* anv_queue_submit().
*/
result = anv_device_set_lost(device, "vkQueueSubmit2KHR() failed");
result = vk_device_set_lost(&device->vk, "vkQueueSubmit2KHR() failed");
}
return result;
@ -1470,7 +1469,7 @@ VkResult anv_QueueWaitIdle(
{
ANV_FROM_HANDLE(anv_queue, queue, _queue);
if (anv_device_is_lost(queue->device))
if (vk_device_is_lost(&queue->device->vk))
return VK_ERROR_DEVICE_LOST;
return anv_queue_submit_simple_batch(queue, NULL);
@ -1626,7 +1625,7 @@ VkResult anv_GetFenceStatus(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_fence, fence, _fence);
if (anv_device_is_lost(device))
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
struct anv_fence_impl *impl =
@ -1670,7 +1669,7 @@ VkResult anv_GetFenceStatus(
return VK_NOT_READY;
} else {
/* We don't know the real error. */
return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
return vk_device_set_lost(&device->vk, "drm_syncobj_wait failed: %m");
}
} else {
return VK_SUCCESS;
@ -1682,7 +1681,7 @@ VkResult anv_GetFenceStatus(
return VK_NOT_READY;
} else {
/* We don't know the real error. */
return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
return vk_device_set_lost(&device->vk, "drm_syncobj_wait failed: %m");
}
} else {
return VK_SUCCESS;
@ -1737,7 +1736,7 @@ anv_wait_for_syncobj_fences(struct anv_device *device,
return VK_TIMEOUT;
} else {
/* We don't know the real error. */
return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
return vk_device_set_lost(&device->vk, "drm_syncobj_wait failed: %m");
}
} else {
return VK_SUCCESS;
@ -1850,7 +1849,7 @@ anv_wait_for_bo_fences(struct anv_device *device,
}
done:
if (anv_device_is_lost(device))
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
return result;
@ -1953,7 +1952,7 @@ VkResult anv_WaitForFences(
if (device->info.no_hw)
return VK_SUCCESS;
if (anv_device_is_lost(device))
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
uint64_t abs_timeout = anv_get_absolute_timeout(timeout);
@ -2104,7 +2103,7 @@ wait_syncobj_materialize(struct anv_device *device,
anv_get_absolute_timeout(5ull * NSEC_PER_SEC),
true /* wait_all */,
true /* wait_materialize */))
return anv_device_set_lost(device, "anv_gem_syncobj_timeline_wait failed: %m");
return vk_device_set_lost(&device->vk, "anv_gem_syncobj_timeline_wait failed: %m");
return VK_SUCCESS;
}
@ -2555,7 +2554,7 @@ VkResult anv_GetSemaphoreCounterValue(
int ret = anv_gem_syncobj_timeline_query(device, &impl->syncobj, pValue, 1);
if (ret != 0)
return anv_device_set_lost(device, "unable to query timeline syncobj");
return vk_device_set_lost(&device->vk, "unable to query timeline syncobj");
return VK_SUCCESS;
}
@ -2728,7 +2727,7 @@ VkResult anv_WaitSemaphores(
false);
if (ret != 0)
result = errno == ETIME ? VK_TIMEOUT :
anv_device_set_lost(device, "unable to wait on timeline syncobj");
vk_device_set_lost(&device->vk, "unable to wait on timeline syncobj");
} else {
result =
anv_timelines_wait(device, timelines, values, handle_count,
@ -2782,7 +2781,7 @@ VkResult anv_SignalSemaphore(
&pSignalInfo->value, 1);
return ret == 0 ? VK_SUCCESS :
anv_device_set_lost(device, "unable to signal timeline syncobj");
vk_device_set_lost(&device->vk, "unable to signal timeline syncobj");
}
default:

View File

@ -425,7 +425,7 @@ wait_for_available(struct anv_device *device,
return status;
}
return anv_device_set_lost(device, "query timeout");
return vk_device_set_lost(&device->vk, "query timeout");
}
VkResult genX(GetQueryPoolResults)(
@ -448,7 +448,7 @@ VkResult genX(GetQueryPoolResults)(
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
if (anv_device_is_lost(device))
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
if (pData == NULL)