radv: dump VA ranges history when a GPU hang is detected

This is enabled only with RADV_DEBUG=hang. This adds a small overhead, since every BO creation and destruction is then recorded in a log.

Gitlab: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3904
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7891>
This commit is contained in:
Samuel Pitoiset 2020-12-11 15:43:51 +01:00
parent 15e7e6443c
commit 6ed4332591
5 changed files with 83 additions and 0 deletions

View File

@ -734,6 +734,14 @@ radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_cmdbuf *cs)
fclose(f);
}
/* Dump BO log. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "bo_history.log");
f = fopen(dump_path, "w+");
if (f) {
device->ws->dump_bo_log(device->ws, f);
fclose(f);
}
/* Dump VM fault info. */
if (vm_fault_occurred) {
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "vm_fault.log");

View File

@ -301,6 +301,8 @@ struct radeon_winsys {
void (*dump_bo_ranges)(struct radeon_winsys *ws, FILE *file);
void (*dump_bo_log)(struct radeon_winsys *ws, FILE *file);
int (*surface_init)(struct radeon_winsys *ws,
const struct ac_surf_info *surf_info,
struct radeon_surf *surf);

View File

@ -39,6 +39,7 @@
#include "util/u_atomic.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/os_time.h"
static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo);
@ -282,6 +283,39 @@ radv_amdgpu_winsys_bo_virtual_bind(struct radeon_winsys_bo *_parent,
return VK_SUCCESS;
}
/*
 * One record of the winsys BO history log.
 *
 * A record is allocated and filled by radv_amdgpu_log_bo() each time a BO is
 * created/imported or destroyed while ws->debug_log_bos is set, and is
 * appended to ws->log_bo_list. radv_amdgpu_dump_bo_log() prints one line per
 * record.
 */
struct radv_amdgpu_winsys_bo_log {
/* Link in ws->log_bo_list; records are appended at the tail. */
struct list_head list;
/* Copy of bo->base.va taken when the record was made. */
uint64_t va;
/* Copy of bo->size; the dump prints the range va .. va + size. */
uint64_t size;
/* CPU timestamp: value returned by os_time_get_nano() when recorded. */
uint64_t timestamp; /* CPU timestamp */
/* Copy of bo->is_virtual. */
uint8_t is_virtual : 1;
/* 1 when recorded from the BO destroy path, 0 when recorded on creation. */
uint8_t destroyed : 1;
};
/*
 * Record a creation or destruction event for a BO in the winsys history log.
 *
 * Does nothing unless the owning winsys has debug_log_bos set. Otherwise a
 * new radv_amdgpu_winsys_bo_log entry is heap-allocated, filled from the BO
 * (VA, size, virtual flag) together with the current os_time_get_nano()
 * value, and appended to ws->log_bo_list under the list's write lock.
 *
 * Logging is best effort: if the allocation fails the event is silently
 * dropped.
 *
 * bo:        buffer object being created or destroyed.
 * destroyed: true when called from the destroy path, false on creation.
 */
static void radv_amdgpu_log_bo(struct radv_amdgpu_winsys_bo *bo,
                               bool destroyed)
{
   struct radv_amdgpu_winsys *winsys = bo->ws;

   if (!winsys->debug_log_bos)
      return;

   struct radv_amdgpu_winsys_bo_log *entry = malloc(sizeof(*entry));
   if (entry == NULL)
      return;

   /* Snapshot the BO state before taking the lock. */
   entry->destroyed = destroyed;
   entry->is_virtual = bo->is_virtual;
   entry->size = bo->size;
   entry->va = bo->base.va;
   entry->timestamp = os_time_get_nano();

   u_rwlock_wrlock(&winsys->log_bo_list_lock);
   list_addtail(&entry->list, &winsys->log_bo_list);
   u_rwlock_wrunlock(&winsys->log_bo_list_lock);
}
static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo)
{
struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
@ -289,6 +323,9 @@ static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo)
if (p_atomic_dec_return(&bo->ref_count))
return;
radv_amdgpu_log_bo(bo, true);
if (bo->is_virtual) {
for (uint32_t i = 0; i < bo->range_count; ++i) {
radv_amdgpu_winsys_virtual_unmap(bo, bo->ranges + i);
@ -391,6 +428,8 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws,
bo->ranges[0].bo_offset = 0;
radv_amdgpu_winsys_virtual_map(bo, bo->ranges);
radv_amdgpu_log_bo(bo, false);
return (struct radeon_winsys_bo *)bo;
}
@ -485,6 +524,8 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws,
align64(bo->size, ws->info.gart_page_size));
radv_amdgpu_add_buffer_to_global_list(bo);
radv_amdgpu_log_bo(bo, false);
return (struct radeon_winsys_bo *)bo;
error_va_map:
amdgpu_bo_free(buf_handle);
@ -592,6 +633,8 @@ radv_amdgpu_winsys_bo_from_ptr(struct radeon_winsys *_ws,
align64(bo->size, ws->info.gart_page_size));
radv_amdgpu_add_buffer_to_global_list(bo);
radv_amdgpu_log_bo(bo, false);
return (struct radeon_winsys_bo *)bo;
error_va_map:
@ -672,6 +715,8 @@ radv_amdgpu_winsys_bo_from_fd(struct radeon_winsys *_ws,
align64(bo->size, ws->info.gart_page_size));
radv_amdgpu_add_buffer_to_global_list(bo);
radv_amdgpu_log_bo(bo, false);
return (struct radeon_winsys_bo *)bo;
error_va_map:
amdgpu_va_range_free(va_handle);
@ -862,6 +907,24 @@ static int radv_amdgpu_bo_va_compare(const void *a, const void *b)
return bo_a->base.va < bo_b->base.va ? -1 : bo_a->base.va > bo_b->base.va ? 1 : 0;
}
/*
 * Write the BO history log to a file, one line per recorded event.
 *
 * Does nothing unless ws->debug_log_bos is set. The list is walked in
 * insertion order under the read lock of log_bo_list_lock. Each line has the
 * form:
 *
 *   timestamp=<ns>, VA=<start>-<end>, destroyed=<0|1>, is_virtual=<0|1>
 *
 * where <end> is va + size and both addresses are printed as 16 hex digits.
 *
 * _ws:  winsys whose log is dumped.
 * file: open stream to write to; the caller keeps ownership of it.
 */
static void radv_amdgpu_dump_bo_log(struct radeon_winsys *_ws, FILE *file)
{
   struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
   struct radv_amdgpu_winsys_bo_log *bo_log;

   if (!ws->debug_log_bos)
      return;

   u_rwlock_rdlock(&ws->log_bo_list_lock);
   LIST_FOR_EACH_ENTRY(bo_log, &ws->log_bo_list, list) {
      /* %llu and %llx consume unsigned long long, so the uint64_t fields are
       * cast to exactly that type (a signed cast mismatches the format and
       * cannot represent addresses with the top bit set).
       */
      fprintf(file, "timestamp=%llu, VA=%.16llx-%.16llx, destroyed=%d, is_virtual=%d\n",
              (unsigned long long)bo_log->timestamp,
              (unsigned long long)bo_log->va,
              (unsigned long long)(bo_log->va + bo_log->size),
              bo_log->destroyed, bo_log->is_virtual);
   }
   u_rwlock_rdunlock(&ws->log_bo_list_lock);
}
static void radv_amdgpu_dump_bo_ranges(struct radeon_winsys *_ws, FILE *file)
{
struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
@ -905,4 +968,5 @@ void radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws)
ws->base.buffer_virtual_bind = radv_amdgpu_winsys_bo_virtual_bind;
ws->base.buffer_get_flags_from_fd = radv_amdgpu_bo_get_flags_from_fd;
ws->base.dump_bo_ranges = radv_amdgpu_dump_bo_ranges;
ws->base.dump_bo_log = radv_amdgpu_dump_bo_log;
}

View File

@ -168,6 +168,7 @@ static void radv_amdgpu_winsys_destroy(struct radeon_winsys *rws)
pthread_mutex_destroy(&ws->syncobj_lock);
u_rwlock_destroy(&ws->global_bo_list_lock);
u_rwlock_destroy(&ws->log_bo_list_lock);
ac_addrlib_destroy(ws->addrlib);
amdgpu_device_deinitialize(ws->dev);
FREE(rws);
@ -195,6 +196,7 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags)
goto winsys_fail;
ws->debug_all_bos = !!(debug_flags & RADV_DEBUG_ALL_BOS);
ws->debug_log_bos = debug_flags & RADV_DEBUG_HANG;
if (debug_flags & RADV_DEBUG_NO_IBS)
ws->use_ib_bos = false;
@ -203,6 +205,8 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags)
ws->use_llvm = debug_flags & RADV_DEBUG_LLVM;
list_inithead(&ws->global_bo_list);
u_rwlock_init(&ws->global_bo_list_lock);
list_inithead(&ws->log_bo_list);
u_rwlock_init(&ws->log_bo_list_lock);
pthread_mutex_init(&ws->syncobj_lock, NULL);
ws->base.query_info = radv_amdgpu_winsys_query_info;
ws->base.query_value = radv_amdgpu_winsys_query_value;

View File

@ -44,6 +44,7 @@ struct radv_amdgpu_winsys {
struct ac_addrlib *addrlib;
bool debug_all_bos;
bool debug_log_bos;
bool use_ib_bos;
bool zero_all_vram_allocs;
bool use_local_bos;
@ -61,6 +62,10 @@ struct radv_amdgpu_winsys {
pthread_mutex_t syncobj_lock;
uint32_t *syncobj;
uint32_t syncobj_count, syncobj_capacity;
/* BO log */
struct u_rwlock log_bo_list_lock;
struct list_head log_bo_list;
};
static inline struct radv_amdgpu_winsys *