turnip: Use the DRM or KGSL GPU reset status ioctls to report device loss.

ANGLE-on-venus-on-turnip and zink-on-turnip want real data here for EGL's
reset tests.

This required moving the remaining GPU-reset-causing tests from flakes or
xfails to skips.  Otherwise, the rest of the caselist associated with them
ends up being marked as fails as well.  The alternative would be to put
these tests in their own test groups with tests_per_group = 1, but that
didn't seem worth the effort.  Or, we could finally do something with
https://gitlab.freedesktop.org/anholt/deqp-runner/-/issues/14.

Fixes: #5955
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14839>
This commit is contained in:
Emma Anholt 2022-02-02 12:59:54 -08:00 committed by Marge Bot
parent add2121969
commit 2f25d16653
10 changed files with 79 additions and 24 deletions

View File

@ -1,12 +1,6 @@
# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505
dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail
# CTS 1.3.1.0 uprev:
dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap,Fail
dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap,Fail
spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail
# Fails when TU_DEBUG=forcebin is set
gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail
gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail

View File

@ -4,10 +4,3 @@
dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_2.alpha_opaque
dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_4.alpha_opaque
# Could trip hangcheck timeout
dEQP-VK.api.command_buffers.record_many_draws_primary_2
dEQP-VK.api.command_buffers.record_many_draws_secondary_2
# Sometimes hangchecks
spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code

View File

@ -25,11 +25,22 @@ dEQP-VK.ubo.random.all_shared_buffer.48
# Still running after 3 hours, time is spent in batch_draw_tracking().
KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs
# causes a hangcheck timeout on a630:
# causes a hangcheck timeout on a618:
# msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A618: hangcheck detected gpu lockup rb 0!
#
# even if they sometimes pass and could be categorized as flakes, we skip them
# because device loss will end up failing the rest of the caselist.
dEQP-VK.api.command_buffers.record_many_draws_primary_2
dEQP-VK.api.command_buffers.record_many_draws_secondary_2
dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies
spill-dEQP-VK.graphicsfuzz.cov-nested-loop-undefined-smoothstep-never-executed
spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code
spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-memory-accesses
# Hangs the GPU, fixed to be a skip in VK-GL-CTS 736eec57dc0c ("Fix checkSupport in compressed texture sampling tests")
dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap
dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap
# Crashes in RA, but slow enough to get there that CI times out sometimes
dEQP-VK.spirv_assembly.instruction.*.spirv_ids_abuse.lots_ids.*

View File

@ -34,9 +34,6 @@ bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.softlight,Fail
# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505
dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail
# Showed up with VK-GL-CTS 1.3.1.0:
spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail
# Fails when TU_DEBUG=forcebin is set
gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail
gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail

View File

@ -91,13 +91,6 @@ dEQP-GLES31.functional.layout_binding.ssbo.fragment_binding_array
dEQP-GLES3.functional.fbo.blit.conversion.rg8i_to_r16i
dEQP-GLES3.functional.fbo.blit.conversion.rg8_to_r16f
# Could trip hangcheck timeout
dEQP-VK.api.command_buffers.record_many_draws_primary_2
dEQP-VK.api.command_buffers.record_many_draws_secondary_2
# Looks likely to be a hangcheck trigger.
spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components
# First noticed Jun 1 2020 on an innocent branch.
KHR-GL33.packed_depth_stencil.verify_copy_tex_image.depth32f_stencil8

View File

@ -18,6 +18,15 @@ dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_equal_spacing
dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_even_spacing
dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_odd_spacing
# Can cause a hangcheck.
#
# even if they sometimes pass and could be categorized as flakes, we skip them
# because device loss will end up failing the rest of the caselist.
dEQP-VK.api.command_buffers.record_many_draws_primary_2
dEQP-VK.api.command_buffers.record_many_draws_secondary_2
spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies
spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components
# timeout, spending all its time in nir_compare_deref_paths()
# https://gitlab.freedesktop.org/mesa/mesa/-/issues/5152
dEQP-VK.ubo.random.all_shared_buffer.48

View File

@ -1699,6 +1699,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->instance = physical_device->instance;
device->physical_device = physical_device;
device->fd = physical_device->local_fd;
device->vk.check_status = tu_device_check_status;
mtx_init(&device->bo_mutex, mtx_plain);
u_rwlock_init(&device->dma_bo_lock);

View File

@ -137,6 +137,23 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
return ret;
}
/**
 * Device-status check built on the MSM_PARAM_FAULTS query.
 *
 * Snapshots the fault count cached on the physical device, re-queries
 * MSM_PARAM_FAULTS straight into that cached field, and compares the two.
 *
 * Returns VK_SUCCESS when the query succeeds and the count is unchanged.
 * If the query fails, or the count differs from the snapshot, the device is
 * marked lost and the result of vk_device_set_lost() is returned.
 */
VkResult
tu_device_check_status(struct vk_device *vk_device)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
   struct tu_physical_device *pdev = dev->physical_device;

   /* Take the snapshot first: the query below overwrites the cached value. */
   uint64_t prev_faults = pdev->fault_count;

   int err = tu_drm_get_param(pdev, MSM_PARAM_FAULTS, &pdev->fault_count);
   if (err != 0) {
      return vk_device_set_lost(&dev->vk,
                                "error getting GPU fault count: %d", err);
   }

   /* Unchanged counter: nothing faulted since the previous check. */
   if (pdev->fault_count == prev_faults) {
      return VK_SUCCESS;
   }

   return vk_device_set_lost(&dev->vk, "GPU faulted or hung");
}
int
tu_drm_submitqueue_new(const struct tu_device *dev,
int priority,
@ -729,6 +746,13 @@ tu_drm_device_init(struct tu_physical_device *device,
goto fail;
}
int ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count);
if (ret != 0) {
result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
"Failed to get initial fault count: %d", ret);
goto fail;
}
device->syncobj_type = vk_drm_syncobj_get_type(fd);
device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);

View File

@ -706,6 +706,33 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
return 0;
}
/**
 * Device-status check built on KGSL's per-queue reset status property.
 *
 * Walks every queue of every queue family and queries
 * KGSL_PROP_GPU_RESET_STAT for it. The device is marked lost (and the result
 * of vk_device_set_lost() returned) as soon as a query fails or a queue
 * reports anything other than KGSL_CTX_STAT_NO_ERROR or
 * KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT. Returns VK_SUCCESS if every
 * queue passes.
 */
VkResult
tu_device_check_status(struct vk_device *vk_device)
{
   struct tu_device *dev = container_of(vk_device, struct tu_device, vk);

   for (unsigned family = 0; family < TU_MAX_QUEUE_FAMILIES; family++) {
      for (unsigned idx = 0; idx < dev->queue_count[family]; idx++) {
         /* KGSL_PROP_GPU_RESET_STAT is an in/out property: the u32
          * msm_queue_id goes in, and a KGSL_CTX_STAT_* value comes back
          * describing the worst reset seen on that queue since it was last
          * queried.
          */
         uint32_t reset_stat = dev->queues[family][idx].msm_queue_id;
         VkResult res = get_kgsl_prop(dev->fd, KGSL_PROP_GPU_RESET_STAT,
                                      &reset_stat, sizeof(reset_stat));
         if (res != VK_SUCCESS) {
            return vk_device_set_lost(&dev->vk,
                                      "Failed to get GPU reset status");
         }

         /* These two statuses are not treated as device loss. */
         if (reset_stat == KGSL_CTX_STAT_NO_ERROR ||
             reset_stat == KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT) {
            continue;
         }

         return vk_device_set_lost(&dev->vk, "GPU faulted or hung");
      }
   }

   return VK_SUCCESS;
}
#ifdef ANDROID
VKAPI_ATTR VkResult VKAPI_CALL
tu_QueueSignalReleaseImageANDROID(VkQueue _queue,

View File

@ -236,6 +236,9 @@ struct tu_physical_device
int msm_major_version;
int msm_minor_version;
/* Address space and global fault count for this local_fd with DRM backend */
uint64_t fault_count;
/* This is the driver's on-disk cache used as a fallback as opposed to
* the pipeline cache defined by apps.
*/
@ -538,6 +541,9 @@ tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
/* Device-status check that tu_CreateDevice installs as
 * device->vk.check_status. The DRM and KGSL backends each provide their own
 * definition. It returns VK_SUCCESS when the kernel status query succeeds and
 * reports no GPU fault/reset; otherwise it marks the device lost and returns
 * the result of vk_device_set_lost().
 */
VkResult
tu_device_check_status(struct vk_device *vk_device);
enum tu_bo_alloc_flags
{
TU_BO_ALLOC_NO_FLAGS = 0,