turnip: Use the DRM or KGSL GPU reset status ioctls to report device loss.

ANGLE-on-venus-on-turnip and zink-on-turnip need turnip to report real device-loss status for EGL's reset tests. Reporting it required moving the remaining GPU-reset-causing tests from flakes or xfails to skips. Otherwise, once one of them resets the GPU and the device is reported lost, the rest of the caselist associated with it ends up being marked as fails as well. The alternative would be to put these tests in their own test groups with tests_per_group = 1, but that didn't seem worth the effort. Or, we could finally do something with https://gitlab.freedesktop.org/anholt/deqp-runner/-/issues/14. Fixes: #5955 Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14839>
2022-02-02 12:59:54 -08:00 · 2022-02-02 12:59:54 -08:00 · 2f25d16653
parent add2121969
commit 2f25d16653
10 changed files with 79 additions and 24 deletions
--- a/src/freedreno/ci/freedreno-a618-fails.txt
+++ b/src/freedreno/ci/freedreno-a618-fails.txt
@ -1,12 +1,6 @@
 # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505
 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail

-# CTS 1.3.1.0 uprev:
-dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap,Fail
-dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap,Fail
-
-spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail
-
 # Fails when TU_DEBUG=forcebin is set
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail
--- a/src/freedreno/ci/freedreno-a618-flakes.txt
+++ b/src/freedreno/ci/freedreno-a618-flakes.txt
@ -4,10 +4,3 @@

 dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_2.alpha_opaque
 dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_4.alpha_opaque
-
-# Could trip hangcheck timeout
-dEQP-VK.api.command_buffers.record_many_draws_primary_2
-dEQP-VK.api.command_buffers.record_many_draws_secondary_2
-
-# Sometimes hangchecks
-spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code
--- a/src/freedreno/ci/freedreno-a618-skips.txt
+++ b/src/freedreno/ci/freedreno-a618-skips.txt
@ -25,11 +25,22 @@ dEQP-VK.ubo.random.all_shared_buffer.48
 # Still running after 3 hours, time is spent in batch_draw_tracking().
 KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs

-# causes a hangcheck timeout on a630:
+# causes a hangcheck timeout on a618:
 # msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A618: hangcheck detected gpu lockup rb 0!
+#
+# even if they sometimes pass and could be categorized as flakes, we skip them
+# because device loss will end up failing the rest of the caselist.
+dEQP-VK.api.command_buffers.record_many_draws_primary_2
+dEQP-VK.api.command_buffers.record_many_draws_secondary_2
 dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
+spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies
 spill-dEQP-VK.graphicsfuzz.cov-nested-loop-undefined-smoothstep-never-executed
+spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code
 spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-memory-accesses

+# Hangs the GPU, fixed to be a skip in VK-GL-CTS 736eec57dc0c ("Fix checkSupport in compressed texture sampling tests")
+dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap
+dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap
+
 # Crashes in RA, but slow enough to get there that CI times out sometimes
 dEQP-VK.spirv_assembly.instruction.*.spirv_ids_abuse.lots_ids.*
--- a/src/freedreno/ci/freedreno-a630-fails.txt
+++ b/src/freedreno/ci/freedreno-a630-fails.txt
@ -34,9 +34,6 @@ bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.softlight,Fail
 # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505
 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail

-# Showed up with VK-GL-CTS 1.3.1.0:
-spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail
-
 # Fails when TU_DEBUG=forcebin is set
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail
--- a/src/freedreno/ci/freedreno-a630-flakes.txt
+++ b/src/freedreno/ci/freedreno-a630-flakes.txt
@ -91,13 +91,6 @@ dEQP-GLES31.functional.layout_binding.ssbo.fragment_binding_array
 dEQP-GLES3.functional.fbo.blit.conversion.rg8i_to_r16i
 dEQP-GLES3.functional.fbo.blit.conversion.rg8_to_r16f

-# Could trip hangcheck timeout
-dEQP-VK.api.command_buffers.record_many_draws_primary_2
-dEQP-VK.api.command_buffers.record_many_draws_secondary_2
-
-# Looks likely to be a hangcheck trigger.
-spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components
-
 # First noticed Jun 1 2020 on an innocent branch.
 KHR-GL33.packed_depth_stencil.verify_copy_tex_image.depth32f_stencil8

--- a/src/freedreno/ci/freedreno-a630-skips.txt
+++ b/src/freedreno/ci/freedreno-a630-skips.txt
@ -18,6 +18,15 @@ dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_equal_spacing
 dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_even_spacing
 dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_odd_spacing

+# Can cause a hangcheck.
+#
+# even if they sometimes pass and could be categorized as flakes, we skip them
+# because device loss will end up failing the rest of the caselist.
+dEQP-VK.api.command_buffers.record_many_draws_primary_2
+dEQP-VK.api.command_buffers.record_many_draws_secondary_2
+spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies
+spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components
+
 # timeout, spending all its time in nir_compare_deref_paths()
 # https://gitlab.freedesktop.org/mesa/mesa/-/issues/5152
 dEQP-VK.ubo.random.all_shared_buffer.48
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@ -1699,6 +1699,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   device->instance = physical_device->instance;
   device->physical_device = physical_device;
   device->fd = physical_device->local_fd;
+   device->vk.check_status = tu_device_check_status;

   mtx_init(&device->bo_mutex, mtx_plain);
   u_rwlock_init(&device->dma_bo_lock);
--- a/src/freedreno/vulkan/tu_drm.c
+++ b/src/freedreno/vulkan/tu_drm.c
@ -137,6 +137,23 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
   return ret;
 }

+VkResult
+tu_device_check_status(struct vk_device *vk_device)
+{
+   struct tu_device *device = container_of(vk_device, struct tu_device, vk);
+   struct tu_physical_device *physical_device = device->physical_device;
+   /* DRM backend: keep the previously stored count, then overwrite it with a fresh MSM_PARAM_FAULTS read. */
+   uint64_t last_fault_count = physical_device->fault_count;
+   int ret = tu_drm_get_param(physical_device, MSM_PARAM_FAULTS, &physical_device->fault_count);
+   if (ret != 0)
+      return vk_device_set_lost(&device->vk, "error getting GPU fault count: %d", ret);
+   /* The query succeeded; a count that differs from the previous one marks the device lost. */
+   if (last_fault_count != physical_device->fault_count)
+      return vk_device_set_lost(&device->vk, "GPU faulted or hung");
+   /* NOTE(review): fault_count lives in the physical device and is written above with no lock taken in this function; confirm callers cannot race. */
+   return VK_SUCCESS;
+}
+
 int
 tu_drm_submitqueue_new(const struct tu_device *dev,
                       int priority,
@ -729,6 +746,13 @@ tu_drm_device_init(struct tu_physical_device *device,
      goto fail;
   }

+   int ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count);
+   if (ret != 0) {
+      result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+                                 "Failed to get initial fault count: %d", ret);
+      goto fail;
+   }
+
   device->syncobj_type = vk_drm_syncobj_get_type(fd);
   device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);

--- a/src/freedreno/vulkan/tu_kgsl.c
+++ b/src/freedreno/vulkan/tu_kgsl.c
@ -706,6 +706,33 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
   return 0;
 }

+VkResult
+tu_device_check_status(struct vk_device *vk_device)
+{
+   struct tu_device *device = container_of(vk_device, struct tu_device, vk);
+   /* KGSL backend: query the reset status of every queue in every queue family; the first bad result ends the scan. */
+   for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
+      for (unsigned q = 0; q < device->queue_count[i]; q++) {
+         /* KGSL's KGSL_PROP_GPU_RESET_STAT takes the u32 msm_queue_id and returns a
+          * KGSL_CTX_STAT_* for the worst reset that happened since the last time it
+          * was queried on that queue, so "value" is the id going in and the status coming out.
+          */
+         uint32_t value = device->queues[i][q].msm_queue_id;
+         VkResult status = get_kgsl_prop(device->fd, KGSL_PROP_GPU_RESET_STAT,
+                                       &value, sizeof(value));
+         if (status != VK_SUCCESS)
+            return vk_device_set_lost(&device->vk, "Failed to get GPU reset status");
+         /* Only NO_ERROR and INNOCENT_CONTEXT_RESET_EXT are tolerated; any other status marks the device lost. */
+         if (value != KGSL_CTX_STAT_NO_ERROR &&
+            value != KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT) {
+            return vk_device_set_lost(&device->vk, "GPU faulted or hung");
+         }
+      }
+   }
+   /* Every queue was queried successfully and none reported a non-tolerated status. */
+   return VK_SUCCESS;
+}
+
 #ifdef ANDROID
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_QueueSignalReleaseImageANDROID(VkQueue _queue,
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@ -236,6 +236,9 @@ struct tu_physical_device
   int msm_major_version;
   int msm_minor_version;

+   /* Address space and global fault count for this local_fd with DRM backend */
+   uint64_t fault_count;
+
   /* This is the drivers on-disk cache used as a fallback as opposed to
    * the pipeline cache defined by apps.
    */
@ -538,6 +541,9 @@ tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj
 uint64_t
 tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

+VkResult
+tu_device_check_status(struct vk_device *vk_device);
+
 enum tu_bo_alloc_flags
 {
   TU_BO_ALLOC_NO_FLAGS = 0,