radv: Add the fuchsia radix sort

Signed-off-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15648>
Konstantin Seurer 2022-04-07 15:39:52 +02:00
parent 9c020b525b
commit 5d9ef0efb5
33 changed files with 6658 additions and 2 deletions

meson.build

@@ -680,7 +680,7 @@ if with_gallium_d3d12 or with_microsoft_clc or with_microsoft_vk
endif
endif
-if with_vulkan_overlay_layer or with_aco_tests
+if with_vulkan_overlay_layer or with_aco_tests or with_amd_vk
prog_glslang = find_program('glslangValidator')
endif

src/amd/vulkan/meson.build

@@ -120,6 +120,9 @@ if with_llvm
)
endif
subdir('radix_sort')
libradv_files += radix_sort_files
radv_deps = []
radv_flags = cc.get_supported_arguments(['-Wimplicit-fallthrough', '-Wshadow'])
@@ -151,7 +154,7 @@ endif
libvulkan_radeon = shared_library(
'vulkan_radeon',
-[libradv_files, radv_entrypoints, sha1_h],
+[libradv_files, radv_entrypoints, sha1_h, radix_sort_spv],
vs_module_defs : vulkan_api_def,
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_amd, inc_amd_common, inc_amd_common_llvm, inc_compiler, inc_util,

src/amd/vulkan/radix_sort/LICENSE

@@ -0,0 +1,24 @@
Copyright 2019 The Fuchsia Authors.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

src/amd/vulkan/radix_sort/common/macros.h

@@ -0,0 +1,112 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_
//
//
//
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
//
// clang-format off
//
#define ARRAY_LENGTH_MACRO(x_) (sizeof(x_)/sizeof(x_[0]))
#define OFFSETOF_MACRO(t_,m_) offsetof(t_,m_)
#define MEMBER_SIZE_MACRO(t_,m_) sizeof(((t_*)0)->m_)
//
// FIXME(allanmac):
//
// Consider providing typed min/max() functions:
//
// <type> [min|max]_<type>(a,b) { ; }
//
// But note we still need preprocessor-time min/max().
//
#define MAX_MACRO(t_,a_,b_) (((a_) > (b_)) ? (a_) : (b_))
#define MIN_MACRO(t_,a_,b_) (((a_) < (b_)) ? (a_) : (b_))
//
//
//
#define BITS_TO_MASK_MACRO(n_) (((uint32_t)1<<(n_))-1)
#define BITS_TO_MASK_64_MACRO(n_) (((uint64_t)1<<(n_))-1)
#define BITS_TO_MASK_AT_MACRO(n_,b_) (BITS_TO_MASK_MACRO(n_) <<(b_))
#define BITS_TO_MASK_AT_64_MACRO(n_,b_) (BITS_TO_MASK_64_MACRO(n_)<<(b_))
//
//
//
#define STRINGIFY_MACRO_2(a_) #a_
#define STRINGIFY_MACRO(a_) STRINGIFY_MACRO_2(a_)
//
//
//
#define CONCAT_MACRO_2(a_,b_) a_ ## b_
#define CONCAT_MACRO(a_,b_) CONCAT_MACRO_2(a_,b_)
//
// Round up/down
//
#define ROUND_DOWN_MACRO(v_,q_) (((v_) / (q_)) * (q_))
#define ROUND_UP_MACRO(v_,q_) ((((v_) + (q_) - 1) / (q_)) * (q_))
//
// Round up/down when q is a power-of-two.
//
#define ROUND_DOWN_POW2_MACRO(v_,q_) ((v_) & ~((q_) - 1))
#define ROUND_UP_POW2_MACRO(v_,q_) ROUND_DOWN_POW2_MACRO((v_) + (q_) - 1, q_)
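//
// Illustrative worked example (not in the original header):
// ROUND_UP_MACRO(10, 4) == ((10 + 3) / 4) * 4 == 12, and for a
// power-of-two quantum ROUND_UP_POW2_MACRO(10, 4) == ((10 + 3) & ~3)
// == 12 as well -- the pow2 variants just trade the divide for a mask.
//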
//
//
//
#if defined (_MSC_VER) && !defined (__clang__)
#define STATIC_ASSERT_MACRO(c_,m_) static_assert(c_,m_)
#else
#define STATIC_ASSERT_MACRO(c_,m_) _Static_assert(c_,m_)
#endif
#define STATIC_ASSERT_MACRO_1(c_) STATIC_ASSERT_MACRO(c_,#c_)
//
//
//
#if defined (_MSC_VER) && !defined (__clang__)
#define POPCOUNT_MACRO(...) __popcnt(__VA_ARGS__)
#else
#define POPCOUNT_MACRO(...) __builtin_popcount(__VA_ARGS__)
#endif
//
//
//
#if defined (_MSC_VER) && !defined (__clang__)
#define ALIGN_MACRO(bytes_) __declspec(align(bytes_)) // only accepts integer as arg
#else
#include <stdalign.h>
#define ALIGN_MACRO(bytes_) alignas(bytes_)
#endif
//
// clang-format on
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_

src/amd/vulkan/radix_sort/common/util.c

@@ -0,0 +1,90 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "util.h"
#include <assert.h>
//
//
//
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#endif
//
//
//
bool
is_pow2_u32(uint32_t n)
{
return n && !(n & (n - 1));
}
//
//
//
uint32_t
pow2_ru_u32(uint32_t n)
{
assert(n <= 0x80000000U);
n--;
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n++;
return n;
}
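//
// Illustrative: pow2_ru_u32(17) decrements to 16, the shifts smear the
// high bit down to give 31, and the increment yields 32. For n == 0 the
// decrement wraps to UINT32_MAX, the smear is a no-op, and the
// increment wraps back to 0.
//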
//
//
//
uint32_t
pow2_rd_u32(uint32_t n)
{
assert(n > 0);
return 1u << msb_idx_u32(n);
}
//
// ASSUMES NON-ZERO
//
uint32_t
msb_idx_u32(uint32_t n)
{
assert(n > 0);
#if defined(_MSC_VER) && !defined(__clang__)
uint32_t index;
_BitScanReverse((unsigned long *)&index, n);
return index;
#elif defined(__GNUC__)
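// For nonzero n, __builtin_clz(n) is in [0, 31], so (clz(n) ^ 31)
// equals (31 - clz(n)): the 0-based index of the most significant set
// bit.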
return __builtin_clz(n) ^ 31;
#else
#error "No msb_index()"
#endif
}
//
//
//

src/amd/vulkan/radix_sort/common/util.h

@@ -0,0 +1,59 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_
//
//
//
#include <stdbool.h>
#include <stdint.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
// Return true iff |n| is a power of 2.
bool
is_pow2_u32(uint32_t n);
// Return |n| rounded-up to the nearest power of 2.
// If |n| is zero then return 0.
// REQUIRES: |n <= 0x80000000|.
uint32_t
pow2_ru_u32(uint32_t n);
// Return |n| rounded-down to the nearest power of 2.
// REQUIRES: |n > 0|.
uint32_t
pow2_rd_u32(uint32_t n);
// Return the most-significant bit position for |n|.
// REQUIRES: |n > 0|.
uint32_t
msb_idx_u32(uint32_t n); // 0-based bit position
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_

src/amd/vulkan/radix_sort/common/vk/assert.c

@@ -0,0 +1,108 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include <stdio.h>
#include <stdlib.h>
//
//
//
#include "assert.h"
//
//
//
#define VK_RESULT_TO_STRING(result) \
case result: \
return #result
//
// FIXME -- results and errors
//
char const *
vk_get_result_string(VkResult const result)
{
switch (result)
{
//
// Results
//
VK_RESULT_TO_STRING(VK_SUCCESS);
VK_RESULT_TO_STRING(VK_NOT_READY);
VK_RESULT_TO_STRING(VK_TIMEOUT);
VK_RESULT_TO_STRING(VK_EVENT_SET);
VK_RESULT_TO_STRING(VK_EVENT_RESET);
VK_RESULT_TO_STRING(VK_INCOMPLETE);
//
// Errors
//
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_HOST_MEMORY);
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_DEVICE_MEMORY);
VK_RESULT_TO_STRING(VK_ERROR_INITIALIZATION_FAILED);
VK_RESULT_TO_STRING(VK_ERROR_DEVICE_LOST);
VK_RESULT_TO_STRING(VK_ERROR_MEMORY_MAP_FAILED);
VK_RESULT_TO_STRING(VK_ERROR_LAYER_NOT_PRESENT);
VK_RESULT_TO_STRING(VK_ERROR_EXTENSION_NOT_PRESENT);
VK_RESULT_TO_STRING(VK_ERROR_FEATURE_NOT_PRESENT);
VK_RESULT_TO_STRING(VK_ERROR_INCOMPATIBLE_DRIVER);
VK_RESULT_TO_STRING(VK_ERROR_TOO_MANY_OBJECTS);
VK_RESULT_TO_STRING(VK_ERROR_FORMAT_NOT_SUPPORTED);
VK_RESULT_TO_STRING(VK_ERROR_FRAGMENTED_POOL);
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_POOL_MEMORY);
VK_RESULT_TO_STRING(VK_ERROR_INVALID_EXTERNAL_HANDLE);
VK_RESULT_TO_STRING(VK_ERROR_SURFACE_LOST_KHR);
VK_RESULT_TO_STRING(VK_ERROR_NATIVE_WINDOW_IN_USE_KHR);
VK_RESULT_TO_STRING(VK_SUBOPTIMAL_KHR);
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_DATE_KHR);
VK_RESULT_TO_STRING(VK_ERROR_INCOMPATIBLE_DISPLAY_KHR);
VK_RESULT_TO_STRING(VK_ERROR_VALIDATION_FAILED_EXT);
VK_RESULT_TO_STRING(VK_ERROR_INVALID_SHADER_NV);
VK_RESULT_TO_STRING(VK_ERROR_FRAGMENTATION_EXT);
VK_RESULT_TO_STRING(VK_ERROR_NOT_PERMITTED_EXT);
//
// Extensions: vk_xyz
//
default:
return "UNKNOWN VULKAN RESULT";
}
}
//
//
//
VkResult
vk_assert(VkResult const result, char const * const file, int const line, bool const is_abort)
{
if (result != VK_SUCCESS)
{
char const * const vk_result_str = vk_get_result_string(result);
fprintf(stderr,
"\"%s\", line %d: vk_assert( %d ) = \"%s\"\n",
file,
line,
result,
vk_result_str);
if (is_abort)
{
abort();
}
}
return result;
}
//
//
//

src/amd/vulkan/radix_sort/common/vk/assert.h

@@ -0,0 +1,52 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_
//
//
//
#include <stdbool.h>
#include <vulkan/vulkan.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
char const *
vk_get_result_string(VkResult const result);
VkResult
vk_assert(VkResult const result, char const * const file, int const line, bool const is_abort);
//
// clang-format off
//
#define vk(...) vk_assert((vk##__VA_ARGS__), __FILE__, __LINE__, true);
#define vk_ok(err) vk_assert(err, __FILE__, __LINE__, true);
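//
// Illustrative expansion: vk(CreateFence(device, &ci, NULL, &fence))
// token-pastes the "vk" prefix onto the call and becomes
// vk_assert(vkCreateFence(device, &ci, NULL, &fence), __FILE__,
// __LINE__, true).
//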
//
// clang-format on
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_

src/amd/vulkan/radix_sort/common/vk/barrier.c

@@ -0,0 +1,305 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include "barrier.h"
//
//
//
void
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | //
VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_HOST_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_HOST_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_HOST_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_HOST_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_memory_barrier(VkCommandBuffer cb,
VkPipelineStageFlags src_stage,
VkAccessFlags src_mask,
VkPipelineStageFlags dst_stage,
VkAccessFlags dst_mask)
{
VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = src_mask,
.dstAccessMask = dst_mask };
vkCmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL);
}
//
//
//
void
vk_barrier_debug(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
VK_ACCESS_INDEX_READ_BIT | //
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | //
VK_ACCESS_UNIFORM_READ_BIT | //
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | //
VK_ACCESS_SHADER_READ_BIT | //
VK_ACCESS_SHADER_WRITE_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_TRANSFER_READ_BIT | //
VK_ACCESS_TRANSFER_WRITE_BIT | //
VK_ACCESS_HOST_READ_BIT | //
VK_ACCESS_HOST_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
VK_ACCESS_INDEX_READ_BIT | //
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | //
VK_ACCESS_UNIFORM_READ_BIT | //
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | //
VK_ACCESS_SHADER_READ_BIT | //
VK_ACCESS_SHADER_WRITE_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_TRANSFER_READ_BIT | //
VK_ACCESS_TRANSFER_WRITE_BIT | //
VK_ACCESS_HOST_READ_BIT | //
VK_ACCESS_HOST_WRITE_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//

src/amd/vulkan/radix_sort/common/vk/barrier.h

@@ -0,0 +1,72 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_
//
//
//
#include <vulkan/vulkan_core.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
void
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb);
void
vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb);
void
vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb);
void
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb);
void
vk_memory_barrier(VkCommandBuffer cb,
VkPipelineStageFlags src_stage,
VkAccessFlags src_mask,
VkPipelineStageFlags dst_stage,
VkAccessFlags dst_mask);
void
vk_barrier_debug(VkCommandBuffer cb);
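//
// Typical usage (illustrative): record a compute dispatch that writes a
// buffer, call vk_barrier_compute_w_to_compute_r(cb), then record the
// dispatch that reads it. vk_barrier_debug() is a full-pipeline,
// all-access barrier intended only for diagnosing synchronization bugs.
//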
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_

src/amd/vulkan/radix_sort/meson.build

@@ -0,0 +1,40 @@
# Copyright © 2022 Konstantin Seurer
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
subdir('shaders')
radix_sort_files = files(
'common/vk/assert.c',
'common/vk/assert.h',
'common/vk/barrier.c',
'common/vk/barrier.h',
'common/macros.h',
'common/util.c',
'common/util.h',
'shaders/push.h',
'targets/u64/config.h',
'radix_sort_vk_devaddr.h',
'radix_sort_vk_ext.h',
'radix_sort_vk.c',
'radix_sort_vk.h',
'radv_radix_sort.c',
'radv_radix_sort.h',
'target.h'
)

src/amd/vulkan/radix_sort/radix_sort_vk.c

File diff suppressed because it is too large.

src/amd/vulkan/radix_sort/radix_sort_vk.h

@@ -0,0 +1,384 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_
//
//
//
#include <vulkan/vulkan_core.h>
//
//
//
#include <stdbool.h>
#include <stdint.h>
//
//
//
#include "target.h"
//
// Radix Sort Vk is a high-performance sorting library for Vulkan 1.2.
//
// The sorting function is both directly and indirectly dispatchable.
//
#ifdef __cplusplus
extern "C" {
#endif
//
// Get a Radix Sort target's Vulkan requirements.
//
// A Radix Sort target is a binary image containing configuration parameters and
// a bundle of SPIR-V modules.
//
// Targets are prebuilt and specific to a particular device vendor, architecture
// and key-val configuration.
//
// A Radix Sort instance can only be created with a VkDevice that is initialized
// with all of the target's required extensions and features.
//
// The `radix_sort_vk_target_get_requirements()` function yields the extensions
// and initialized feature flags required by a Radix Sort target.
//
// These requirements can be merged with other Vulkan library requirements
// before VkDevice creation.
//
// If the `.ext_names` member is NULL, the `.ext_name_count` member will be
// initialized.
//
// Returns `false` if:
//
// * The .ext_names field is NULL and the number of required extensions is
// greater than zero.
// * The .ext_name_count is less than the number of required
// extensions.
// * Any of the .pdf, .pdf11 or .pdf12 members are NULL.
//
// Otherwise, returns true.
//
typedef struct radix_sort_vk_target radix_sort_vk_target_t;
//
// NOTE: The library currently supports uint32_t and uint64_t keyvals.
//
#define RS_KV_DWORDS_MAX 2
//
//
//
struct rs_pipeline_layout_scatter
{
VkPipelineLayout even;
VkPipelineLayout odd;
};
struct rs_pipeline_scatter
{
VkPipeline even;
VkPipeline odd;
};
//
//
//
struct rs_pipeline_layouts_named
{
VkPipelineLayout init;
VkPipelineLayout fill;
VkPipelineLayout histogram;
VkPipelineLayout prefix;
struct rs_pipeline_layout_scatter scatter[RS_KV_DWORDS_MAX];
};
struct rs_pipelines_named
{
VkPipeline init;
VkPipeline fill;
VkPipeline histogram;
VkPipeline prefix;
struct rs_pipeline_scatter scatter[RS_KV_DWORDS_MAX];
};
// clang-format off
#define RS_PIPELINE_LAYOUTS_HANDLES (sizeof(struct rs_pipeline_layouts_named) / sizeof(VkPipelineLayout))
#define RS_PIPELINES_HANDLES (sizeof(struct rs_pipelines_named) / sizeof(VkPipeline))
// clang-format on
//
//
//
struct radix_sort_vk
{
struct radix_sort_vk_target_config config;
union
{
struct rs_pipeline_layouts_named named;
VkPipelineLayout handles[RS_PIPELINE_LAYOUTS_HANDLES];
} pipeline_layouts;
union
{
struct rs_pipelines_named named;
VkPipeline handles[RS_PIPELINES_HANDLES];
} pipelines;
struct
{
struct
{
VkDeviceSize offset;
VkDeviceSize range;
} histograms;
struct
{
VkDeviceSize offset;
} partitions;
} internal;
};
//
// Create a Radix Sort instance for a target.
//
// Keyval size is implicitly determined by the target.
//
// Returns NULL on failure.
//
typedef struct radix_sort_vk radix_sort_vk_t;
//
//
//
radix_sort_vk_t *
radix_sort_vk_create(VkDevice device,
VkAllocationCallbacks const * ac,
VkPipelineCache pc,
const uint32_t* const* spv,
const uint32_t* spv_sizes,
struct radix_sort_vk_target_config config);
//
// Destroy the Radix Sort instance using the same device and allocator used at
// creation.
//
void
radix_sort_vk_destroy(radix_sort_vk_t * rs, //
VkDevice d,
VkAllocationCallbacks const * ac);
//
// Returns the buffer size and alignment requirements for a maximum number of
// keyvals.
//
// The radix sort implementation is not an in-place sorting algorithm so two
// non-overlapping keyval buffers are required that are at least
// `.keyvals_size`.
//
// The radix sort instance also requires an `internal` buffer during sorting.
//
// If the indirect dispatch sorting function is used, then an `indirect` buffer
// is also required.
//
// The alignment requirements for the keyval, internal, and indirect buffers
// must be honored. All alignments are power of 2.
//
// Input:
// count : Maximum number of keyvals
//
// Outputs:
// keyval_size : Size of a single keyval
//
// keyvals_size : Minimum size of the even and odd keyval buffers
// keyvals_alignment : Alignment of each keyval buffer
//
// internal_size : Minimum size of internal buffer
// internal_aligment : Alignment of the internal buffer
//
// indirect_size : Minimum size of indirect buffer
// indirect_aligment : Alignment of the indirect buffer
//
// .keyvals_even/odd
// -----------------
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
//
// .internal
// ---------
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
// VK_BUFFER_USAGE_TRANSFER_DST_BIT ("direct" mode only)
//
// .indirect
// ---------
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
// VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT
//
typedef struct radix_sort_vk_memory_requirements
{
VkDeviceSize keyval_size;
VkDeviceSize keyvals_size;
VkDeviceSize keyvals_alignment;
VkDeviceSize internal_size;
VkDeviceSize internal_alignment;
VkDeviceSize indirect_size;
VkDeviceSize indirect_alignment;
} radix_sort_vk_memory_requirements_t;
void
radix_sort_vk_get_memory_requirements(radix_sort_vk_t const * rs,
uint32_t count,
radix_sort_vk_memory_requirements_t * mr);
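//
// Illustrative usage sketch (buffer creation elided):
//
//   radix_sort_vk_memory_requirements_t mr;
//
//   radix_sort_vk_get_memory_requirements(rs, count, &mr);
//
//   // Create two keyval buffers of at least mr.keyvals_size bytes
//   // aligned to mr.keyvals_alignment, plus an internal buffer of
//   // mr.internal_size bytes, using the usage flags listed above.
//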
//
// Direct dispatch sorting
// -----------------------
//
// Using a key size of `key_bits`, sort `count` keyvals found in the
// `.devaddr_keyvals_even` buffer.
//
// Each internal sorting pass copies the keyvals from one keyvals buffer to the
// other.
//
// The number of internal sorting passes is determined by `.key_bits`.
//
// If an even number of internal sorting passes is required, the sorted keyvals
// will be found in the "even" keyvals buffer. Otherwise, the sorted keyvals
// will be found in the "odd" keyvals buffer.
//
// Which buffer has the sorted keyvals is returned in `keyvals_sorted`.
//
// A keyval's `key_bits` are the most significant bits of a keyval.
//
// The maximum number of key bits is determined by the keyval size.
//
// The keyval count must be less than (1 << 30) as well as be less than or equal
// to the count used to obtain the memory requirements.
//
// The info struct's `ext` member must be NULL.
//
// This function appends push constants, dispatch commands, and barriers.
//
// Pipeline barriers should be applied as necessary, both before and after
// invoking this function.
//
// The sort begins with either a TRANSFER/WRITE or a COMPUTE/READ to the
// `internal` and `keyvals_even` buffers.
//
// The sort ends with a COMPUTE/WRITE to the `internal` and `keyvals_sorted`
// buffers.
//
//
// Direct dispatch sorting using VkDescriptorBufferInfo structures
// ---------------------------------------------------------------
//
typedef struct radix_sort_vk_sort_info
{
void * ext;
uint32_t key_bits;
uint32_t count;
VkDescriptorBufferInfo keyvals_even;
VkDescriptorBufferInfo keyvals_odd;
VkDescriptorBufferInfo internal;
} radix_sort_vk_sort_info_t;
void
radix_sort_vk_sort(radix_sort_vk_t const * rs,
radix_sort_vk_sort_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDescriptorBufferInfo * keyvals_sorted);
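//
// Illustrative usage sketch (assumes `even`, `odd` and `internal` are
// VkDescriptorBufferInfo structures sized per the memory requirements):
//
//   radix_sort_vk_sort_info_t const info = {
//     .ext          = NULL,
//     .key_bits     = 32,
//     .count        = count,
//     .keyvals_even = even,
//     .keyvals_odd  = odd,
//     .internal     = internal,
//   };
//
//   VkDescriptorBufferInfo keyvals_sorted;
//
//   radix_sort_vk_sort(rs, &info, device, cb, &keyvals_sorted);
//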
//
// Indirect dispatch sorting
// -------------------------
//
// Using a key size of `key_bits`, at pipeline execution time, load keyvals
// count from `devaddr_count` and sorts the keyvals in `.devaddr_keyvals_even`.
//
// Each internal sorting pass copies the keyvals from one keyvals buffer to the
// other.
//
// The number of internal sorting passes is determined by `.key_bits`.
//
// If an even number of internal sorting passes is required, the sorted keyvals
// will be found in the "even" keyvals buffer. Otherwise, the sorted keyvals
// will be found in the "odd" keyvals buffer.
//
// Which buffer has the sorted keyvals is returned in `keyvals_sorted`.
//
// A keyval's `key_bits` are the most significant bits of a keyval.
//
// The keyval count must be less than (1 << 30) as well as be less than or equal
// to the count used to obtain the memory requirements.
//
// The info struct's `ext` member must be NULL.
//
// This function appends push constants, dispatch commands, and barriers.
//
// Pipeline barriers should be applied as necessary, both before and after
// invoking this function.
//
// The indirect radix sort begins with a COMPUTE/READ from the `count` buffer
// and ends with a COMPUTE/WRITE to the `internal` and the `keyvals_sorted`
// buffers.
//
// The `indirect` buffer must support USAGE_INDIRECT.
//
// The `count` buffer must be at least 4 bytes and 4-byte aligned.
//
//
// Indirect dispatch sorting using VkDescriptorBufferInfo structures
// -----------------------------------------------------------------
//
typedef struct radix_sort_vk_sort_indirect_info
{
void * ext;
uint32_t key_bits;
VkDescriptorBufferInfo count;
VkDescriptorBufferInfo keyvals_even;
VkDescriptorBufferInfo keyvals_odd;
VkDescriptorBufferInfo internal;
VkDescriptorBufferInfo indirect;
} radix_sort_vk_sort_indirect_info_t;
void
radix_sort_vk_sort_indirect(radix_sort_vk_t const * rs,
radix_sort_vk_sort_indirect_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDescriptorBufferInfo * keyvals_sorted);
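//
// Illustrative usage sketch: identical to the direct path except that a
// 4-byte `count` buffer and an `indirect` buffer are supplied, with the
// keyval count read at pipeline execution time:
//
//   radix_sort_vk_sort_indirect_info_t const info = {
//     .ext = NULL, .key_bits = 64, .count = count_info,
//     .keyvals_even = even, .keyvals_odd = odd,
//     .internal = internal, .indirect = indirect,
//   };
//
//   radix_sort_vk_sort_indirect(rs, &info, device, cb, &keyvals_sorted);
//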
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_

src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h

@@ -0,0 +1,104 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_
//
//
//
#include "radix_sort_vk.h"
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
// Structure that enables integration with Vulkan drivers.
//
typedef struct radix_sort_vk_buffer_info
{
VkBuffer buffer;
VkDeviceSize offset;
VkDeviceAddress devaddr;
} radix_sort_vk_buffer_info_t;
//
// Function prototypes
//
typedef void (*radix_sort_vk_fill_buffer_pfn)(VkCommandBuffer cb,
radix_sort_vk_buffer_info_t const * buffer_info,
VkDeviceSize offset,
VkDeviceSize size,
uint32_t data);
typedef void (*radix_sort_vk_dispatch_indirect_pfn)(VkCommandBuffer cb,
radix_sort_vk_buffer_info_t const * buffer_info,
VkDeviceSize offset);
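//
// These callbacks let a driver route the sort's buffer fills and
// indirect dispatches through its own command-recording paths instead
// of the public vkCmdFillBuffer()/vkCmdDispatchIndirect() entrypoints.
//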
//
// Direct dispatch sorting using buffer device addresses
// -----------------------------------------------------
//
typedef struct radix_sort_vk_sort_devaddr_info
{
void * ext;
uint32_t key_bits;
uint32_t count;
radix_sort_vk_buffer_info_t keyvals_even;
VkDeviceAddress keyvals_odd;
radix_sort_vk_buffer_info_t internal;
radix_sort_vk_fill_buffer_pfn fill_buffer;
} radix_sort_vk_sort_devaddr_info_t;
void
radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
radix_sort_vk_sort_devaddr_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDeviceAddress * keyvals_sorted);
//
// Indirect dispatch sorting using buffer device addresses
// -------------------------------------------------------
//
// clang-format off
//
typedef struct radix_sort_vk_sort_indirect_devaddr_info
{
void * ext;
uint32_t key_bits;
VkDeviceAddress count;
VkDeviceAddress keyvals_even;
VkDeviceAddress keyvals_odd;
VkDeviceAddress internal;
radix_sort_vk_buffer_info_t indirect;
radix_sort_vk_dispatch_indirect_pfn dispatch_indirect;
} radix_sort_vk_sort_indirect_devaddr_info_t;
void
radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * rs,
radix_sort_vk_sort_indirect_devaddr_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDeviceAddress * keyvals_sorted);
//
// clang-format on
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_

src/amd/vulkan/radix_sort/radix_sort_vk_ext.h

@@ -0,0 +1,77 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_
//
//
//
#include <vulkan/vulkan_core.h>
//
//
//
#include <stdbool.h>
#include <stdint.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
// Radix sort extensions
// ---------------------
//
#ifndef RADIX_SORT_VK_DISABLE_EXTENSIONS
//
// Extension types
//
enum radix_sort_vk_ext_type
{
RADIX_SORT_VK_EXT_TIMESTAMPS
};
//
// Timestamp each logical step of the algorithm
//
// The number of timestamps depends on the dispatch type:
//
// * direct dispatch: 4 + subpass count
// * indirect dispatch: 5 + subpass count
//
// Indirect / 32-bit keyvals: 5 + 4 subpasses = 9
// Indirect / 64-bit keyvals: 5 + 8 subpasses = 13
//
struct radix_sort_vk_ext_timestamps
{
void * ext;
enum radix_sort_vk_ext_type type;
uint32_t timestamp_count;
VkQueryPool timestamps;
uint32_t timestamps_set;
};
#endif
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_

src/amd/vulkan/radv_radix_sort.c

@@ -0,0 +1,193 @@
/*
* Copyright © 2022 Konstantin Seurer
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "radv_radix_sort.h"
#include "targets/u64/config.h"
#include "radv_private.h"
#include "target.h"
static const uint32_t init_spv[] = {
#include "radix_sort/shaders/init.comp.spv.h"
};
static const uint32_t fill_spv[] = {
#include "radix_sort/shaders/fill.comp.spv.h"
};
static const uint32_t histogram_spv[] = {
#include "radix_sort/shaders/histogram.comp.spv.h"
};
static const uint32_t prefix_spv[] = {
#include "radix_sort/shaders/prefix.comp.spv.h"
};
static const uint32_t scatter_0_even_spv[] = {
#include "radix_sort/shaders/scatter_0_even.comp.spv.h"
};
static const uint32_t scatter_0_odd_spv[] = {
#include "radix_sort/shaders/scatter_0_odd.comp.spv.h"
};
static const uint32_t scatter_1_even_spv[] = {
#include "radix_sort/shaders/scatter_1_even.comp.spv.h"
};
static const uint32_t scatter_1_odd_spv[] = {
#include "radix_sort/shaders/scatter_1_odd.comp.spv.h"
};
static const struct radix_sort_vk_target_config target_config = {
.keyval_dwords = RS_KEYVAL_DWORDS,
.histogram =
{
.workgroup_size_log2 = RS_HISTOGRAM_WORKGROUP_SIZE_LOG2,
.subgroup_size_log2 = RS_HISTOGRAM_SUBGROUP_SIZE_LOG2,
.block_rows = RS_HISTOGRAM_BLOCK_ROWS,
},
.prefix =
{
.workgroup_size_log2 = RS_PREFIX_WORKGROUP_SIZE_LOG2,
.subgroup_size_log2 = RS_PREFIX_SUBGROUP_SIZE_LOG2,
},
.scatter =
{
.workgroup_size_log2 = RS_SCATTER_WORKGROUP_SIZE_LOG2,
.subgroup_size_log2 = RS_SCATTER_SUBGROUP_SIZE_LOG2,
.block_rows = RS_SCATTER_BLOCK_ROWS,
},
};
radix_sort_vk_t *
radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc)
{
const uint32_t *spv[8] = {
init_spv, fill_spv, histogram_spv, prefix_spv,
scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv,
};
const uint32_t spv_sizes[8] = {
sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv),
sizeof(prefix_spv), sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv),
sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv),
};
return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, target_config);
}
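/* The radix sort library records commands through the global Vulkan
 * entrypoints (vkCreateShaderModule, vkCmdDispatch, ...). Defining
 * those symbols here forwards the calls into RADV's own dispatch table
 * so the library can run inside the driver.
 */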
VKAPI_ATTR VkResult VKAPI_CALL
vkCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.CreateShaderModule(device, pCreateInfo, pAllocator,
pShaderModule);
}
VKAPI_ATTR void VKAPI_CALL
vkDestroyShaderModule(VkDevice device, VkShaderModule shaderModule,
const VkAllocationCallbacks *pAllocator)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
pdevice->vk.dispatch_table.DestroyShaderModule(device, shaderModule, pAllocator);
}
VKAPI_ATTR VkResult VKAPI_CALL
vkCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.CreatePipelineLayout(device, pCreateInfo, pAllocator,
pPipelineLayout);
}
VKAPI_ATTR void VKAPI_CALL
vkDestroyPipelineLayout(VkDevice device, VkPipelineLayout pipelineLayout,
const VkAllocationCallbacks *pAllocator)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
pdevice->vk.dispatch_table.DestroyPipelineLayout(device, pipelineLayout, pAllocator);
}
VKAPI_ATTR VkResult VKAPI_CALL
vkCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount,
const VkComputePipelineCreateInfo *pCreateInfos,
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.CreateComputePipelines(device, pipelineCache, createInfoCount,
pCreateInfos, pAllocator, pPipelines);
}
VKAPI_ATTR void VKAPI_CALL
vkDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.DestroyPipeline(device, pipeline, pAllocator);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags,
uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
uint32_t bufferMemoryBarrierCount,
const VkBufferMemoryBarrier *pBufferMemoryBarriers,
uint32_t imageMemoryBarrierCount,
const VkImageMemoryBarrier *pImageMemoryBarriers)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdPipelineBarrier(
commandBuffer, srcStageMask, dstStageMask, dependencyFlags, memoryBarrierCount,
pMemoryBarriers, bufferMemoryBarrierCount, pBufferMemoryBarriers, imageMemoryBarrierCount,
pImageMemoryBarriers);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
const void *pValues)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdPushConstants(commandBuffer, layout, stageFlags, offset,
size, pValues);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
VkPipeline pipeline)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdBindPipeline(commandBuffer, pipelineBindPoint,
pipeline);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY,
uint32_t groupCountZ)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdDispatch(commandBuffer, groupCountX, groupCountY,
groupCountZ);
}

src/amd/vulkan/radv_radix_sort.h

@@ -0,0 +1,32 @@
/*
* Copyright © 2022 Konstantin Seurer
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef RADV_RADIX_SORT_H
#define RADV_RADIX_SORT_H
#include "radix_sort_vk_devaddr.h"
radix_sort_vk_t *radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac,
VkPipelineCache pc);
#endif

src/amd/vulkan/radix_sort/shaders/bufref.h

@@ -0,0 +1,151 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_
//
// GLSL
//
#ifdef VULKAN // defined by GLSL/VK compiler
#extension GL_EXT_shader_explicit_arithmetic_types : require
//
// If the target does not support VkPhysicalDeviceFeatures.shaderInt64
// then:
//
// #define RS_DISABLE_SHADER_INT64
//
// clang-format off
#ifdef RS_DISABLE_SHADER_INT64
#extension GL_EXT_buffer_reference_uvec2 : require
#else
#extension GL_EXT_buffer_reference2 : require
#endif
// clang-format on
//
// Restrict shouldn't have any noticeable impact on these kernels and
// benchmarks appear to prove that true but it's correct to include
// the qualifier.
//
#define RS_RESTRICT restrict
//
// If the device doesn't support .shaderInt64 then the buffer reference address
// is a uvec2.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_DEVADDR u32vec2
#else
#define RS_DEVADDR uint64_t
#endif
//
// Define a buffer reference.
//
#define RS_BUFREF_DEFINE(_layout, _name, _devaddr) RS_RESTRICT _layout _name = _layout(_devaddr)
//
// Define a buffer reference at a UINT32 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_DEFINE_AT_OFFSET_UINT32(_layout, _name, _devaddr_u32vec2, _offset) \
RS_RESTRICT _layout _name; \
{ \
u32vec2 devaddr; \
uint32_t carry; \
\
devaddr.x = uaddCarry(_devaddr_u32vec2.x, _offset, carry); \
devaddr.y = _devaddr_u32vec2.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_DEFINE_AT_OFFSET_UINT32(_layout, _name, _devaddr, _offset) \
RS_RESTRICT _layout _name = _layout(_devaddr + _offset)
#endif
//
// Define a buffer reference at a packed UINT64 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(_layout, _name, _devaddr_u32vec2, _offset_u32vec2) \
RS_RESTRICT _layout _name; \
{ \
u32vec2 devaddr; \
uint32_t carry; \
\
devaddr.x = uaddCarry(_devaddr_u32vec2.x, _offset_u32vec2.x, carry); \
devaddr.y = _devaddr_u32vec2.y + _offset_u32vec2.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(_layout, _name, _devaddr, _offset_u32vec2) \
RS_RESTRICT _layout _name = _layout(_devaddr + pack64(_offset_u32vec2))
#endif
//
// Increment the buffer reference by a UINT32 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_INC_UINT32(_layout, _name, _inc) \
{ \
u32vec2 devaddr = u32vec2(_name); \
uint32_t carry; \
\
devaddr.x = uaddCarry(devaddr.x, _inc, carry); \
devaddr.y = devaddr.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_INC_UINT32(_layout, _name, _inc) _name = _layout(uint64_t(_name) + _inc)
#endif
//
// Increment the buffer reference by a packed UINT64 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_INC_U32VEC2(_layout, _name, _inc_u32vec2) \
{ \
u32vec2 devaddr = u32vec2(_name); \
uint32_t carry; \
\
devaddr.x = uaddCarry(devaddr.x, _inc_u32vec2.x, carry); \
devaddr.y = devaddr.y + _inc_u32vec2.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_INC_U32VEC2(_layout, _name, _inc_u32vec2) \
_name = _layout(uint64_t(_name) + pack64(_inc_u32vec2))
#endif
//
// Increment the buffer reference by the product of two UINT32 factors.
//
#define RS_BUFREF_INC_UINT32_UINT32(_layout, _name, _inc_a, _inc_b) \
{ \
u32vec2 inc; \
\
umulExtended(_inc_a, _inc_b, inc.y, inc.x); \
\
RS_BUFREF_INC_U32VEC2(_layout, _name, inc); \
}
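//
// Illustrative: advancing a bufref by keyval_idx * keyval_size uses
// umulExtended() to form the full 64-bit product as (inc.y:inc.x), and
// RS_BUFREF_INC_U32VEC2() then adds it with explicit carry propagation
// when 64-bit shader integers are unavailable.
//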
//
//
//
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_

src/amd/vulkan/radix_sort/shaders/fill.comp

@@ -0,0 +1,143 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Each workgroup fills up to RS_BLOCK_KEYVALS
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
// clang-format on
//
// Load arch/keyval configuration
//
#include "config.h"
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
// Subgroup uniform support
//
#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Declare the push constants
//
RS_STRUCT_PUSH_FILL();
layout(push_constant) uniform block_push
{
rs_push_fill push;
};
//
// The "init" shader configures the fill info structure.
//
RS_STRUCT_INDIRECT_INFO_FILL();
//
// Check all switches are defined
//
#ifndef RS_FILL_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_FILL_WORKGROUP_SIZE_LOG2"
#endif
//
#ifndef RS_FILL_BLOCK_ROWS
#error "Undefined: RS_FILL_BLOCK_ROWS"
#endif
//
// Local macros
//
// clang-format off
#define RS_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
#define RS_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_WORKGROUP_SIZE)
#define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1)
// clang-format on
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
//
//
//
layout(buffer_reference, std430) buffer buffer_rs_indirect_info_fill
{
rs_indirect_info_fill info;
};
layout(buffer_reference, std430) buffer buffer_rs_dwords
{
uint32_t extent[];
};
//
//
//
void
main()
{
//
// Define indirect info bufref for the fill
//
readonly RS_BUFREF_DEFINE(buffer_rs_indirect_info_fill, rs_info, push.devaddr_info);
RS_SUBGROUP_UNIFORM const rs_indirect_info_fill info = rs_info.info;
//
// Define dwords bufref
//
// Assumes fewer than 2^32-1 dwords; the dword index is then
// extended-multiplied by the dword size.
//
// Assumes push.devaddr_dwords is suitably aligned to RS_BLOCK_DWORDS
// -- alignment at a subgroup or transaction size is fine.
//
const uint32_t dwords_idx =
(info.block_offset + gl_WorkGroupID.x) * RS_BLOCK_DWORDS + gl_LocalInvocationID.x;
u32vec2 dwords_offset;
umulExtended(dwords_idx, 4, dwords_offset.y, dwords_offset.x);
writeonly RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_dwords,
rs_dwords,
push.devaddr_dwords,
dwords_offset);
//
// Fills are always aligned to RS_BLOCK_KEYVALS
//
// ((v >= min) && (v < max)) == ((v - min) < (max - min))
//
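// (Unsigned arithmetic makes the single comparison exact: if v < min,
//  then (v - min) wraps to a value >= (max - min) and fails the test.)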
const uint32_t row_idx = dwords_idx - info.dword_offset_min;
[[unroll]] for (uint32_t ii = 0; ii < RS_FILL_BLOCK_ROWS; ii++)
{
if (row_idx + (ii * RS_WORKGROUP_SIZE) < info.dword_offset_max_minus_min)
{
rs_dwords.extent[ii * RS_WORKGROUP_SIZE] = push.dword;
}
}
}
//
//
//

src/amd/vulkan/radix_sort/shaders/histogram.comp

@@ -0,0 +1,449 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Produce multiple radix size histograms from the keyvals.
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
#extension GL_KHR_shader_subgroup_basic : require
// clang-format on
//
//
//
#include "config.h"
//
// Optional switches:
//
// #define RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
// #define RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
//
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
// Push constants for histogram shader
//
RS_STRUCT_PUSH_HISTOGRAM();
layout(push_constant) uniform block_push
{
rs_push_histogram push;
};
//
// Subgroup uniform support
//
#if defined(RS_HISTOGRAM_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Check all switches are defined
//
// What's the size of the keyval?
#ifndef RS_KEYVAL_DWORDS
#error "Undefined: RS_KEYVAL_DWORDS"
#endif
//
#ifndef RS_HISTOGRAM_BLOCK_ROWS
#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS"
#endif
//
#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2"
#endif
//
#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2"
#endif
//
// Local macros
//
// clang-format off
#define RS_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_SUBGROUP_SIZE (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
#define RS_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE)
#define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4)
#define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1)
// clang-format on
//
// Keyval type
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KEYVAL_TYPE uint32_t
#elif (RS_KEYVAL_DWORDS == 2)
#define RS_KEYVAL_TYPE u32vec2
#else
#error "Unsupported RS_KEYVAL_DWORDS"
#endif
//
// Histogram offset depends on number of workgroups.
//
#define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * pass_)
#if (RS_WORKGROUP_SUBGROUPS == 1)
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4)
#else
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4)
#endif
//
// Assumes (RS_RADIX_LOG2 == 8)
//
// Error if this ever changes
//
#if (RS_RADIX_LOG2 != 8)
#error "(RS_RADIX_LOG2 != 8)"
#endif
//
// Is bitfield extract faster?
//
#ifdef RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
//----------------------------------------------------------------------
//
// Extract a keyval digit
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) bitfieldExtract(kv_, pass_ * RS_RADIX_LOG2, RS_RADIX_LOG2)
#else
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) \
bitfieldExtract(kv_[pass_ / 4], (pass_ & 3) * RS_RADIX_LOG2, RS_RADIX_LOG2)
#endif
//----------------------------------------------------------------------
#else
//----------------------------------------------------------------------
//
// Extract a keyval digit
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) ((kv_ >> (pass_ * RS_RADIX_LOG2)) & RS_RADIX_MASK)
#else
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) \
((kv_[pass_ / 4] >> ((pass_ & 3) * RS_RADIX_LOG2)) & RS_RADIX_MASK)
#endif
//----------------------------------------------------------------------
#endif
//
//
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
struct rs_histogram_smem
{
uint32_t histogram[RS_RADIX_SIZE];
};
shared rs_histogram_smem smem;
#endif
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
//
//
//
layout(buffer_reference, std430) buffer buffer_rs_kv
{
RS_KEYVAL_TYPE extent[];
};
layout(buffer_reference, std430) buffer buffer_rs_histograms
{
uint32_t extent[];
};
//
// Shared memory functions
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
//
// NOTE: Must use same access pattern as rs_histogram_global_store()
//
void
rs_histogram_zero()
{
//
// Zero SMEM histogram
//
#if (RS_WORKGROUP_SUBGROUPS == 1)
const uint32_t smem_offset = gl_SubgroupInvocationID;
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
{
smem.histogram[smem_offset + ii] = 0;
}
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
const uint32_t smem_offset = gl_LocalInvocationID.x;
[[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
{
smem.histogram[smem_offset + ii] = 0;
}
const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);
if (smem_idx < RS_RADIX_SIZE)
{
smem.histogram[smem_idx] = 0;
}
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
{
smem.histogram[gl_LocalInvocationID.x] = 0;
}
#endif
}
//
// NOTE: Must use same access pattern as rs_histogram_zero()
//
void
rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms)
{
//
// Store to GMEM
//
#if (RS_WORKGROUP_SUBGROUPS == 1)
const uint32_t smem_offset = gl_SubgroupInvocationID;
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
{
const uint32_t count = smem.histogram[smem_offset + ii];
atomicAdd(rs_histograms.extent[ii], count);
}
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
const uint32_t smem_offset = gl_LocalInvocationID.x;
[[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
{
const uint32_t count = smem.histogram[smem_offset + ii];
atomicAdd(rs_histograms.extent[ii], count);
}
const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);
if (smem_idx < RS_RADIX_SIZE)
{
const uint32_t count = smem.histogram[smem_idx];
atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)],
count);
}
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
{
const uint32_t count = smem.histogram[gl_LocalInvocationID.x];
atomicAdd(rs_histograms.extent[0], count);
}
#endif
}
#endif
//
//
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
void
rs_histogram_atomic_after_write()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
subgroupMemoryBarrierShared();
#else
barrier();
#endif
}
void
rs_histogram_read_after_atomic()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
subgroupMemoryBarrierShared();
#else
barrier();
#endif
}
#endif
//
//
//
void
main()
{
//
// Which subgroups have work?
//
RS_KEYVAL_TYPE kv[RS_HISTOGRAM_BLOCK_ROWS];
//
// Define kv_in bufref
//
// Assumes fewer than 2^30-1 keys; the keyval index is then
// extended-multiplied by the keyval size.
//
u32vec2 kv_in_offset;
umulExtended(gl_WorkGroupID.x * RS_BLOCK_KEYVALS + gl_LocalInvocationID.x,
RS_KEYVAL_SIZE,
kv_in_offset.y, // msb
kv_in_offset.x); // lsb
readonly RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_kv,
rs_kv_in,
push.devaddr_keyvals,
kv_in_offset);
//
// Load keyvals
//
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_HISTOGRAM_BLOCK_ROWS; ii++)
{
kv[ii] = rs_kv_in.extent[ii * RS_WORKGROUP_SIZE];
}
////////////////////////////////////////////////////////////////////////////
//
// Accumulate and store histograms for passes
//
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// MACRO EXPANSION VARIANT
//
// NOTE: THIS ALSO SERVES AS A MALI R24+ WORKAROUND: EXPLICITLY
// EXPAND THE FOR/LOOP PASSES
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
#define RS_HISTOGRAM_PASS(pass_) \
rs_histogram_zero(); \
\
rs_histogram_atomic_after_write(); \
\
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++) \
{ \
const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_); \
\
atomicAdd(smem.histogram[digit], 1); \
} \
\
rs_histogram_read_after_atomic(); \
\
{ \
const uint32_t rs_histogram_offset = RS_HISTOGRAM_OFFSET(pass_); \
\
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, \
rs_histograms, \
push.devaddr_histograms, \
rs_histogram_offset); \
\
rs_histogram_global_store(rs_histograms); \
} \
\
if (push.passes == (RS_KEYVAL_SIZE - pass_)) \
return;
#else // NO SHARED MEMORY
#define RS_HISTOGRAM_PASS(pass_) \
{ \
const uint32_t rs_histogram_base = RS_HISTOGRAM_BASE(pass_); \
\
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, \
rs_histograms, \
push.devaddr_histograms, \
rs_histogram_base); \
\
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++) \
{ \
const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_); \
\
atomicAdd(rs_histograms.extent[digit], 1); \
} \
} \
\
if (push.passes == (RS_KEYVAL_SIZE - pass_)) \
return;
#endif
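//
// Passes are expanded from the most significant digit downward, so the
// early-out above returns once histograms for the `push.passes`
// most-significant digits have been stored.
//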
#if (RS_KEYVAL_DWORDS == 1)
RS_HISTOGRAM_PASS(3)
RS_HISTOGRAM_PASS(2)
RS_HISTOGRAM_PASS(1)
RS_HISTOGRAM_PASS(0)
#elif (RS_KEYVAL_DWORDS == 2)
RS_HISTOGRAM_PASS(7)
RS_HISTOGRAM_PASS(6)
RS_HISTOGRAM_PASS(5)
RS_HISTOGRAM_PASS(4)
RS_HISTOGRAM_PASS(3)
RS_HISTOGRAM_PASS(2)
RS_HISTOGRAM_PASS(1)
RS_HISTOGRAM_PASS(0)
#else
#error "Error: (RS_KEYVAL_DWORDS >= 3) not implemented."
#endif
}
//
//
//

src/amd/vulkan/radix_sort/shaders/init.comp

@@ -0,0 +1,168 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Initialize the `rs_indirect_info` struct
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
// clang-format on
//
// Load arch/keyval configuration
//
#include "config.h"
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
// Subgroup uniform support
//
#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Declare the push constants
//
RS_STRUCT_PUSH_INIT();
layout(push_constant) uniform block_push
{
rs_push_init push;
};
//
// The "init" shader configures the fill info structure.
//
RS_STRUCT_INDIRECT_INFO();
//
// Local macros
//
// clang-format off
#define RS_FILL_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
#define RS_SCATTER_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2)
#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_FILL_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE)
#define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE)
#define RS_HISTOGRAM_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_HISTOGRAM_WORKGROUP_SIZE)
// clang-format on
//
//
//
layout(local_size_x = 1) in;
//
//
//
layout(buffer_reference, std430) buffer buffer_rs_count
{
uint32_t count;
};
layout(buffer_reference, std430) buffer buffer_rs_indirect_info
{
rs_indirect_info info;
};
//
// Helper macros
//
// RU = Round Up
// RD = Round Down
//
#define RS_COUNT_RU_BLOCKS(count_, block_size_) (((count_) + (block_size_) - 1) / (block_size_))
#define RS_COUNT_RD_BLOCKS(count_, block_size_) ((count_) / (block_size_))
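//
// e.g. RS_COUNT_RU_BLOCKS(100, 32) == 4 while RS_COUNT_RD_BLOCKS(100, 32) == 3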
//
//
//
void
main()
{
//
// Load the keyval count
//
readonly RS_BUFREF_DEFINE(buffer_rs_count, rs_count, push.devaddr_count);
RS_SUBGROUP_UNIFORM const uint32_t count = rs_count.count;
//
// Define the init struct bufref
//
writeonly RS_BUFREF_DEFINE(buffer_rs_indirect_info, rs_indirect_info, push.devaddr_info);
//
// Size and set scatter dispatch
//
const uint32_t scatter_ru_blocks = RS_COUNT_RU_BLOCKS(count, RS_SCATTER_BLOCK_KEYVALS);
const uint32_t count_ru_scatter = scatter_ru_blocks * RS_SCATTER_BLOCK_KEYVALS;
rs_indirect_info.info.dispatch.scatter = u32vec4(scatter_ru_blocks, 1, 1, 0);
//
// Size and set histogram dispatch
//
const uint32_t histo_ru_blocks = RS_COUNT_RU_BLOCKS(count_ru_scatter, RS_HISTOGRAM_BLOCK_KEYVALS);
const uint32_t count_ru_histo = histo_ru_blocks * RS_HISTOGRAM_BLOCK_KEYVALS;
rs_indirect_info.info.dispatch.histogram = u32vec4(histo_ru_blocks, 1, 1, 0);
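//
// Worked example (assuming the u64 target config in this series,
// where both block sizes are 14 * 256 == 3584 keyvals): for
// count == 100000, scatter_ru_blocks == 28 and count_ru_scatter ==
// 100352, and in turn histo_ru_blocks == 28 and count_ru_histo ==
// 100352.
//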
//
// Size and set pad fill and dispatch
//
const uint32_t count_dwords = count * RS_KEYVAL_DWORDS;
const uint32_t pad_rd_blocks = RS_COUNT_RD_BLOCKS(count_dwords, RS_FILL_BLOCK_DWORDS);
const uint32_t count_rd_pad = pad_rd_blocks * RS_FILL_BLOCK_DWORDS;
const uint32_t count_ru_histo_dwords = count_ru_histo * RS_KEYVAL_DWORDS;
const uint32_t pad_dwords = count_ru_histo_dwords - count_rd_pad;
const uint32_t pad_ru_blocks = RS_COUNT_RU_BLOCKS(pad_dwords, RS_FILL_BLOCK_DWORDS);
rs_indirect_info_fill pad;
pad.block_offset = pad_rd_blocks;
pad.dword_offset_min = count_dwords;
pad.dword_offset_max_minus_min = count_ru_histo_dwords - count_dwords;
rs_indirect_info.info.pad = pad;
rs_indirect_info.info.dispatch.pad = u32vec4(pad_ru_blocks, 1, 1, 0);
//
// Size and set zero fill and dispatch
//
// NOTE(allanmac): We could zero the histogram passes on the host
// since the number of passes is known ahead of time but since the
// 256-dword partitions directly follow the 256-dword histograms, we
// can dispatch just one FILL.
//
rs_indirect_info_fill zero;
zero.block_offset = 0;
zero.dword_offset_min = 0;
zero.dword_offset_max_minus_min = (push.passes + scatter_ru_blocks - 1) * RS_RADIX_SIZE;
const uint32_t zero_ru_blocks =
RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min, RS_FILL_BLOCK_DWORDS);
rs_indirect_info.info.zero = zero;
rs_indirect_info.info.dispatch.zero = u32vec4(zero_ru_blocks, 1, 1, 0);
}
//
//
//

View File

@ -0,0 +1,51 @@
# Copyright © 2022 Konstantin Seurer
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
radix_sort_shaders = [
'init.comp',
'fill.comp',
'histogram.comp',
'prefix.comp',
'scatter_0_even.comp',
'scatter_0_odd.comp',
'scatter_1_even.comp',
'scatter_1_odd.comp'
]
shader_include_dir = meson.source_root() + '/src/amd/vulkan/radix_sort/targets/u64'
shader_include_files = files(
'bufref.h',
'prefix_limits.h',
'prefix.h',
'push.h',
'scatter.glsl',
meson.source_root() + '/src/amd/vulkan/radix_sort/targets/u64/config.h'
)
radix_sort_spv = []
foreach s : radix_sort_shaders
radix_sort_spv += custom_target(
s + '.spv.h',
input : s,
output : s + '.spv.h',
command : [prog_glslang, '-V', '-I' + shader_include_dir, '--target-env', 'spirv1.3', '-x', '-o', '@OUTPUT@', '@INPUT@'],
depend_files: shader_include_files)
endforeach

View File

@ -0,0 +1,194 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Prefix sum the coarse histograms.
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require
// clang-format on
//
//
//
#include "config.h"
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
//
//
RS_STRUCT_PUSH_PREFIX();
layout(push_constant) uniform block_push
{
rs_push_prefix push;
};
//
// Subgroup uniform support
//
#if defined(RS_HISTOGRAM_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Check all switches are defined
//
//
#ifndef RS_PREFIX_SUBGROUP_SIZE_LOG2
#error "Undefined: RS_PREFIX_SUBGROUP_SIZE_LOG2"
#endif
//
#ifndef RS_PREFIX_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_PREFIX_WORKGROUP_SIZE_LOG2"
#endif
//
// Local macros
//
// clang-format off
#define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4)
#define RS_WORKGROUP_SIZE (1 << RS_PREFIX_WORKGROUP_SIZE_LOG2)
#define RS_SUBGROUP_SIZE (1 << RS_PREFIX_SUBGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
// clang-format on
//
// There is no purpose in having a workgroup size larger than the
// radix size.
//
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
#error "Error: (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)"
#endif
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
//
// Histogram buffer reference
//
layout(buffer_reference, std430) buffer buffer_rs_histograms
{
uint32_t extent[];
};
//
// Load prefix limits before loading function
//
#include "prefix_limits.h"
//
// If multi-subgroup then define shared memory
//
#if (RS_WORKGROUP_SUBGROUPS > 1)
//----------------------------------------
shared uint32_t smem_sweep0[RS_SWEEP_0_SIZE];
#define RS_PREFIX_SWEEP0(idx_) smem_sweep0[idx_]
//----------------------------------------
#if (RS_SWEEP_1_SIZE > 0)
//----------------------------------------
shared uint32_t smem_sweep1[RS_SWEEP_1_SIZE];
#define RS_PREFIX_SWEEP1(idx_) smem_sweep1[idx_]
//----------------------------------------
#endif
#if (RS_SWEEP_2_SIZE > 0)
//----------------------------------------
shared uint32_t smem_sweep2[RS_SWEEP_2_SIZE];
#define RS_PREFIX_SWEEP2(idx_) smem_sweep2[idx_]
//----------------------------------------
#endif
#endif
//
// Define function arguments
//
#define RS_PREFIX_ARGS buffer_rs_histograms rs_histograms
//
// Define load/store functions
//
// clang-format off
#define RS_PREFIX_LOAD(idx_) rs_histograms.extent[idx_]
#define RS_PREFIX_STORE(idx_) rs_histograms.extent[idx_]
// clang-format on
//
// Load prefix function
//
#include "prefix.h"
//
// Exclusive prefix of uint32_t[256]
//
void
main()
{
//
// Define buffer reference to read histograms
//
#if (RS_WORKGROUP_SUBGROUPS == 1)
//
// Define histograms bufref for single subgroup
//
// NOTE(allanmac): The histogram buffer reference could be adjusted
// on the host to save a couple instructions at the cost of added
// complexity.
//
RS_SUBGROUP_UNIFORM
const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
const uint32_t histograms_offset = (histograms_base + gl_SubgroupInvocationID) * 4;
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
rs_histograms,
push.devaddr_histograms,
histograms_offset);
#else
//
// Define histograms bufref for workgroup
//
RS_SUBGROUP_UNIFORM
const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
const uint32_t histograms_offset = (histograms_base + gl_LocalInvocationID.x) * 4;
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
rs_histograms,
push.devaddr_histograms,
histograms_offset);
#endif
//
// Compute exclusive prefix of uint32_t[256]
//
rs_prefix(rs_histograms);
}
//
//
//

View File

@ -0,0 +1,353 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
//
// Requires several defines
//
#ifndef RS_PREFIX_LIMITS
#error "Error: \"prefix_limits.h\" not loaded"
#endif
#ifndef RS_PREFIX_ARGS
#error "Error: RS_PREFIX_ARGS undefined"
#endif
#ifndef RS_PREFIX_LOAD
#error "Error: RS_PREFIX_LOAD undefined"
#endif
#ifndef RS_PREFIX_STORE
#error "Error: RS_PREFIX_STORE undefined"
#endif
#ifndef RS_SUBGROUP_SIZE
#error "Error: RS_SUBGROUP_SIZE undefined"
#endif
#ifndef RS_WORKGROUP_SIZE
#error "Error: RS_WORKGROUP_SIZE undefined"
#endif
#ifndef RS_WORKGROUP_SUBGROUPS
#error "Error: RS_WORKGROUP_SUBGROUPS undefined"
#endif
//
// Optional switches:
//
// * Disable holding original inclusively scanned histogram values in registers.
//
// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
//
//
// Compute exclusive prefix of uint32_t[256]
//
void
rs_prefix(RS_PREFIX_ARGS)
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
//
// Workgroup is a single subgroup so no shared memory is required.
//
//
// Exclusive scan-add the histogram
//
const uint32_t h0 = RS_PREFIX_LOAD(0);
const uint32_t h0_inc = subgroupInclusiveAdd(h0);
RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive
//
// Each iteration is dependent on the previous so no unrolling. The
// compiler is free to hoist the loads upward though.
//
for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; //
ii < RS_RADIX_SIZE;
ii += RS_SUBGROUP_SIZE)
{
const uint32_t h = RS_PREFIX_LOAD(ii);
const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last;
h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);
RS_PREFIX_STORE(ii) = h_inc - h; // exclusive
}
#else
//
// Workgroup is multiple subgroups and uses shared memory to store
// the scan's intermediate results.
//
// Assumes a power-of-two subgroup, workgroup and radix size.
//
// Downsweep: Repeatedly scan reductions until they fit in a single
// subgroup.
//
// Upsweep: Then uniformly apply reductions to each subgroup.
//
//
//  Subgroup Size |  4 |  8 | 16 | 32 | 64 |
//  --------------+----+----+----+----+----+
//  Sweep 0       | 64 | 32 | 16 |  8 |  4 |  sweep_0[]
//  Sweep 1       | 16 |  4 |  - |  - |  - |  sweep_1[]
//  Sweep 2       |  4 |  - |  - |  - |  - |  sweep_2[]
//  --------------+----+----+----+----+----+
//  Total dwords  | 84 | 36 | 16 |  8 |  4 |
//  --------------+----+----+----+----+----+
//
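// e.g. with RS_RADIX_SIZE == 256 and RS_SUBGROUP_SIZE == 32, sweep 0
// holds 256/32 == 8 reductions, which already fit in a single
// subgroup, so sweeps 1 and 2 are unused -- matching the "32" column
// above.
//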
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
uint32_t h_exc[RS_H_COMPONENTS];
#endif
//
// Downsweep 0
//
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
const uint32_t h_inc = subgroupInclusiveAdd(h);
const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);
//
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
h_exc[ii] = h_inc - h;
#else
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h;
#endif
}
barrier();
//
// Skip generalizing these sweeps for all possible subgroups -- just
// write them directly.
//
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))
//////////////////////////////////////////////////////////////////////
//
// Scan 0
//
#if (RS_SWEEP_0_SIZE != RS_SUBGROUP_SIZE)
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // subgroup has inactive invocations
#endif
{
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
}
#elif (RS_SUBGROUP_SIZE == 8)
#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
//////////////////////////////////////////////////////////////////////
//
// Scan 0 and Downsweep 1
//
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations
{
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#else
//////////////////////////////////////////////////////////////////////
//
// Scan 0 and Downsweep 1
//
[[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#endif
barrier();
//
// Scan 1
//
if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations
{
const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
}
#elif (RS_SUBGROUP_SIZE == 4)
//////////////////////////////////////////////////////////////////////
//
// Scan 0 and Downsweep 1
//
#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations
{
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#else
[[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#endif
barrier();
//
// Scan 1 and Downsweep 2
//
#if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE)
if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations
{
const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
}
#else
[[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations
{
const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1);
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red;
RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
}
#endif
barrier();
//
// Scan 2
//
// 4 invocations
//
if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE)
{
const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x);
const uint32_t h2_inc = subgroupInclusiveAdd(h2_red);
RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red;
}
#else
#error "Error: Unsupported subgroup size"
#endif
barrier();
//////////////////////////////////////////////////////////////////////
//
// Final upsweep 0
//
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
// clang-format issue
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0);
#else
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0);
#endif
}
#elif (RS_SUBGROUP_SIZE == 8)
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#else
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#endif
}
#elif (RS_SUBGROUP_SIZE == 4)
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE;
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#else
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#endif
}
#else
#error "Error: Unsupported subgroup size"
#endif
#endif
}
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_

View File

@ -0,0 +1,48 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_
//
// Define various prefix limits
//
#define RS_PREFIX_LIMITS
//
// Multi-subgroup prefix requires shared memory.
//
#if (RS_WORKGROUP_SUBGROUPS > 1)
// clang-format off
#define RS_H_COMPONENTS (RS_RADIX_SIZE / RS_WORKGROUP_SIZE)
#define RS_SWEEP_0_SIZE (RS_RADIX_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_1_SIZE (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_2_SIZE (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_SIZE (RS_SWEEP_0_SIZE + RS_SWEEP_1_SIZE + RS_SWEEP_2_SIZE)
#define RS_S0_PASSES (RS_SWEEP_0_SIZE / RS_WORKGROUP_SIZE)
#define RS_S1_PASSES (RS_SWEEP_1_SIZE / RS_WORKGROUP_SIZE)
#define RS_SWEEP_0_OFFSET 0
#define RS_SWEEP_1_OFFSET (RS_SWEEP_0_OFFSET + RS_SWEEP_0_SIZE)
#define RS_SWEEP_2_OFFSET (RS_SWEEP_1_OFFSET + RS_SWEEP_1_SIZE)
// clang-format on
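//
// e.g. with RS_RADIX_SIZE == 256 and RS_SUBGROUP_SIZE == 8:
// RS_SWEEP_0_SIZE == 32, RS_SWEEP_1_SIZE == 4 and RS_SWEEP_2_SIZE == 0,
// for an RS_SWEEP_SIZE of 36 -- matching the sweep table in "prefix.h".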
//
// Single subgroup prefix doesn't use shared memory.
//
#else
#define RS_SWEEP_SIZE 0
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_

View File

@ -0,0 +1,263 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_
//
// There is a limit to the maximum number of keyvals that can be sorted because
// the top 2 bits in the atomic lookback counters are used as tag bits.
//
#define RS_MAX_KEYVALS ((1 << 30) - 1)
//
// Right now, the entire implementation is very much dependent on an 8-bit radix
// size. Most of the shaders attempt to honor this defined size but there are
// still a number of places where 256 is assumed.
//
#define RS_RADIX_LOG2 8
#define RS_RADIX_SIZE (1 << RS_RADIX_LOG2)
//
// LOOKBACK STATUS FLAGS
//
// The decoupled lookback status flags are stored in the two
// high bits of the count:
//
//   0                                   31
//   | REDUCTION OR PREFIX COUNT | STATUS |
//   +---------------------------+--------+
//   |             30            |    2   |
//
// This limits the keyval extent size to (2^30-1).
//
// Valid status flags are:
//
// EVEN PASS ODD PASS
// ----------------------- -----------------------
// 0 : invalid 0 : prefix available
// 1 : reduction available 1 : ---
// 2 : prefix available 2 : invalid
// 3 : --- 3 : reduction available
//
// Atomically adding +1 to a "reduction available" status results in a "prefix
// available" status.
//
// clang-format off
#define RS_PARTITION_STATUS_EVEN_INVALID (0u)
#define RS_PARTITION_STATUS_EVEN_REDUCTION (1u)
#define RS_PARTITION_STATUS_EVEN_PREFIX (2u)
#define RS_PARTITION_STATUS_ODD_INVALID (2u)
#define RS_PARTITION_STATUS_ODD_REDUCTION (3u)
#define RS_PARTITION_STATUS_ODD_PREFIX (0u)
// clang-format on
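//
// For illustration only (these helpers are not part of this header --
// RS_COUNT_MASK is a hypothetical name): packing and unpacking a
// lookback counter with the layout above would look like:
//
//   #define RS_COUNT_MASK ((1u << 30) - 1)
//
//   uint32_t pack(uint32_t status, uint32_t count) { return (status << 30) | (count & RS_COUNT_MASK); }
//   uint32_t status_of(uint32_t packed)            { return packed >> 30; }
//   uint32_t count_of(uint32_t packed)             { return packed & RS_COUNT_MASK; }
//
// Adding +1 to the status field maps EVEN_REDUCTION (1) to EVEN_PREFIX
// (2) and ODD_REDUCTION (3) to ODD_PREFIX (0, mod 4).
//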
//
// Arguments to indirectly launched shaders.
//
// struct rs_indirect_info_dispatch
// {
// u32vec4 pad;
// u32vec4 zero;
// u32vec4 histogram;
// u32vec4 scatter;
// };
//
// struct rs_indirect_info_fill
// {
// uint32_t block_offset;
// uint32_t dword_offset_min;
// uint32_t dword_offset_max_minus_min;
// uint32_t reserved; // padding for 16 bytes
// };
//
// struct rs_indirect_info
// {
// rs_indirect_info_fill pad;
// rs_indirect_info_fill zero;
// rs_indirect_info_dispatch dispatch;
// };
//
#define RS_STRUCT_INDIRECT_INFO_DISPATCH() \
struct rs_indirect_info_dispatch \
{ \
RS_STRUCT_MEMBER_STRUCT(u32vec4, pad) \
RS_STRUCT_MEMBER_STRUCT(u32vec4, zero) \
RS_STRUCT_MEMBER_STRUCT(u32vec4, histogram) \
RS_STRUCT_MEMBER_STRUCT(u32vec4, scatter) \
}
#define RS_STRUCT_INDIRECT_INFO_FILL() \
struct rs_indirect_info_fill \
{ \
RS_STRUCT_MEMBER(uint32_t, block_offset) \
RS_STRUCT_MEMBER(uint32_t, dword_offset_min) \
RS_STRUCT_MEMBER(uint32_t, dword_offset_max_minus_min) \
RS_STRUCT_MEMBER(uint32_t, reserved) \
}
#define RS_STRUCT_INDIRECT_INFO() \
RS_STRUCT_INDIRECT_INFO_DISPATCH(); \
RS_STRUCT_INDIRECT_INFO_FILL(); \
struct rs_indirect_info \
{ \
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_fill, pad) \
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_fill, zero) \
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_dispatch, dispatch) \
}
//
// Define the push constant structures shared by the host and device.
//
// INIT
// ----
// struct rs_push_init
// {
//   uint64_t devaddr_info;   // address of indirect info buffer
//   uint64_t devaddr_count;  // address of count buffer
//   uint32_t passes;         // number of passes
// };
//
// FILL
// ----
// struct rs_push_fill
// {
// uint64_t devaddr_info; // address of indirect info for fill shader
// uint64_t devaddr_dwords; // address of dwords extent
// uint32_t dword; // dword value used to fill the dwords extent
// };
//
// HISTOGRAM
// ---------
// struct rs_push_histogram
// {
// uint64_t devaddr_histograms; // address of histograms extent
// uint64_t devaddr_keyvals; // address of keyvals extent
// uint32_t passes; // number of passes
// };
//
// PREFIX
// ------
// struct rs_push_prefix
// {
// uint64_t devaddr_histograms; // address of histograms extent
// };
//
// SCATTER
// -------
// struct rs_push_scatter
// {
//   uint64_t devaddr_keyvals_even; // address of "even" keyvals
//   uint64_t devaddr_keyvals_odd;  // address of "odd" keyvals
//   uint64_t devaddr_partitions;   // address of partitions
//   uint64_t devaddr_histograms;   // address of pass histograms
//   uint32_t pass_offset;          // keyval pass offset
// };
//
#define RS_STRUCT_PUSH_INIT() \
struct rs_push_init \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_info) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_count) \
RS_STRUCT_MEMBER(uint32_t, passes) \
}
#define RS_STRUCT_PUSH_FILL() \
struct rs_push_fill \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_info) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_dwords) \
RS_STRUCT_MEMBER(uint32_t, dword) \
}
#define RS_STRUCT_PUSH_HISTOGRAM() \
struct rs_push_histogram \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals) \
RS_STRUCT_MEMBER(uint32_t, passes) \
}
#define RS_STRUCT_PUSH_PREFIX() \
struct rs_push_prefix \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
}
#define RS_STRUCT_PUSH_SCATTER() \
struct rs_push_scatter \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals_even) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals_odd) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_partitions) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
RS_STRUCT_MEMBER(uint32_t, pass_offset) \
}
////////////////////////////////////////////////////////////////////
//
// GLSL
//
#ifdef VULKAN // defined by GLSL/VK compiler
// clang-format off
#define RS_STRUCT_MEMBER(type_, name_) type_ name_;
#define RS_STRUCT_MEMBER_FARRAY(type_, len_, name_) type_ name_[len_];
#define RS_STRUCT_MEMBER_STRUCT(type_, name_) type_ name_;
// clang-format on
////////////////////////////////////////////////////////////////////
//
// C/C++
//
#else
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
#include <stdint.h>
struct u32vec4
{
uint32_t x;
uint32_t y;
uint32_t z;
uint32_t w;
};
// clang-format off
#define RS_DEVADDR uint64_t
#define RS_STRUCT_MEMBER(type_, name_) type_ name_;
#define RS_STRUCT_MEMBER_FARRAY(type_, len_, name_) type_ name_[len_];
#define RS_STRUCT_MEMBER_STRUCT(type_, name_) struct type_ name_;
// clang-format on
RS_STRUCT_PUSH_INIT();
RS_STRUCT_PUSH_FILL();
RS_STRUCT_PUSH_HISTOGRAM();
RS_STRUCT_PUSH_PREFIX();
RS_STRUCT_PUSH_SCATTER();
RS_STRUCT_INDIRECT_INFO();
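//
// e.g. in C, RS_STRUCT_PUSH_HISTOGRAM() above expands to:
//
//   struct rs_push_histogram
//   {
//     uint64_t devaddr_histograms;
//     uint64_t devaddr_keyvals;
//     uint32_t passes;
//   };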
//
//
//
#ifdef __cplusplus
}
#endif
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_

File diff suppressed because it is too large

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 0
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_EVEN_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_EVEN_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_EVEN_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_even
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_odd
// clang-format on
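//
// NOTE: the four scatter variants ping-pong the keyval buffers: "even"
// passes read devaddr_keyvals_even and write devaddr_keyvals_odd while
// "odd" passes swap the two, and RS_SCATTER_KEYVAL_DWORD_BASE selects
// which dword of the keyval the pass digit is extracted from.
//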
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 0
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_ODD_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_ODD_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_ODD_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_odd
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_even
// clang-format on
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 1
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_EVEN_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_EVEN_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_EVEN_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_even
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_odd
// clang-format on
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 1
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_ODD_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_ODD_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_ODD_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_odd
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_even
// clang-format on
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,57 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_
//
//
//
#include <stdint.h>
//
// This structure packages target-specific configuration parameters.
//
struct radix_sort_vk_target_config
{
uint32_t keyval_dwords;
struct
{
uint32_t workgroup_size_log2;
} init;
struct
{
uint32_t workgroup_size_log2;
} fill;
struct
{
uint32_t workgroup_size_log2;
uint32_t subgroup_size_log2;
uint32_t block_rows;
} histogram;
struct
{
uint32_t workgroup_size_log2;
uint32_t subgroup_size_log2;
} prefix;
struct
{
uint32_t workgroup_size_log2;
uint32_t subgroup_size_log2;
uint32_t block_rows;
} scatter;
};
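//
// Example configuration (illustration only -- the values match the AMD
// GCN3 "u64" target config in this series; init's workgroup size log2
// of 0 is an assumption based on init.comp's local_size_x = 1):
//
//   static const struct radix_sort_vk_target_config config = {
//     .keyval_dwords = 2,
//     .init      = { .workgroup_size_log2 = 0 },
//     .fill      = { .workgroup_size_log2 = 7 },
//     .histogram = { .workgroup_size_log2 = 8, .subgroup_size_log2 = 6, .block_rows = 14 },
//     .prefix    = { .workgroup_size_log2 = 8, .subgroup_size_log2 = 6 },
//     .scatter   = { .workgroup_size_log2 = 8, .subgroup_size_log2 = 6, .block_rows = 14 },
//   };
//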
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_

View File

@ -0,0 +1,34 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
//
//
//
// clang-format off
#define RS_KEYVAL_DWORDS 2
#define RS_FILL_WORKGROUP_SIZE_LOG2 7
#define RS_FILL_BLOCK_ROWS 8
#define RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 8
#define RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 6
#define RS_HISTOGRAM_BLOCK_ROWS 14
#define RS_PREFIX_WORKGROUP_SIZE_LOG2 8
#define RS_PREFIX_SUBGROUP_SIZE_LOG2 6
#define RS_SCATTER_WORKGROUP_SIZE_LOG2 8
#define RS_SCATTER_SUBGROUP_SIZE_LOG2 6
#define RS_SCATTER_BLOCK_ROWS 14
// clang-format on
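//
// With this config, a histogram or scatter workgroup covers
// (1 << 8) * 14 == 3584 keyvals per block, and a fill workgroup covers
// (1 << 7) * 8 == 1024 dwords per block.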
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_