radv: Add the fuchsia radix sort

Signed-off-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15648>
Konstantin Seurer 2022-04-07 15:39:52 +02:00
parent 9c020b525b
commit 5d9ef0efb5
33 changed files with 6658 additions and 2 deletions

meson.build

@@ -680,7 +680,7 @@ if with_gallium_d3d12 or with_microsoft_clc or with_microsoft_vk
endif
endif
-if with_vulkan_overlay_layer or with_aco_tests
+if with_vulkan_overlay_layer or with_aco_tests or with_amd_vk
prog_glslang = find_program('glslangValidator')
endif

src/amd/vulkan/meson.build

@@ -120,6 +120,9 @@ if with_llvm
)
endif
subdir('radix_sort')
libradv_files += radix_sort_files
radv_deps = []
radv_flags = cc.get_supported_arguments(['-Wimplicit-fallthrough', '-Wshadow'])
@@ -151,7 +154,7 @@ endif
libvulkan_radeon = shared_library(
'vulkan_radeon',
-[libradv_files, radv_entrypoints, sha1_h],
+[libradv_files, radv_entrypoints, sha1_h, radix_sort_spv],
vs_module_defs : vulkan_api_def,
include_directories : [
inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_amd, inc_amd_common, inc_amd_common_llvm, inc_compiler, inc_util,

src/amd/vulkan/radix_sort/LICENSE

@@ -0,0 +1,24 @@
Copyright 2019 The Fuchsia Authors.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

src/amd/vulkan/radix_sort/common/macros.h

@@ -0,0 +1,112 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_
//
//
//
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
//
// clang-format off
//
#define ARRAY_LENGTH_MACRO(x_) (sizeof(x_)/sizeof(x_[0]))
#define OFFSETOF_MACRO(t_,m_) offsetof(t_,m_)
#define MEMBER_SIZE_MACRO(t_,m_) sizeof(((t_*)0)->m_)
//
// FIXME(allanmac):
//
// Consider providing typed min/max() functions:
//
// <type> [min|max]_<type>(a,b) { ; }
//
// But note we still need preprocessor-time min/max().
//
#define MAX_MACRO(t_,a_,b_) (((a_) > (b_)) ? (a_) : (b_))
#define MIN_MACRO(t_,a_,b_) (((a_) < (b_)) ? (a_) : (b_))
//
//
//
#define BITS_TO_MASK_MACRO(n_) (((uint32_t)1<<(n_))-1)
#define BITS_TO_MASK_64_MACRO(n_) (((uint64_t)1<<(n_))-1)
#define BITS_TO_MASK_AT_MACRO(n_,b_) (BITS_TO_MASK_MACRO(n_) <<(b_))
#define BITS_TO_MASK_AT_64_MACRO(n_,b_) (BITS_TO_MASK_64_MACRO(n_)<<(b_))
//
//
//
#define STRINGIFY_MACRO_2(a_) #a_
#define STRINGIFY_MACRO(a_) STRINGIFY_MACRO_2(a_)
//
//
//
#define CONCAT_MACRO_2(a_,b_) a_ ## b_
#define CONCAT_MACRO(a_,b_) CONCAT_MACRO_2(a_,b_)
//
// Round up/down
//
#define ROUND_DOWN_MACRO(v_,q_) (((v_) / (q_)) * (q_))
#define ROUND_UP_MACRO(v_,q_) ((((v_) + (q_) - 1) / (q_)) * (q_))
//
// Round up/down when q is a power-of-two.
//
#define ROUND_DOWN_POW2_MACRO(v_,q_) ((v_) & ~((q_) - 1))
#define ROUND_UP_POW2_MACRO(v_,q_) ROUND_DOWN_POW2_MACRO((v_) + (q_) - 1, q_)
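//
// Illustrative worked example (not in the original header):
// ROUND_UP_MACRO(10, 4) == ((10 + 3) / 4) * 4 == 12, and for a
// power-of-two quantum ROUND_UP_POW2_MACRO(10, 4) == ((10 + 3) & ~3)
// == 12 as well -- the pow2 variants just trade the divide for a mask.
//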
//
//
//
#if defined (_MSC_VER) && !defined (__clang__)
#define STATIC_ASSERT_MACRO(c_,m_) static_assert(c_,m_)
#else
#define STATIC_ASSERT_MACRO(c_,m_) _Static_assert(c_,m_)
#endif
#define STATIC_ASSERT_MACRO_1(c_) STATIC_ASSERT_MACRO(c_,#c_)
//
//
//
#if defined (_MSC_VER) && !defined (__clang__)
#define POPCOUNT_MACRO(...) __popcnt(__VA_ARGS__)
#else
#define POPCOUNT_MACRO(...) __builtin_popcount(__VA_ARGS__)
#endif
//
//
//
#if defined (_MSC_VER) && !defined (__clang__)
#define ALIGN_MACRO(bytes_) __declspec(align(bytes_)) // only accepts integer as arg
#else
#include <stdalign.h>
#define ALIGN_MACRO(bytes_) alignas(bytes_)
#endif
//
// clang-format on
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_MACROS_H_

src/amd/vulkan/radix_sort/common/util.c

@@ -0,0 +1,90 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "util.h"
#include <assert.h>
//
//
//
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#endif
//
//
//
bool
is_pow2_u32(uint32_t n)
{
return n && !(n & (n - 1));
}
//
//
//
uint32_t
pow2_ru_u32(uint32_t n)
{
assert(n <= 0x80000000U);
n--;
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n++;
return n;
}
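//
// Illustrative: pow2_ru_u32(17) decrements to 16, the shifts smear the
// high bit down to give 31, and the increment yields 32. For n == 0 the
// decrement wraps to UINT32_MAX, the smear is a no-op, and the
// increment wraps back to 0.
//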
//
//
//
uint32_t
pow2_rd_u32(uint32_t n)
{
assert(n > 0);
return 1u << msb_idx_u32(n);
}
//
// ASSUMES NON-ZERO
//
uint32_t
msb_idx_u32(uint32_t n)
{
assert(n > 0);
#if defined(_MSC_VER) && !defined(__clang__)
uint32_t index;
_BitScanReverse((unsigned long *)&index, n);
return index;
#elif defined(__GNUC__)
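// For nonzero n, __builtin_clz(n) is in [0, 31], so (clz(n) ^ 31)
// equals (31 - clz(n)): the 0-based index of the most significant set
// bit.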
return __builtin_clz(n) ^ 31;
#else
#error "No msb_index()"
#endif
}
//
//
//

src/amd/vulkan/radix_sort/common/util.h

@@ -0,0 +1,59 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_
//
//
//
#include <stdbool.h>
#include <stdint.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
// Return true iff |n| is a power of 2.
bool
is_pow2_u32(uint32_t n);
// Return |n| rounded-up to the nearest power of 2.
// If |n| is zero then return 0.
// REQUIRES: |n <= 0x80000000|.
uint32_t
pow2_ru_u32(uint32_t n);
// Return |n| rounded-down to the nearest power of 2.
// REQUIRES: |n > 0|.
uint32_t
pow2_rd_u32(uint32_t n);
// Return the most-significant bit position for |n|.
// REQUIRES: |n > 0|.
uint32_t
msb_idx_u32(uint32_t n); // 0-based bit position
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_UTIL_H_

src/amd/vulkan/radix_sort/common/vk/assert.c

@@ -0,0 +1,108 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include <stdio.h>
#include <stdlib.h>
//
//
//
#include "assert.h"
//
//
//
#define VK_RESULT_TO_STRING(result) \
case result: \
return #result
//
// FIXME -- results and errors
//
char const *
vk_get_result_string(VkResult const result)
{
switch (result)
{
//
// Results
//
VK_RESULT_TO_STRING(VK_SUCCESS);
VK_RESULT_TO_STRING(VK_NOT_READY);
VK_RESULT_TO_STRING(VK_TIMEOUT);
VK_RESULT_TO_STRING(VK_EVENT_SET);
VK_RESULT_TO_STRING(VK_EVENT_RESET);
VK_RESULT_TO_STRING(VK_INCOMPLETE);
//
// Errors
//
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_HOST_MEMORY);
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_DEVICE_MEMORY);
VK_RESULT_TO_STRING(VK_ERROR_INITIALIZATION_FAILED);
VK_RESULT_TO_STRING(VK_ERROR_DEVICE_LOST);
VK_RESULT_TO_STRING(VK_ERROR_MEMORY_MAP_FAILED);
VK_RESULT_TO_STRING(VK_ERROR_LAYER_NOT_PRESENT);
VK_RESULT_TO_STRING(VK_ERROR_EXTENSION_NOT_PRESENT);
VK_RESULT_TO_STRING(VK_ERROR_FEATURE_NOT_PRESENT);
VK_RESULT_TO_STRING(VK_ERROR_INCOMPATIBLE_DRIVER);
VK_RESULT_TO_STRING(VK_ERROR_TOO_MANY_OBJECTS);
VK_RESULT_TO_STRING(VK_ERROR_FORMAT_NOT_SUPPORTED);
VK_RESULT_TO_STRING(VK_ERROR_FRAGMENTED_POOL);
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_POOL_MEMORY);
VK_RESULT_TO_STRING(VK_ERROR_INVALID_EXTERNAL_HANDLE);
VK_RESULT_TO_STRING(VK_ERROR_SURFACE_LOST_KHR);
VK_RESULT_TO_STRING(VK_ERROR_NATIVE_WINDOW_IN_USE_KHR);
VK_RESULT_TO_STRING(VK_SUBOPTIMAL_KHR);
VK_RESULT_TO_STRING(VK_ERROR_OUT_OF_DATE_KHR);
VK_RESULT_TO_STRING(VK_ERROR_INCOMPATIBLE_DISPLAY_KHR);
VK_RESULT_TO_STRING(VK_ERROR_VALIDATION_FAILED_EXT);
VK_RESULT_TO_STRING(VK_ERROR_INVALID_SHADER_NV);
VK_RESULT_TO_STRING(VK_ERROR_FRAGMENTATION_EXT);
VK_RESULT_TO_STRING(VK_ERROR_NOT_PERMITTED_EXT);
//
// Extensions: vk_xyz
//
default:
return "UNKNOWN VULKAN RESULT";
}
}
//
//
//
VkResult
vk_assert(VkResult const result, char const * const file, int const line, bool const is_abort)
{
if (result != VK_SUCCESS)
{
char const * const vk_result_str = vk_get_result_string(result);
fprintf(stderr,
"\"%s\", line %d: vk_assert( %d ) = \"%s\"\n",
file,
line,
result,
vk_result_str);
if (is_abort)
{
abort();
}
}
return result;
}
//
//
//

src/amd/vulkan/radix_sort/common/vk/assert.h

@@ -0,0 +1,52 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_
//
//
//
#include <stdbool.h>
#include <vulkan/vulkan.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
char const *
vk_get_result_string(VkResult const result);
VkResult
vk_assert(VkResult const result, char const * const file, int const line, bool const is_abort);
//
// clang-format off
//
#define vk(...) vk_assert((vk##__VA_ARGS__), __FILE__, __LINE__, true);
#define vk_ok(err) vk_assert(err, __FILE__, __LINE__, true);
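//
// Illustrative expansion: vk(CreateFence(device, &ci, NULL, &fence))
// token-pastes the "vk" prefix onto the call and becomes
// vk_assert(vkCreateFence(device, &ci, NULL, &fence), __FILE__,
// __LINE__, true).
//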
//
// clang-format on
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_ASSERT_H_

src/amd/vulkan/radix_sort/common/vk/barrier.c

@@ -0,0 +1,305 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
//
//
#include "barrier.h"
//
//
//
void
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
VK_ACCESS_SHADER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | //
VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_HOST_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_HOST_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_HOST_READ_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_HOST_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//
void
vk_memory_barrier(VkCommandBuffer cb,
VkPipelineStageFlags src_stage,
VkAccessFlags src_mask,
VkPipelineStageFlags dst_stage,
VkAccessFlags dst_mask)
{
VkMemoryBarrier const mb = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = src_mask,
.dstAccessMask = dst_mask };
vkCmdPipelineBarrier(cb, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL);
}
//
//
//
void
vk_barrier_debug(VkCommandBuffer cb)
{
static VkMemoryBarrier const mb = {
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = NULL,
.srcAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
VK_ACCESS_INDEX_READ_BIT | //
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | //
VK_ACCESS_UNIFORM_READ_BIT | //
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | //
VK_ACCESS_SHADER_READ_BIT | //
VK_ACCESS_SHADER_WRITE_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_TRANSFER_READ_BIT | //
VK_ACCESS_TRANSFER_WRITE_BIT | //
VK_ACCESS_HOST_READ_BIT | //
VK_ACCESS_HOST_WRITE_BIT,
.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT | //
VK_ACCESS_INDEX_READ_BIT | //
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | //
VK_ACCESS_UNIFORM_READ_BIT | //
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | //
VK_ACCESS_SHADER_READ_BIT | //
VK_ACCESS_SHADER_WRITE_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | //
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | //
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | //
VK_ACCESS_TRANSFER_READ_BIT | //
VK_ACCESS_TRANSFER_WRITE_BIT | //
VK_ACCESS_HOST_READ_BIT | //
VK_ACCESS_HOST_WRITE_BIT
};
vkCmdPipelineBarrier(cb,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0,
1,
&mb,
0,
NULL,
0,
NULL);
}
//
//
//

src/amd/vulkan/radix_sort/common/vk/barrier.h

@@ -0,0 +1,72 @@
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_
#define SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_
//
//
//
#include <vulkan/vulkan_core.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
void
vk_barrier_compute_w_to_compute_r(VkCommandBuffer cb);
void
vk_barrier_compute_w_to_transfer_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_to_compute_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_to_compute_w(VkCommandBuffer cb);
void
vk_barrier_compute_w_to_indirect_compute_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_compute_w_to_transfer_r(VkCommandBuffer cb);
void
vk_barrier_compute_w_to_host_r(VkCommandBuffer cb);
void
vk_barrier_transfer_w_to_host_r(VkCommandBuffer cb);
void
vk_memory_barrier(VkCommandBuffer cb,
VkPipelineStageFlags src_stage,
VkAccessFlags src_mask,
VkPipelineStageFlags dst_stage,
VkAccessFlags dst_mask);
void
vk_barrier_debug(VkCommandBuffer cb);
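//
// Typical usage (illustrative): record a compute dispatch that writes a
// buffer, call vk_barrier_compute_w_to_compute_r(cb), then record the
// dispatch that reads it. vk_barrier_debug() is a full-pipeline,
// all-access barrier intended only for diagnosing synchronization bugs.
//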
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_COMMON_VK_BARRIER_H_

src/amd/vulkan/radix_sort/meson.build

@@ -0,0 +1,40 @@
# Copyright © 2022 Konstantin Seurer
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
subdir('shaders')
radix_sort_files = files(
'common/vk/assert.c',
'common/vk/assert.h',
'common/vk/barrier.c',
'common/vk/barrier.h',
'common/macros.h',
'common/util.c',
'common/util.h',
'shaders/push.h',
'targets/u64/config.h',
'radix_sort_vk_devaddr.h',
'radix_sort_vk_ext.h',
'radix_sort_vk.c',
'radix_sort_vk.h',
'radv_radix_sort.c',
'radv_radix_sort.h',
'target.h'
)

src/amd/vulkan/radix_sort/radix_sort_vk.c

File diff suppressed because it is too large.

src/amd/vulkan/radix_sort/radix_sort_vk.h

@@ -0,0 +1,384 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_
//
//
//
#include <vulkan/vulkan_core.h>
//
//
//
#include <stdbool.h>
#include <stdint.h>
//
//
//
#include "target.h"
//
// Radix Sort Vk is a high-performance sorting library for Vulkan 1.2.
//
// The sorting function is both directly and indirectly dispatchable.
//
#ifdef __cplusplus
extern "C" {
#endif
//
// Get a Radix Sort target's Vulkan requirements.
//
// A Radix Sort target is a binary image containing configuration parameters and
// a bundle of SPIR-V modules.
//
// Targets are prebuilt and specific to a particular device vendor, architecture
// and key-val configuration.
//
// A Radix Sort instance can only be created with a VkDevice that is initialized
// with all of the target's required extensions and features.
//
// The `radix_sort_vk_target_get_requirements()` function yields the extensions
// and initialized feature flags required by a Radix Sort target.
//
// These requirements can be merged with other Vulkan library requirements
// before VkDevice creation.
//
// If the `.ext_names` member is NULL, the `.ext_name_count` member will be
// initialized.
//
// Returns `false` if:
//
// * The .ext_names field is NULL and the number of required extensions is
// greater than zero.
// * The .ext_name_count is less than the number of required
// extensions.
// * Any of the .pdf, .pdf11 or .pdf12 members are NULL.
//
// Otherwise, returns true.
//
typedef struct radix_sort_vk_target radix_sort_vk_target_t;
//
// NOTE: The library currently supports uint32_t and uint64_t keyvals.
//
#define RS_KV_DWORDS_MAX 2
//
//
//
struct rs_pipeline_layout_scatter
{
VkPipelineLayout even;
VkPipelineLayout odd;
};
struct rs_pipeline_scatter
{
VkPipeline even;
VkPipeline odd;
};
//
//
//
struct rs_pipeline_layouts_named
{
VkPipelineLayout init;
VkPipelineLayout fill;
VkPipelineLayout histogram;
VkPipelineLayout prefix;
struct rs_pipeline_layout_scatter scatter[RS_KV_DWORDS_MAX];
};
struct rs_pipelines_named
{
VkPipeline init;
VkPipeline fill;
VkPipeline histogram;
VkPipeline prefix;
struct rs_pipeline_scatter scatter[RS_KV_DWORDS_MAX];
};
// clang-format off
#define RS_PIPELINE_LAYOUTS_HANDLES (sizeof(struct rs_pipeline_layouts_named) / sizeof(VkPipelineLayout))
#define RS_PIPELINES_HANDLES (sizeof(struct rs_pipelines_named) / sizeof(VkPipeline))
// clang-format on
//
//
//
struct radix_sort_vk
{
struct radix_sort_vk_target_config config;
union
{
struct rs_pipeline_layouts_named named;
VkPipelineLayout handles[RS_PIPELINE_LAYOUTS_HANDLES];
} pipeline_layouts;
union
{
struct rs_pipelines_named named;
VkPipeline handles[RS_PIPELINES_HANDLES];
} pipelines;
struct
{
struct
{
VkDeviceSize offset;
VkDeviceSize range;
} histograms;
struct
{
VkDeviceSize offset;
} partitions;
} internal;
};
//
// Create a Radix Sort instance for a target.
//
// Keyval size is implicitly determined by the target.
//
// Returns NULL on failure.
//
typedef struct radix_sort_vk radix_sort_vk_t;
//
//
//
radix_sort_vk_t *
radix_sort_vk_create(VkDevice device,
VkAllocationCallbacks const * ac,
VkPipelineCache pc,
const uint32_t* const* spv,
const uint32_t* spv_sizes,
struct radix_sort_vk_target_config config);
//
// Destroy the Radix Sort instance using the same device and allocator used at
// creation.
//
void
radix_sort_vk_destroy(radix_sort_vk_t * rs, //
VkDevice d,
VkAllocationCallbacks const * ac);
//
// Returns the buffer size and alignment requirements for a maximum number of
// keyvals.
//
// The radix sort implementation is not an in-place sorting algorithm so two
// non-overlapping keyval buffers are required that are at least
// `.keyvals_size`.
//
// The radix sort instance also requires an `internal` buffer during sorting.
//
// If the indirect dispatch sorting function is used, then an `indirect` buffer
// is also required.
//
// The alignment requirements for the keyval, internal, and indirect buffers
// must be honored. All alignments are power of 2.
//
// Input:
// count : Maximum number of keyvals
//
// Outputs:
// keyval_size : Size of a single keyval
//
// keyvals_size : Minimum size of the even and odd keyval buffers
// keyvals_alignment : Alignment of each keyval buffer
//
// internal_size : Minimum size of internal buffer
// internal_aligment : Alignment of the internal buffer
//
// indirect_size : Minimum size of indirect buffer
// indirect_aligment : Alignment of the indirect buffer
//
// .keyvals_even/odd
// -----------------
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
//
// .internal
// ---------
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
// VK_BUFFER_USAGE_TRANSFER_DST_BIT ("direct" mode only)
//
// .indirect
// ---------
// VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
// VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
// VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT
//
typedef struct radix_sort_vk_memory_requirements
{
VkDeviceSize keyval_size;
VkDeviceSize keyvals_size;
VkDeviceSize keyvals_alignment;
VkDeviceSize internal_size;
VkDeviceSize internal_alignment;
VkDeviceSize indirect_size;
VkDeviceSize indirect_alignment;
} radix_sort_vk_memory_requirements_t;
void
radix_sort_vk_get_memory_requirements(radix_sort_vk_t const * rs,
uint32_t count,
radix_sort_vk_memory_requirements_t * mr);
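//
// Illustrative usage sketch (buffer creation elided):
//
//   radix_sort_vk_memory_requirements_t mr;
//
//   radix_sort_vk_get_memory_requirements(rs, count, &mr);
//
//   // Create two keyval buffers of at least mr.keyvals_size bytes
//   // aligned to mr.keyvals_alignment, plus an internal buffer of
//   // mr.internal_size bytes, using the usage flags listed above.
//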
//
// Direct dispatch sorting
// -----------------------
//
// Using a key size of `key_bits`, sort `count` keyvals found in the
// `.devaddr_keyvals_even` buffer.
//
// Each internal sorting pass copies the keyvals from one keyvals buffer to the
// other.
//
// The number of internal sorting passes is determined by `.key_bits`.
//
// If an even number of internal sorting passes is required, the sorted keyvals
// will be found in the "even" keyvals buffer. Otherwise, the sorted keyvals
// will be found in the "odd" keyvals buffer.
//
// Which buffer has the sorted keyvals is returned in `keyvals_sorted`.
//
// A keyval's `key_bits` are the most significant bits of a keyval.
//
// The maximum number of key bits is determined by the keyval size.
//
// The keyval count must be less than (1 << 30) as well as be less than or equal
// to the count used to obtain the memory requirements.
//
// The info struct's `ext` member must be NULL.
//
// This function appends push constants, dispatch commands, and barriers.
//
// Pipeline barriers should be applied as necessary, both before and after
// invoking this function.
//
// The sort begins with either a TRANSFER/WRITE or a COMPUTE/READ to the
// `internal` and `keyvals_even` buffers.
//
// The sort ends with a COMPUTE/WRITE to the `internal` and `keyvals_sorted`
// buffers.
//
//
// Direct dispatch sorting using VkDescriptorBufferInfo structures
// ---------------------------------------------------------------
//
typedef struct radix_sort_vk_sort_info
{
void * ext;
uint32_t key_bits;
uint32_t count;
VkDescriptorBufferInfo keyvals_even;
VkDescriptorBufferInfo keyvals_odd;
VkDescriptorBufferInfo internal;
} radix_sort_vk_sort_info_t;
void
radix_sort_vk_sort(radix_sort_vk_t const * rs,
radix_sort_vk_sort_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDescriptorBufferInfo * keyvals_sorted);
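//
// Illustrative usage sketch (assumes `even`, `odd` and `internal` are
// VkDescriptorBufferInfo structures sized per the memory requirements):
//
//   radix_sort_vk_sort_info_t const info = {
//     .ext          = NULL,
//     .key_bits     = 32,
//     .count        = count,
//     .keyvals_even = even,
//     .keyvals_odd  = odd,
//     .internal     = internal,
//   };
//
//   VkDescriptorBufferInfo keyvals_sorted;
//
//   radix_sort_vk_sort(rs, &info, device, cb, &keyvals_sorted);
//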
//
// Indirect dispatch sorting
// -------------------------
//
// Using a key size of `key_bits`, at pipeline execution time, load keyvals
// count from `devaddr_count` and sorts the keyvals in `.devaddr_keyvals_even`.
//
// Each internal sorting pass copies the keyvals from one keyvals buffer to the
// other.
//
// The number of internal sorting passes is determined by `.key_bits`.
//
// If an even number of internal sorting passes is required, the sorted keyvals
// will be found in the "even" keyvals buffer. Otherwise, the sorted keyvals
// will be found in the "odd" keyvals buffer.
//
// Which buffer has the sorted keyvals is returned in `keyvals_sorted`.
//
// A keyval's `key_bits` are the most significant bits of a keyval.
//
// The keyval count must be less than (1 << 30) as well as be less than or equal
// to the count used to obtain the memory requirements.
//
// The info struct's `ext` member must be NULL.
//
// This function appends push constants, dispatch commands, and barriers.
//
// Pipeline barriers should be applied as necessary, both before and after
// invoking this function.
//
// The indirect radix sort begins with a COMPUTE/READ from the `count` buffer
// and ends with a COMPUTE/WRITE to the `internal` and the `keyvals_sorted`
// buffers.
//
// The `indirect` buffer must support USAGE_INDIRECT.
//
// The `count` buffer must be at least 4 bytes and 4-byte aligned.
//
//
// Indirect dispatch sorting using VkDescriptorBufferInfo structures
// -----------------------------------------------------------------
//
typedef struct radix_sort_vk_sort_indirect_info
{
void * ext;
uint32_t key_bits;
VkDescriptorBufferInfo count;
VkDescriptorBufferInfo keyvals_even;
VkDescriptorBufferInfo keyvals_odd;
VkDescriptorBufferInfo internal;
VkDescriptorBufferInfo indirect;
} radix_sort_vk_sort_indirect_info_t;
void
radix_sort_vk_sort_indirect(radix_sort_vk_t const * rs,
radix_sort_vk_sort_indirect_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDescriptorBufferInfo * keyvals_sorted);
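//
// Illustrative usage sketch: identical to the direct path except that a
// 4-byte `count` buffer and an `indirect` buffer are supplied, with the
// keyval count read at pipeline execution time:
//
//   radix_sort_vk_sort_indirect_info_t const info = {
//     .ext = NULL, .key_bits = 64, .count = count_info,
//     .keyvals_even = even, .keyvals_odd = odd,
//     .internal = internal, .indirect = indirect,
//   };
//
//   radix_sort_vk_sort_indirect(rs, &info, device, cb, &keyvals_sorted);
//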
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_H_

src/amd/vulkan/radix_sort/radix_sort_vk_devaddr.h

@@ -0,0 +1,104 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_
//
//
//
#include "radix_sort_vk.h"
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
// Structure that enables integration with Vulkan drivers.
//
typedef struct radix_sort_vk_buffer_info
{
VkBuffer buffer;
VkDeviceSize offset;
VkDeviceAddress devaddr;
} radix_sort_vk_buffer_info_t;
//
// Function prototypes
//
typedef void (*radix_sort_vk_fill_buffer_pfn)(VkCommandBuffer cb,
radix_sort_vk_buffer_info_t const * buffer_info,
VkDeviceSize offset,
VkDeviceSize size,
uint32_t data);
typedef void (*radix_sort_vk_dispatch_indirect_pfn)(VkCommandBuffer cb,
radix_sort_vk_buffer_info_t const * buffer_info,
VkDeviceSize offset);
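//
// These callbacks let a driver route the sort's buffer fills and
// indirect dispatches through its own command-recording paths instead
// of the public vkCmdFillBuffer()/vkCmdDispatchIndirect() entrypoints.
//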
//
// Direct dispatch sorting using buffer device addresses
// -----------------------------------------------------
//
typedef struct radix_sort_vk_sort_devaddr_info
{
void * ext;
uint32_t key_bits;
uint32_t count;
radix_sort_vk_buffer_info_t keyvals_even;
VkDeviceAddress keyvals_odd;
radix_sort_vk_buffer_info_t internal;
radix_sort_vk_fill_buffer_pfn fill_buffer;
} radix_sort_vk_sort_devaddr_info_t;
void
radix_sort_vk_sort_devaddr(radix_sort_vk_t const * rs,
radix_sort_vk_sort_devaddr_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDeviceAddress * keyvals_sorted);
//
// Indirect dispatch sorting using buffer device addresses
// -------------------------------------------------------
//
// clang-format off
//
typedef struct radix_sort_vk_sort_indirect_devaddr_info
{
void * ext;
uint32_t key_bits;
VkDeviceAddress count;
VkDeviceAddress keyvals_even;
VkDeviceAddress keyvals_odd;
VkDeviceAddress internal;
radix_sort_vk_buffer_info_t indirect;
radix_sort_vk_dispatch_indirect_pfn dispatch_indirect;
} radix_sort_vk_sort_indirect_devaddr_info_t;
void
radix_sort_vk_sort_indirect_devaddr(radix_sort_vk_t const * rs,
radix_sort_vk_sort_indirect_devaddr_info_t const * info,
VkDevice device,
VkCommandBuffer cb,
VkDeviceAddress * keyvals_sorted);
//
// clang-format on
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_INCLUDE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_DEVADDR_H_

src/amd/vulkan/radix_sort/radix_sort_vk_ext.h

@@ -0,0 +1,77 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_
//
//
//
#include <vulkan/vulkan_core.h>
//
//
//
#include <stdbool.h>
#include <stdint.h>
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
//
// Radix sort extensions
// ---------------------
//
#ifndef RADIX_SORT_VK_DISABLE_EXTENSIONS
//
// Extension types
//
enum radix_sort_vk_ext_type
{
RADIX_SORT_VK_EXT_TIMESTAMPS
};
//
// Timestamp each logical step of the algorithm
//
// The number of timestamps depends on the dispatch type:
//
// * direct dispatch: 4 + subpass count
// * indirect dispatch: 5 + subpass count
//
// Indirect / 32-bit keyvals: 5 + 4 subpasses = 9
// Indirect / 64-bit keyvals: 5 + 8 subpasses = 13
//
struct radix_sort_vk_ext_timestamps
{
void * ext;
enum radix_sort_vk_ext_type type;
uint32_t timestamp_count;
VkQueryPool timestamps;
uint32_t timestamps_set;
};
#endif
//
//
//
#ifdef __cplusplus
}
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_RADIX_SORT_VK_EXT_H_

src/amd/vulkan/radv_radix_sort.c

@@ -0,0 +1,193 @@
/*
* Copyright © 2022 Konstantin Seurer
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "radv_radix_sort.h"
#include "targets/u64/config.h"
#include "radv_private.h"
#include "target.h"
static const uint32_t init_spv[] = {
#include "radix_sort/shaders/init.comp.spv.h"
};
static const uint32_t fill_spv[] = {
#include "radix_sort/shaders/fill.comp.spv.h"
};
static const uint32_t histogram_spv[] = {
#include "radix_sort/shaders/histogram.comp.spv.h"
};
static const uint32_t prefix_spv[] = {
#include "radix_sort/shaders/prefix.comp.spv.h"
};
static const uint32_t scatter_0_even_spv[] = {
#include "radix_sort/shaders/scatter_0_even.comp.spv.h"
};
static const uint32_t scatter_0_odd_spv[] = {
#include "radix_sort/shaders/scatter_0_odd.comp.spv.h"
};
static const uint32_t scatter_1_even_spv[] = {
#include "radix_sort/shaders/scatter_1_even.comp.spv.h"
};
static const uint32_t scatter_1_odd_spv[] = {
#include "radix_sort/shaders/scatter_1_odd.comp.spv.h"
};
static const struct radix_sort_vk_target_config target_config = {
.keyval_dwords = RS_KEYVAL_DWORDS,
.histogram =
{
.workgroup_size_log2 = RS_HISTOGRAM_WORKGROUP_SIZE_LOG2,
.subgroup_size_log2 = RS_HISTOGRAM_SUBGROUP_SIZE_LOG2,
.block_rows = RS_HISTOGRAM_BLOCK_ROWS,
},
.prefix =
{
.workgroup_size_log2 = RS_PREFIX_WORKGROUP_SIZE_LOG2,
.subgroup_size_log2 = RS_PREFIX_SUBGROUP_SIZE_LOG2,
},
.scatter =
{
.workgroup_size_log2 = RS_SCATTER_WORKGROUP_SIZE_LOG2,
.subgroup_size_log2 = RS_SCATTER_SUBGROUP_SIZE_LOG2,
.block_rows = RS_SCATTER_BLOCK_ROWS,
},
};
radix_sort_vk_t *
radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac, VkPipelineCache pc)
{
const uint32_t *spv[8] = {
init_spv, fill_spv, histogram_spv, prefix_spv,
scatter_0_even_spv, scatter_0_odd_spv, scatter_1_even_spv, scatter_1_odd_spv,
};
const uint32_t spv_sizes[8] = {
sizeof(init_spv), sizeof(fill_spv), sizeof(histogram_spv),
sizeof(prefix_spv), sizeof(scatter_0_even_spv), sizeof(scatter_0_odd_spv),
sizeof(scatter_1_even_spv), sizeof(scatter_1_odd_spv),
};
return radix_sort_vk_create(device, ac, pc, spv, spv_sizes, target_config);
}
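/* The radix sort library records commands through the global Vulkan
 * entrypoints (vkCreateShaderModule, vkCmdDispatch, ...). Defining
 * those symbols here forwards the calls into RADV's own dispatch table
 * so the library can run inside the driver.
 */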
VKAPI_ATTR VkResult VKAPI_CALL
vkCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.CreateShaderModule(device, pCreateInfo, pAllocator,
pShaderModule);
}
VKAPI_ATTR void VKAPI_CALL
vkDestroyShaderModule(VkDevice device, VkShaderModule shaderModule,
const VkAllocationCallbacks *pAllocator)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
pdevice->vk.dispatch_table.DestroyShaderModule(device, shaderModule, pAllocator);
}
VKAPI_ATTR VkResult VKAPI_CALL
vkCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.CreatePipelineLayout(device, pCreateInfo, pAllocator,
pPipelineLayout);
}
VKAPI_ATTR void VKAPI_CALL
vkDestroyPipelineLayout(VkDevice device, VkPipelineLayout pipelineLayout,
const VkAllocationCallbacks *pAllocator)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
pdevice->vk.dispatch_table.DestroyPipelineLayout(device, pipelineLayout, pAllocator);
}
VKAPI_ATTR VkResult VKAPI_CALL
vkCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t createInfoCount,
const VkComputePipelineCreateInfo *pCreateInfos,
const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.CreateComputePipelines(device, pipelineCache, createInfoCount,
pCreateInfos, pAllocator, pPipelines);
}
VKAPI_ATTR void VKAPI_CALL
vkDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator)
{
RADV_FROM_HANDLE(radv_device, pdevice, device);
return pdevice->vk.dispatch_table.DestroyPipeline(device, pipeline, pAllocator);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags,
uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
uint32_t bufferMemoryBarrierCount,
const VkBufferMemoryBarrier *pBufferMemoryBarriers,
uint32_t imageMemoryBarrierCount,
const VkImageMemoryBarrier *pImageMemoryBarriers)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdPipelineBarrier(
commandBuffer, srcStageMask, dstStageMask, dependencyFlags, memoryBarrierCount,
pMemoryBarriers, bufferMemoryBarrierCount, pBufferMemoryBarriers, imageMemoryBarrierCount,
pImageMemoryBarriers);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
const void *pValues)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdPushConstants(commandBuffer, layout, stageFlags, offset,
size, pValues);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
VkPipeline pipeline)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdBindPipeline(commandBuffer, pipelineBindPoint,
pipeline);
}
VKAPI_ATTR void VKAPI_CALL
vkCmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX, uint32_t groupCountY,
uint32_t groupCountZ)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
cmd_buffer->device->vk.dispatch_table.CmdDispatch(commandBuffer, groupCountX, groupCountY,
groupCountZ);
}

src/amd/vulkan/radv_radix_sort.h

@@ -0,0 +1,32 @@
/*
* Copyright © 2022 Konstantin Seurer
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef RADV_RADIX_SORT_H
#define RADV_RADIX_SORT_H
#include "radix_sort_vk_devaddr.h"
radix_sort_vk_t *radv_create_radix_sort_u64(VkDevice device, VkAllocationCallbacks const *ac,
VkPipelineCache pc);
#endif

src/amd/vulkan/radix_sort/shaders/bufref.h

@@ -0,0 +1,151 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_
//
// GLSL
//
#ifdef VULKAN // defined by GLSL/VK compiler
#extension GL_EXT_shader_explicit_arithmetic_types : require
//
// If the target does not support VkPhysicalDeviceFeatures.shaderInt64
// then:
//
// #define RS_DISABLE_SHADER_INT64
//
// clang-format off
#ifdef RS_DISABLE_SHADER_INT64
#extension GL_EXT_buffer_reference_uvec2 : require
#else
#extension GL_EXT_buffer_reference2 : require
#endif
// clang-format on
//
// Restrict shouldn't have any noticeable impact on these kernels and
// benchmarks appear to prove that true but it's correct to include
// the qualifier.
//
#define RS_RESTRICT restrict
//
// If the device doesn't support .shaderInt64 then the buffer reference address
// is a uvec2.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_DEVADDR u32vec2
#else
#define RS_DEVADDR uint64_t
#endif
//
// Define a buffer reference.
//
#define RS_BUFREF_DEFINE(_layout, _name, _devaddr) RS_RESTRICT _layout _name = _layout(_devaddr)
//
// Define a buffer reference at a UINT32 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_DEFINE_AT_OFFSET_UINT32(_layout, _name, _devaddr_u32vec2, _offset) \
RS_RESTRICT _layout _name; \
{ \
u32vec2 devaddr; \
uint32_t carry; \
\
devaddr.x = uaddCarry(_devaddr_u32vec2.x, _offset, carry); \
devaddr.y = _devaddr_u32vec2.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_DEFINE_AT_OFFSET_UINT32(_layout, _name, _devaddr, _offset) \
RS_RESTRICT _layout _name = _layout(_devaddr + _offset)
#endif
//
// Define a buffer reference at a packed UINT64 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(_layout, _name, _devaddr_u32vec2, _offset_u32vec2) \
RS_RESTRICT _layout _name; \
{ \
u32vec2 devaddr; \
uint32_t carry; \
\
devaddr.x = uaddCarry(_devaddr_u32vec2.x, _offset_u32vec2.x, carry); \
devaddr.y = _devaddr_u32vec2.y + _offset_u32vec2.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(_layout, _name, _devaddr, _offset_u32vec2) \
RS_RESTRICT _layout _name = _layout(_devaddr + pack64(_offset_u32vec2))
#endif
//
// Increment the buffer reference by a UINT32 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_INC_UINT32(_layout, _name, _inc) \
{ \
u32vec2 devaddr = u32vec2(_name); \
uint32_t carry; \
\
devaddr.x = uaddCarry(devaddr.x, _inc, carry); \
devaddr.y = devaddr.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_INC_UINT32(_layout, _name, _inc) _name = _layout(uint64_t(_name) + _inc)
#endif
//
// Increment the buffer reference by a packed UINT64 offset.
//
#ifdef RS_DISABLE_SHADER_INT64
#define RS_BUFREF_INC_U32VEC2(_layout, _name, _inc_u32vec2) \
{ \
u32vec2 devaddr = u32vec2(_name); \
uint32_t carry; \
\
devaddr.x = uaddCarry(devaddr.x, _inc_u32vec2.x, carry); \
devaddr.y = devaddr.y + _inc_u32vec2.y + carry; \
\
_name = _layout(devaddr); \
}
#else
#define RS_BUFREF_INC_U32VEC2(_layout, _name, _inc_u32vec2) \
_name = _layout(uint64_t(_name) + pack64(_inc_u32vec2))
#endif
//
// Increment the buffer reference by the product of two UINT32 factors.
//
#define RS_BUFREF_INC_UINT32_UINT32(_layout, _name, _inc_a, _inc_b) \
{ \
u32vec2 inc; \
\
umulExtended(_inc_a, _inc_b, inc.y, inc.x); \
\
RS_BUFREF_INC_U32VEC2(_layout, _name, inc); \
}
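//
// Illustrative: advancing a bufref by keyval_idx * keyval_size uses
// umulExtended() to form the full 64-bit product as (inc.y:inc.x), and
// RS_BUFREF_INC_U32VEC2() then adds it with explicit carry propagation
// when 64-bit shader integers are unavailable.
//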
//
//
//
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_BUFREF_H_

src/amd/vulkan/radix_sort/shaders/fill.comp

@@ -0,0 +1,143 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Each workgroup fills up to RS_BLOCK_KEYVALS
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
// clang-format on
//
// Load arch/keyval configuration
//
#include "config.h"
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
// Subgroup uniform support
//
#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Declare the push constants
//
RS_STRUCT_PUSH_FILL();
layout(push_constant) uniform block_push
{
rs_push_fill push;
};
//
// The "init" shader configures the fill info structure.
//
RS_STRUCT_INDIRECT_INFO_FILL();
//
// Check all switches are defined
//
#ifndef RS_FILL_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_FILL_WORKGROUP_SIZE_LOG2"
#endif
//
#ifndef RS_FILL_BLOCK_ROWS
#error "Undefined: RS_FILL_BLOCK_ROWS"
#endif
//
// Local macros
//
// clang-format off
#define RS_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
#define RS_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_WORKGROUP_SIZE)
#define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1)
// clang-format on
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
//
//
//
layout(buffer_reference, std430) buffer buffer_rs_indirect_info_fill
{
rs_indirect_info_fill info;
};
layout(buffer_reference, std430) buffer buffer_rs_dwords
{
uint32_t extent[];
};
//
//
//
void
main()
{
//
// Define indirect info bufref for the fill
//
readonly RS_BUFREF_DEFINE(buffer_rs_indirect_info_fill, rs_info, push.devaddr_info);
RS_SUBGROUP_UNIFORM const rs_indirect_info_fill info = rs_info.info;
//
// Define dwords bufref
//
// Assumes fewer than 2^32-1 dwords; the dword index is then
// extended-multiplied by the dword size.
//
// Assumes push.devaddr_dwords is suitably aligned to RS_BLOCK_DWORDS
// -- alignment at a subgroup or transaction size is fine.
//
const uint32_t dwords_idx =
(info.block_offset + gl_WorkGroupID.x) * RS_BLOCK_DWORDS + gl_LocalInvocationID.x;
u32vec2 dwords_offset;
umulExtended(dwords_idx, 4, dwords_offset.y, dwords_offset.x);
writeonly RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_dwords,
rs_dwords,
push.devaddr_dwords,
dwords_offset);
//
// Fills are always aligned to RS_BLOCK_KEYVALS
//
// ((v >= min) && (v < max)) == ((v - min) < (max - min))
//
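// (Unsigned arithmetic makes the single comparison exact: if v < min,
//  then (v - min) wraps to a value >= (max - min) and fails the test.)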
const uint32_t row_idx = dwords_idx - info.dword_offset_min;
[[unroll]] for (uint32_t ii = 0; ii < RS_FILL_BLOCK_ROWS; ii++)
{
if (row_idx + (ii * RS_WORKGROUP_SIZE) < info.dword_offset_max_minus_min)
{
rs_dwords.extent[ii * RS_WORKGROUP_SIZE] = push.dword;
}
}
}
//
//
//

src/amd/vulkan/radix_sort/shaders/histogram.comp

@@ -0,0 +1,449 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Produce multiple radix size histograms from the keyvals.
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
#extension GL_KHR_shader_subgroup_basic : require
// clang-format on
//
//
//
#include "config.h"
//
// Optional switches:
//
// #define RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
// #define RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
//
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
// Push constants for histogram shader
//
RS_STRUCT_PUSH_HISTOGRAM();
layout(push_constant) uniform block_push
{
rs_push_histogram push;
};
//
// Subgroup uniform support
//
#if defined(RS_HISTOGRAM_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Check all switches are defined
//
// What's the size of the keyval?
#ifndef RS_KEYVAL_DWORDS
#error "Undefined: RS_KEYVAL_DWORDS"
#endif
//
#ifndef RS_HISTOGRAM_BLOCK_ROWS
#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS"
#endif
//
#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2"
#endif
//
#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2"
#endif
//
// Local macros
//
// clang-format off
#define RS_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_SUBGROUP_SIZE (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
#define RS_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE)
#define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4)
#define RS_RADIX_MASK ((1 << RS_RADIX_LOG2) - 1)
// clang-format on
//
// Keyval type
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KEYVAL_TYPE uint32_t
#elif (RS_KEYVAL_DWORDS == 2)
#define RS_KEYVAL_TYPE u32vec2
#else
#error "Unsupported RS_KEYVAL_DWORDS"
#endif
//
// Histogram offset depends on number of workgroups.
//
#define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * pass_)
#if (RS_WORKGROUP_SUBGROUPS == 1)
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4)
#else
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4)
#endif
//
// Assumes (RS_RADIX_LOG2 == 8)
//
// Error if this ever changes
//
#if (RS_RADIX_LOG2 != 8)
#error "(RS_RADIX_LOG2 != 8)"
#endif
//
// Is bitfield extract faster?
//
#ifdef RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
//----------------------------------------------------------------------
//
// Extract a keyval digit
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) bitfieldExtract(kv_, pass_ * RS_RADIX_LOG2, RS_RADIX_LOG2)
#else
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) \
bitfieldExtract(kv_[pass_ / 4], (pass_ & 3) * RS_RADIX_LOG2, RS_RADIX_LOG2)
#endif
//----------------------------------------------------------------------
#else
//----------------------------------------------------------------------
//
// Extract a keyval digit
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) ((kv_ >> (pass_ * RS_RADIX_LOG2)) & RS_RADIX_MASK)
#else
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) \
((kv_[pass_ / 4] >> ((pass_ & 3) * RS_RADIX_LOG2)) & RS_RADIX_MASK)
#endif
//----------------------------------------------------------------------
#endif
//
//
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
struct rs_histogram_smem
{
uint32_t histogram[RS_RADIX_SIZE];
};
shared rs_histogram_smem smem;
#endif
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
//
//
//
layout(buffer_reference, std430) buffer buffer_rs_kv
{
RS_KEYVAL_TYPE extent[];
};
layout(buffer_reference, std430) buffer buffer_rs_histograms
{
uint32_t extent[];
};
//
// Shared memory functions
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
//
// NOTE: Must use same access pattern as rs_histogram_global_store()
//
void
rs_histogram_zero()
{
//
// Zero SMEM histogram
//
#if (RS_WORKGROUP_SUBGROUPS == 1)
const uint32_t smem_offset = gl_SubgroupInvocationID;
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
{
smem.histogram[smem_offset + ii] = 0;
}
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
const uint32_t smem_offset = gl_LocalInvocationID.x;
[[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
{
smem.histogram[smem_offset + ii] = 0;
}
const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);
if (smem_idx < RS_RADIX_SIZE)
{
smem.histogram[smem_idx] = 0;
}
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
{
smem.histogram[gl_LocalInvocationID.x] = 0;
}
#endif
}
//
// NOTE: Must use same access pattern as rs_histogram_zero()
//
void
rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms)
{
//
// Store to GMEM
//
#if (RS_WORKGROUP_SUBGROUPS == 1)
const uint32_t smem_offset = gl_SubgroupInvocationID;
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
{
const uint32_t count = smem.histogram[smem_offset + ii];
atomicAdd(rs_histograms.extent[ii], count);
}
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
const uint32_t smem_offset = gl_LocalInvocationID.x;
[[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
{
const uint32_t count = smem.histogram[smem_offset + ii];
atomicAdd(rs_histograms.extent[ii], count);
}
const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);
if (smem_idx < RS_RADIX_SIZE)
{
const uint32_t count = smem.histogram[smem_idx];
atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)],
count);
}
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
{
const uint32_t count = smem.histogram[gl_LocalInvocationID.x];
atomicAdd(rs_histograms.extent[0], count);
}
#endif
}
#endif
//
//
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
void
rs_histogram_atomic_after_write()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
subgroupMemoryBarrierShared();
#else
barrier();
#endif
}
void
rs_histogram_read_after_atomic()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
subgroupMemoryBarrierShared();
#else
barrier();
#endif
}
#endif
//
//
//
void
main()
{
//
// Which subgroups have work?
//
RS_KEYVAL_TYPE kv[RS_HISTOGRAM_BLOCK_ROWS];
//
// Define kv_in bufref
//
// Assumes fewer than 2^30-1 keys; the keyval index is then
// extended-multiplied by the keyval size.
//
u32vec2 kv_in_offset;
umulExtended(gl_WorkGroupID.x * RS_BLOCK_KEYVALS + gl_LocalInvocationID.x,
RS_KEYVAL_SIZE,
kv_in_offset.y, // msb
kv_in_offset.x); // lsb
readonly RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_kv,
rs_kv_in,
push.devaddr_keyvals,
kv_in_offset);
//
// Load keyvals
//
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_HISTOGRAM_BLOCK_ROWS; ii++)
{
kv[ii] = rs_kv_in.extent[ii * RS_WORKGROUP_SIZE];
}
////////////////////////////////////////////////////////////////////////////
//
// Accumulate and store histograms for passes
//
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//
// MACRO EXPANSION VARIANT
//
// NOTE: THIS ALSO SERVES AS A MALI R24+ WORKAROUND: EXPLICITLY
// EXPAND THE FOR/LOOP PASSES
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
#define RS_HISTOGRAM_PASS(pass_) \
rs_histogram_zero(); \
\
rs_histogram_atomic_after_write(); \
\
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++) \
{ \
const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_); \
\
atomicAdd(smem.histogram[digit], 1); \
} \
\
rs_histogram_read_after_atomic(); \
\
{ \
const uint32_t rs_histogram_offset = RS_HISTOGRAM_OFFSET(pass_); \
\
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, \
rs_histograms, \
push.devaddr_histograms, \
rs_histogram_offset); \
\
rs_histogram_global_store(rs_histograms); \
} \
\
if (push.passes == (RS_KEYVAL_SIZE - pass_)) \
return;
#else // NO SHARED MEMORY
#define RS_HISTOGRAM_PASS(pass_) \
{ \
const uint32_t rs_histogram_base = RS_HISTOGRAM_BASE(pass_); \
\
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms, \
rs_histograms, \
push.devaddr_histograms, \
rs_histogram_base); \
\
[[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++) \
{ \
const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_); \
\
atomicAdd(rs_histograms.extent[digit], 1); \
} \
} \
\
if (push.passes == (RS_KEYVAL_SIZE - pass_)) \
return;
#endif
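//
// Passes are expanded from the most significant digit downward, so the
// early-out above returns once histograms for the `push.passes`
// most-significant digits have been stored.
//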
#if (RS_KEYVAL_DWORDS == 1)
RS_HISTOGRAM_PASS(3)
RS_HISTOGRAM_PASS(2)
RS_HISTOGRAM_PASS(1)
RS_HISTOGRAM_PASS(0)
#elif (RS_KEYVAL_DWORDS == 2)
RS_HISTOGRAM_PASS(7)
RS_HISTOGRAM_PASS(6)
RS_HISTOGRAM_PASS(5)
RS_HISTOGRAM_PASS(4)
RS_HISTOGRAM_PASS(3)
RS_HISTOGRAM_PASS(2)
RS_HISTOGRAM_PASS(1)
RS_HISTOGRAM_PASS(0)
#else
#error "Error: (RS_KEYVAL_DWORDS >= 3) not implemented."
#endif
}
//
//
//

src/amd/vulkan/radix_sort/shaders/init.comp

@@ -0,0 +1,168 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Initialize the `rs_indirect_info` struct
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
// clang-format on
//
// Load arch/keyval configuration
//
#include "config.h"
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
// Subgroup uniform support
//
#if defined(RS_SCATTER_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Declare the push constants
//
RS_STRUCT_PUSH_INIT();
layout(push_constant) uniform block_push
{
rs_push_init push;
};
//
// The "init" shader configures the fill info structure.
//
RS_STRUCT_INDIRECT_INFO();
//
// Local macros
//
// clang-format off
#define RS_FILL_WORKGROUP_SIZE (1 << RS_FILL_WORKGROUP_SIZE_LOG2)
#define RS_SCATTER_WORKGROUP_SIZE (1 << RS_SCATTER_WORKGROUP_SIZE_LOG2)
#define RS_HISTOGRAM_WORKGROUP_SIZE (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_FILL_BLOCK_DWORDS (RS_FILL_BLOCK_ROWS * RS_FILL_WORKGROUP_SIZE)
#define RS_SCATTER_BLOCK_KEYVALS (RS_SCATTER_BLOCK_ROWS * RS_SCATTER_WORKGROUP_SIZE)
#define RS_HISTOGRAM_BLOCK_KEYVALS (RS_HISTOGRAM_BLOCK_ROWS * RS_HISTOGRAM_WORKGROUP_SIZE)
// clang-format on
//
//
//
layout(local_size_x = 1) in;
//
//
//
layout(buffer_reference, std430) buffer buffer_rs_count
{
uint32_t count;
};
layout(buffer_reference, std430) buffer buffer_rs_indirect_info
{
rs_indirect_info info;
};
//
// Helper macros
//
// RU = Round Up
// RD = Round Down
//
#define RS_COUNT_RU_BLOCKS(count_, block_size_) (((count_) + (block_size_) - 1) / (block_size_))
#define RS_COUNT_RD_BLOCKS(count_, block_size_) ((count_) / (block_size_))
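//
// e.g. RS_COUNT_RU_BLOCKS(100, 32) == 4 while RS_COUNT_RD_BLOCKS(100, 32) == 3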
//
//
//
void
main()
{
//
// Load the keyval count
//
readonly RS_BUFREF_DEFINE(buffer_rs_count, rs_count, push.devaddr_count);
RS_SUBGROUP_UNIFORM const uint32_t count = rs_count.count;
//
// Define the init struct bufref
//
writeonly RS_BUFREF_DEFINE(buffer_rs_indirect_info, rs_indirect_info, push.devaddr_info);
//
// Size and set scatter dispatch
//
const uint32_t scatter_ru_blocks = RS_COUNT_RU_BLOCKS(count, RS_SCATTER_BLOCK_KEYVALS);
const uint32_t count_ru_scatter = scatter_ru_blocks * RS_SCATTER_BLOCK_KEYVALS;
rs_indirect_info.info.dispatch.scatter = u32vec4(scatter_ru_blocks, 1, 1, 0);
//
// Size and set histogram dispatch
//
const uint32_t histo_ru_blocks = RS_COUNT_RU_BLOCKS(count_ru_scatter, RS_HISTOGRAM_BLOCK_KEYVALS);
const uint32_t count_ru_histo = histo_ru_blocks * RS_HISTOGRAM_BLOCK_KEYVALS;
rs_indirect_info.info.dispatch.histogram = u32vec4(histo_ru_blocks, 1, 1, 0);
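//
// Worked example (assuming the u64 target config in this series,
// where both block sizes are 14 * 256 == 3584 keyvals): for
// count == 100000, scatter_ru_blocks == 28 and count_ru_scatter ==
// 100352, and in turn histo_ru_blocks == 28 and count_ru_histo ==
// 100352.
//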
//
// Size and set pad fill and dispatch
//
const uint32_t count_dwords = count * RS_KEYVAL_DWORDS;
const uint32_t pad_rd_blocks = RS_COUNT_RD_BLOCKS(count_dwords, RS_FILL_BLOCK_DWORDS);
const uint32_t count_rd_pad = pad_rd_blocks * RS_FILL_BLOCK_DWORDS;
const uint32_t count_ru_histo_dwords = count_ru_histo * RS_KEYVAL_DWORDS;
const uint32_t pad_dwords = count_ru_histo_dwords - count_rd_pad;
const uint32_t pad_ru_blocks = RS_COUNT_RU_BLOCKS(pad_dwords, RS_FILL_BLOCK_DWORDS);
rs_indirect_info_fill pad;
pad.block_offset = pad_rd_blocks;
pad.dword_offset_min = count_dwords;
pad.dword_offset_max_minus_min = count_ru_histo_dwords - count_dwords;
rs_indirect_info.info.pad = pad;
rs_indirect_info.info.dispatch.pad = u32vec4(pad_ru_blocks, 1, 1, 0);
//
// Size and set zero fill and dispatch
//
// NOTE(allanmac): We could zero the histogram passes on the host
// since the number of passes is known ahead of time but since the
// 256-dword partitions directly follow the 256-dword histograms, we
// can dispatch just one FILL.
//
rs_indirect_info_fill zero;
zero.block_offset = 0;
zero.dword_offset_min = 0;
zero.dword_offset_max_minus_min = (push.passes + scatter_ru_blocks - 1) * RS_RADIX_SIZE;
const uint32_t zero_ru_blocks =
RS_COUNT_RU_BLOCKS(zero.dword_offset_max_minus_min, RS_FILL_BLOCK_DWORDS);
rs_indirect_info.info.zero = zero;
rs_indirect_info.info.dispatch.zero = u32vec4(zero_ru_blocks, 1, 1, 0);
}
//
//
//

View File

@ -0,0 +1,51 @@
# Copyright © 2022 Konstantin Seurer
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
radix_sort_shaders = [
'init.comp',
'fill.comp',
'histogram.comp',
'prefix.comp',
'scatter_0_even.comp',
'scatter_0_odd.comp',
'scatter_1_even.comp',
'scatter_1_odd.comp'
]
shader_include_dir = meson.source_root() + '/src/amd/vulkan/radix_sort/targets/u64'
shader_include_files = files(
'bufref.h',
'prefix_limits.h',
'prefix.h',
'push.h',
'scatter.glsl',
meson.source_root() + '/src/amd/vulkan/radix_sort/targets/u64/config.h'
)
radix_sort_spv = []
foreach s : radix_sort_shaders
radix_sort_spv += custom_target(
s + '.spv.h',
input : s,
output : s + '.spv.h',
command : [prog_glslang, '-V', '-I' + shader_include_dir, '--target-env', 'spirv1.3', '-x', '-o', '@OUTPUT@', '@INPUT@'],
depend_files: shader_include_files)
endforeach

View File

@ -0,0 +1,194 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
// Prefix sum the coarse histograms.
//
// clang-format off
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_control_flow_attributes : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require
// clang-format on
//
//
//
#include "config.h"
//
// Buffer reference macros and push constants
//
#include "bufref.h"
#include "push.h"
//
//
//
RS_STRUCT_PUSH_PREFIX();
layout(push_constant) uniform block_push
{
rs_push_prefix push;
};
//
// Subgroup uniform support
//
#if defined(RS_HISTOGRAM_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif
//
// Check all switches are defined
//
//
#ifndef RS_PREFIX_SUBGROUP_SIZE_LOG2
#error "Undefined: RS_PREFIX_SUBGROUP_SIZE_LOG2"
#endif
//
#ifndef RS_PREFIX_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_PREFIX_WORKGROUP_SIZE_LOG2"
#endif
//
// Local macros
//
// clang-format off
#define RS_KEYVAL_SIZE (RS_KEYVAL_DWORDS * 4)
#define RS_WORKGROUP_SIZE (1 << RS_PREFIX_WORKGROUP_SIZE_LOG2)
#define RS_SUBGROUP_SIZE (1 << RS_PREFIX_SUBGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
// clang-format on
//
// There is no purpose in having a workgroup size larger than the
// radix size.
//
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
#error "Error: (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)"
#endif
//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;
//
// Histogram buffer reference
//
layout(buffer_reference, std430) buffer buffer_rs_histograms
{
uint32_t extent[];
};
//
// Load prefix limits before loading function
//
#include "prefix_limits.h"
//
// If multi-subgroup then define shared memory
//
#if (RS_WORKGROUP_SUBGROUPS > 1)
//----------------------------------------
shared uint32_t smem_sweep0[RS_SWEEP_0_SIZE];
#define RS_PREFIX_SWEEP0(idx_) smem_sweep0[idx_]
//----------------------------------------
#if (RS_SWEEP_1_SIZE > 0)
//----------------------------------------
shared uint32_t smem_sweep1[RS_SWEEP_1_SIZE];
#define RS_PREFIX_SWEEP1(idx_) smem_sweep1[idx_]
//----------------------------------------
#endif
#if (RS_SWEEP_2_SIZE > 0)
//----------------------------------------
shared uint32_t smem_sweep2[RS_SWEEP_2_SIZE];
#define RS_PREFIX_SWEEP2(idx_) smem_sweep2[idx_]
//----------------------------------------
#endif
#endif
//
// Define function arguments
//
#define RS_PREFIX_ARGS buffer_rs_histograms rs_histograms
//
// Define load/store functions
//
// clang-format off
#define RS_PREFIX_LOAD(idx_) rs_histograms.extent[idx_]
#define RS_PREFIX_STORE(idx_) rs_histograms.extent[idx_]
// clang-format on
//
// Load prefix function
//
#include "prefix.h"
//
// Exclusive prefix of uint32_t[256]
//
void
main()
{
//
// Define buffer reference to read histograms
//
#if (RS_WORKGROUP_SUBGROUPS == 1)
//
// Define histograms bufref for single subgroup
//
// NOTE(allanmac): The histogram buffer reference could be adjusted
// on the host to save a couple instructions at the cost of added
// complexity.
//
RS_SUBGROUP_UNIFORM
const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
const uint32_t histograms_offset = (histograms_base + gl_SubgroupInvocationID) * 4;
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
rs_histograms,
push.devaddr_histograms,
histograms_offset);
#else
//
// Define histograms bufref for workgroup
//
RS_SUBGROUP_UNIFORM
const uint32_t histograms_base = ((RS_KEYVAL_SIZE - 1 - gl_WorkGroupID.x) * RS_RADIX_SIZE);
const uint32_t histograms_offset = (histograms_base + gl_LocalInvocationID.x) * 4;
RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,
rs_histograms,
push.devaddr_histograms,
histograms_offset);
#endif
//
// Compute exclusive prefix of uint32_t[256]
//
rs_prefix(rs_histograms);
}
//
//
//

View File

@ -0,0 +1,353 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_
//
// Requires several defines
//
#ifndef RS_PREFIX_LIMITS
#error "Error: \"prefix_limits.h\" not loaded"
#endif
#ifndef RS_PREFIX_ARGS
#error "Error: RS_PREFIX_ARGS undefined"
#endif
#ifndef RS_PREFIX_LOAD
#error "Error: RS_PREFIX_LOAD undefined"
#endif
#ifndef RS_PREFIX_STORE
#error "Error: RS_PREFIX_STORE undefined"
#endif
#ifndef RS_SUBGROUP_SIZE
#error "Error: RS_SUBGROUP_SIZE undefined"
#endif
#ifndef RS_WORKGROUP_SIZE
#error "Error: RS_WORKGROUP_SIZE undefined"
#endif
#ifndef RS_WORKGROUP_SUBGROUPS
#error "Error: RS_WORKGROUP_SUBGROUPS undefined"
#endif
//
// Optional switches:
//
// * Disable holding original inclusively scanned histogram values in registers.
//
// #define RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
//
//
// Compute exclusive prefix of uint32_t[256]
//
void
rs_prefix(RS_PREFIX_ARGS)
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
//
// Workgroup is a single subgroup so no shared memory is required.
//
//
// Exclusive scan-add the histogram
//
const uint32_t h0 = RS_PREFIX_LOAD(0);
const uint32_t h0_inc = subgroupInclusiveAdd(h0);
RS_SUBGROUP_UNIFORM uint32_t h_last = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
RS_PREFIX_STORE(0) = h0_inc - h0; // exclusive
//
// Each iteration is dependent on the previous so no unrolling. The
// compiler is free to hoist the loads upward though.
//
for (RS_SUBGROUP_UNIFORM uint32_t ii = RS_SUBGROUP_SIZE; //
ii < RS_RADIX_SIZE;
ii += RS_SUBGROUP_SIZE)
{
const uint32_t h = RS_PREFIX_LOAD(ii);
const uint32_t h_inc = subgroupInclusiveAdd(h) + h_last;
h_last = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);
RS_PREFIX_STORE(ii) = h_inc - h; // exclusive
}
#else
//
// Workgroup is multiple subgroups and uses shared memory to store
// the scan's intermediate results.
//
// Assumes a power-of-two subgroup, workgroup and radix size.
//
// Downsweep: Repeatedly scan reductions until they fit in a single
// subgroup.
//
// Upsweep: Then uniformly apply reductions to each subgroup.
//
//
//  Subgroup Size |  4 |  8 | 16 | 32 | 64 |
//  --------------+----+----+----+----+----+
//  Sweep 0       | 64 | 32 | 16 |  8 |  4 |  sweep_0[]
//  Sweep 1       | 16 |  4 |  - |  - |  - |  sweep_1[]
//  Sweep 2       |  4 |  - |  - |  - |  - |  sweep_2[]
//  --------------+----+----+----+----+----+
//  Total dwords  | 84 | 36 | 16 |  8 |  4 |
//  --------------+----+----+----+----+----+
//
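// e.g. with RS_RADIX_SIZE == 256 and RS_SUBGROUP_SIZE == 32, sweep 0
// holds 256/32 == 8 reductions, which already fit in a single
// subgroup, so sweeps 1 and 2 are unused -- matching the "32" column
// above.
//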
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
uint32_t h_exc[RS_H_COMPONENTS];
#endif
//
// Downsweep 0
//
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t h = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
const uint32_t h_inc = subgroupInclusiveAdd(h);
const uint32_t smem_idx = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
RS_PREFIX_SWEEP0(smem_idx) = subgroupBroadcast(h_inc, RS_SUBGROUP_SIZE - 1);
//
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
h_exc[ii] = h_inc - h;
#else
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_inc - h;
#endif
}
barrier();
//
// Skip generalizing these sweeps for all possible subgroups -- just
// write them directly.
//
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))
//////////////////////////////////////////////////////////////////////
//
// Scan 0
//
#if (RS_SWEEP_0_SIZE != RS_SUBGROUP_SIZE)
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // subgroup has inactive invocations
#endif
{
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
}
#elif (RS_SUBGROUP_SIZE == 8)
#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
//////////////////////////////////////////////////////////////////////
//
// Scan 0 and Downsweep 1
//
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 32 invocations
{
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#else
//////////////////////////////////////////////////////////////////////
//
// Scan 0 and Downsweep 1
//
[[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 32 invocations
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#endif
barrier();
//
// Scan 1
//
if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 4 invocations
{
const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
}
#elif (RS_SUBGROUP_SIZE == 4)
//////////////////////////////////////////////////////////////////////
//
// Scan 0 and Downsweep 1
//
#if (RS_SWEEP_0_SIZE < RS_WORKGROUP_SIZE)
if (gl_LocalInvocationID.x < RS_SWEEP_0_SIZE) // 64 invocations
{
const uint32_t h0_red = RS_PREFIX_SWEEP0(gl_LocalInvocationID.x);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(gl_LocalInvocationID.x) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(gl_SubgroupID) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#else
[[unroll]] for (uint32_t ii = 0; ii < RS_S0_PASSES; ii++) // 64 invocations
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
const uint32_t idx1 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t h0_red = RS_PREFIX_SWEEP0(idx0);
const uint32_t h0_inc = subgroupInclusiveAdd(h0_red);
RS_PREFIX_SWEEP0(idx0) = h0_inc - h0_red;
RS_PREFIX_SWEEP1(idx1) = subgroupBroadcast(h0_inc, RS_SUBGROUP_SIZE - 1);
}
#endif
barrier();
//
// Scan 1 and Downsweep 2
//
#if (RS_SWEEP_1_SIZE < RS_WORKGROUP_SIZE)
if (gl_LocalInvocationID.x < RS_SWEEP_1_SIZE) // 16 invocations
{
const uint32_t h1_red = RS_PREFIX_SWEEP1(gl_LocalInvocationID.x);
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
RS_PREFIX_SWEEP1(gl_LocalInvocationID.x) = h1_inc - h1_red;
RS_PREFIX_SWEEP2(gl_SubgroupID) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
}
#else
[[unroll]] for (uint32_t ii = 0; ii < RS_S1_PASSES; ii++) // 16 invocations
{
const uint32_t idx1 = (ii * RS_WORKGROUP_SIZE) + gl_LocalInvocationID.x;
const uint32_t idx2 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t h1_red = RS_PREFIX_SWEEP1(idx1);
const uint32_t h1_inc = subgroupInclusiveAdd(h1_red);
RS_PREFIX_SWEEP1(idx1) = h1_inc - h1_red;
RS_PREFIX_SWEEP2(idx2) = subgroupBroadcast(h1_inc, RS_SUBGROUP_SIZE - 1);
}
#endif
barrier();
//
// Scan 2
//
// 4 invocations
//
if (gl_LocalInvocationID.x < RS_SWEEP_2_SIZE)
{
const uint32_t h2_red = RS_PREFIX_SWEEP2(gl_LocalInvocationID.x);
const uint32_t h2_inc = subgroupInclusiveAdd(h2_red);
RS_PREFIX_SWEEP2(gl_LocalInvocationID.x) = h2_inc - h2_red;
}
#else
#error "Error: Unsupported subgroup size"
#endif
barrier();
//////////////////////////////////////////////////////////////////////
//
// Final upsweep 0
//
#if ((RS_SUBGROUP_SIZE == 64) || (RS_SUBGROUP_SIZE == 32) || (RS_SUBGROUP_SIZE == 16))
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
// clang-format issue
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc[ii] + RS_PREFIX_SWEEP0(idx0);
#else
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) = h_exc + RS_PREFIX_SWEEP0(idx0);
#endif
}
#elif (RS_SUBGROUP_SIZE == 8)
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc[ii] + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#else
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc + RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1);
#endif
}
#elif (RS_SUBGROUP_SIZE == 4)
[[unroll]] for (uint32_t ii = 0; ii < RS_H_COMPONENTS; ii++)
{
const uint32_t idx0 = (ii * RS_WORKGROUP_SUBGROUPS) + gl_SubgroupID;
const uint32_t idx1 = idx0 / RS_SUBGROUP_SIZE;
const uint32_t idx2 = idx1 / RS_SUBGROUP_SIZE;
#ifndef RS_PREFIX_DISABLE_COMPONENTS_IN_REGISTERS
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc[ii] + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#else
const uint32_t h_exc = RS_PREFIX_LOAD(ii * RS_WORKGROUP_SIZE);
RS_PREFIX_STORE(ii * RS_WORKGROUP_SIZE) =
h_exc + (RS_PREFIX_SWEEP0(idx0) + RS_PREFIX_SWEEP1(idx1) + RS_PREFIX_SWEEP2(idx2));
#endif
}
#else
#error "Error: Unsupported subgroup size"
#endif
#endif
}
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_H_

View File

@ -0,0 +1,48 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_
//
// Define various prefix limits
//
#define RS_PREFIX_LIMITS
//
// Multi-subgroup prefix requires shared memory.
//
#if (RS_WORKGROUP_SUBGROUPS > 1)
// clang-format off
#define RS_H_COMPONENTS (RS_RADIX_SIZE / RS_WORKGROUP_SIZE)
#define RS_SWEEP_0_SIZE (RS_RADIX_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_1_SIZE (RS_SWEEP_0_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_2_SIZE (RS_SWEEP_1_SIZE / RS_SUBGROUP_SIZE)
#define RS_SWEEP_SIZE (RS_SWEEP_0_SIZE + RS_SWEEP_1_SIZE + RS_SWEEP_2_SIZE)
#define RS_S0_PASSES (RS_SWEEP_0_SIZE / RS_WORKGROUP_SIZE)
#define RS_S1_PASSES (RS_SWEEP_1_SIZE / RS_WORKGROUP_SIZE)
#define RS_SWEEP_0_OFFSET 0
#define RS_SWEEP_1_OFFSET (RS_SWEEP_0_OFFSET + RS_SWEEP_0_SIZE)
#define RS_SWEEP_2_OFFSET (RS_SWEEP_1_OFFSET + RS_SWEEP_1_SIZE)
// clang-format on
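//
// e.g. with RS_RADIX_SIZE == 256 and RS_SUBGROUP_SIZE == 8:
// RS_SWEEP_0_SIZE == 32, RS_SWEEP_1_SIZE == 4 and RS_SWEEP_2_SIZE == 0,
// for an RS_SWEEP_SIZE of 36 -- matching the sweep table in "prefix.h".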
//
// Single subgroup prefix doesn't use shared memory.
//
#else
#define RS_SWEEP_SIZE 0
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PREFIX_LIMITS_H_

View File

@ -0,0 +1,263 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_
//
// There is a limit to the maximum number of keyvals that can be sorted because
// the top 2 bits in the atomic lookback counters are used as tag bits.
//
#define RS_MAX_KEYVALS ((1 << 30) - 1)
//
// Right now, the entire implementation is very much dependent on an 8-bit radix
// size. Most of the shaders attempt to honor this defined size but there are
// still a number of places where 256 is assumed.
//
#define RS_RADIX_LOG2 8
#define RS_RADIX_SIZE (1 << RS_RADIX_LOG2)
//
// LOOKBACK STATUS FLAGS
//
// The decoupled lookback status flags are stored in the two
// high bits of the count:
//
//   0                                   31
//   | REDUCTION OR PREFIX COUNT | STATUS |
//   +---------------------------+--------+
//   |             30            |    2   |
//
// This limits the keyval extent size to (2^30-1).
//
// Valid status flags are:
//
// EVEN PASS ODD PASS
// ----------------------- -----------------------
// 0 : invalid 0 : prefix available
// 1 : reduction available 1 : ---
// 2 : prefix available 2 : invalid
// 3 : --- 3 : reduction available
//
// Atomically adding +1 to a "reduction available" status results in a "prefix
// available" status.
//
// clang-format off
#define RS_PARTITION_STATUS_EVEN_INVALID (0u)
#define RS_PARTITION_STATUS_EVEN_REDUCTION (1u)
#define RS_PARTITION_STATUS_EVEN_PREFIX (2u)
#define RS_PARTITION_STATUS_ODD_INVALID (2u)
#define RS_PARTITION_STATUS_ODD_REDUCTION (3u)
#define RS_PARTITION_STATUS_ODD_PREFIX (0u)
// clang-format on
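//
// For illustration only (these helpers are not part of this header --
// RS_COUNT_MASK is a hypothetical name): packing and unpacking a
// lookback counter with the layout above would look like:
//
//   #define RS_COUNT_MASK ((1u << 30) - 1)
//
//   uint32_t pack(uint32_t status, uint32_t count) { return (status << 30) | (count & RS_COUNT_MASK); }
//   uint32_t status_of(uint32_t packed)            { return packed >> 30; }
//   uint32_t count_of(uint32_t packed)             { return packed & RS_COUNT_MASK; }
//
// Adding +1 to the status field maps EVEN_REDUCTION (1) to EVEN_PREFIX
// (2) and ODD_REDUCTION (3) to ODD_PREFIX (0, mod 4).
//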
//
// Arguments to indirectly launched shaders.
//
// struct rs_indirect_info_dispatch
// {
// u32vec4 pad;
// u32vec4 zero;
// u32vec4 histogram;
// u32vec4 scatter;
// };
//
// struct rs_indirect_info_fill
// {
// uint32_t block_offset;
// uint32_t dword_offset_min;
// uint32_t dword_offset_max_minus_min;
// uint32_t reserved; // padding for 16 bytes
// };
//
// struct rs_indirect_info
// {
// rs_indirect_info_fill pad;
// rs_indirect_info_fill zero;
// rs_indirect_info_dispatch dispatch;
// };
//
#define RS_STRUCT_INDIRECT_INFO_DISPATCH() \
struct rs_indirect_info_dispatch \
{ \
RS_STRUCT_MEMBER_STRUCT(u32vec4, pad) \
RS_STRUCT_MEMBER_STRUCT(u32vec4, zero) \
RS_STRUCT_MEMBER_STRUCT(u32vec4, histogram) \
RS_STRUCT_MEMBER_STRUCT(u32vec4, scatter) \
}
#define RS_STRUCT_INDIRECT_INFO_FILL() \
struct rs_indirect_info_fill \
{ \
RS_STRUCT_MEMBER(uint32_t, block_offset) \
RS_STRUCT_MEMBER(uint32_t, dword_offset_min) \
RS_STRUCT_MEMBER(uint32_t, dword_offset_max_minus_min) \
RS_STRUCT_MEMBER(uint32_t, reserved) \
}
#define RS_STRUCT_INDIRECT_INFO() \
RS_STRUCT_INDIRECT_INFO_DISPATCH(); \
RS_STRUCT_INDIRECT_INFO_FILL(); \
struct rs_indirect_info \
{ \
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_fill, pad) \
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_fill, zero) \
RS_STRUCT_MEMBER_STRUCT(rs_indirect_info_dispatch, dispatch) \
}
//
// Define the push constant structures shared by the host and device.
//
// INIT
// ----
// struct rs_push_init
// {
//   uint64_t devaddr_info;   // address of indirect info buffer
//   uint64_t devaddr_count;  // address of count buffer
//   uint32_t passes;         // number of passes
// };
//
// FILL
// ----
// struct rs_push_fill
// {
// uint64_t devaddr_info; // address of indirect info for fill shader
// uint64_t devaddr_dwords; // address of dwords extent
// uint32_t dword; // dword value used to fill the dwords extent
// };
//
// HISTOGRAM
// ---------
// struct rs_push_histogram
// {
// uint64_t devaddr_histograms; // address of histograms extent
// uint64_t devaddr_keyvals; // address of keyvals extent
// uint32_t passes; // number of passes
// };
//
// PREFIX
// ------
// struct rs_push_prefix
// {
// uint64_t devaddr_histograms; // address of histograms extent
// };
//
// SCATTER
// -------
// struct rs_push_scatter
// {
//   uint64_t devaddr_keyvals_even; // address of "even" keyvals
//   uint64_t devaddr_keyvals_odd;  // address of "odd" keyvals
//   uint64_t devaddr_partitions;   // address of partitions
//   uint64_t devaddr_histograms;   // address of pass histograms
//   uint32_t pass_offset;          // keyval pass offset
// };
//
#define RS_STRUCT_PUSH_INIT() \
struct rs_push_init \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_info) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_count) \
RS_STRUCT_MEMBER(uint32_t, passes) \
}
#define RS_STRUCT_PUSH_FILL() \
struct rs_push_fill \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_info) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_dwords) \
RS_STRUCT_MEMBER(uint32_t, dword) \
}
#define RS_STRUCT_PUSH_HISTOGRAM() \
struct rs_push_histogram \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals) \
RS_STRUCT_MEMBER(uint32_t, passes) \
}
#define RS_STRUCT_PUSH_PREFIX() \
struct rs_push_prefix \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
}
#define RS_STRUCT_PUSH_SCATTER() \
struct rs_push_scatter \
{ \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals_even) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_keyvals_odd) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_partitions) \
RS_STRUCT_MEMBER(RS_DEVADDR, devaddr_histograms) \
RS_STRUCT_MEMBER(uint32_t, pass_offset) \
}
////////////////////////////////////////////////////////////////////
//
// GLSL
//
#ifdef VULKAN // defined by GLSL/VK compiler
// clang-format off
#define RS_STRUCT_MEMBER(type_, name_) type_ name_;
#define RS_STRUCT_MEMBER_FARRAY(type_, len_, name_) type_ name_[len_];
#define RS_STRUCT_MEMBER_STRUCT(type_, name_) type_ name_;
// clang-format on
////////////////////////////////////////////////////////////////////
//
// C/C++
//
#else
#ifdef __cplusplus
extern "C" {
#endif
//
//
//
#include <stdint.h>
struct u32vec4
{
uint32_t x;
uint32_t y;
uint32_t z;
uint32_t w;
};
// clang-format off
#define RS_DEVADDR uint64_t
#define RS_STRUCT_MEMBER(type_, name_) type_ name_;
#define RS_STRUCT_MEMBER_FARRAY(type_, len_, name_) type_ name_[len_];
#define RS_STRUCT_MEMBER_STRUCT(type_, name_) struct type_ name_;
// clang-format on
RS_STRUCT_PUSH_INIT();
RS_STRUCT_PUSH_FILL();
RS_STRUCT_PUSH_HISTOGRAM();
RS_STRUCT_PUSH_PREFIX();
RS_STRUCT_PUSH_SCATTER();
RS_STRUCT_INDIRECT_INFO();
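//
// e.g. in C, RS_STRUCT_PUSH_HISTOGRAM() above expands to:
//
//   struct rs_push_histogram
//   {
//     uint64_t devaddr_histograms;
//     uint64_t devaddr_keyvals;
//     uint32_t passes;
//   };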
//
//
//
#ifdef __cplusplus
}
#endif
#endif
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_SHADERS_PUSH_H_

File diff suppressed because it is too large

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 0
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_EVEN_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_EVEN_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_EVEN_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_even
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_odd
// clang-format on
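//
// NOTE: the four scatter variants ping-pong the keyval buffers: "even"
// passes read devaddr_keyvals_even and write devaddr_keyvals_odd while
// "odd" passes swap the two, and RS_SCATTER_KEYVAL_DWORD_BASE selects
// which dword of the keyval the pass digit is extracted from.
//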
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 0
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_ODD_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_ODD_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_ODD_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_odd
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_even
// clang-format on
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 1
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_EVEN_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_EVEN_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_EVEN_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_even
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_odd
// clang-format on
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,36 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#version 460
//
//
//
// clang-format off
#define RS_SCATTER_KEYVAL_DWORD_BASE 1
#define RS_PARTITION_STATUS_INVALID RS_PARTITION_STATUS_ODD_INVALID
#define RS_PARTITION_STATUS_REDUCTION RS_PARTITION_STATUS_ODD_REDUCTION
#define RS_PARTITION_STATUS_PREFIX RS_PARTITION_STATUS_ODD_PREFIX
#define RS_DEVADDR_KEYVALS_IN(push_) push_.devaddr_keyvals_odd
#define RS_DEVADDR_KEYVALS_OUT(push_) push_.devaddr_keyvals_even
// clang-format on
//
//
//
#extension GL_GOOGLE_include_directive : require
//
//
//
#include "scatter.glsl"
//
//
//

View File

@ -0,0 +1,57 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_
//
//
//
#include <stdint.h>
//
// This structure packages target-specific configuration parameters.
//
struct radix_sort_vk_target_config
{
uint32_t keyval_dwords;
struct
{
uint32_t workgroup_size_log2;
} init;
struct
{
uint32_t workgroup_size_log2;
} fill;
struct
{
uint32_t workgroup_size_log2;
uint32_t subgroup_size_log2;
uint32_t block_rows;
} histogram;
struct
{
uint32_t workgroup_size_log2;
uint32_t subgroup_size_log2;
} prefix;
struct
{
uint32_t workgroup_size_log2;
uint32_t subgroup_size_log2;
uint32_t block_rows;
} scatter;
};
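//
// Example configuration (illustration only -- the values match the AMD
// GCN3 "u64" target config in this series; init's workgroup size log2
// of 0 is an assumption based on init.comp's local_size_x = 1):
//
//   static const struct radix_sort_vk_target_config config = {
//     .keyval_dwords = 2,
//     .init      = { .workgroup_size_log2 = 0 },
//     .fill      = { .workgroup_size_log2 = 7 },
//     .histogram = { .workgroup_size_log2 = 8, .subgroup_size_log2 = 6, .block_rows = 14 },
//     .prefix    = { .workgroup_size_log2 = 8, .subgroup_size_log2 = 6 },
//     .scatter   = { .workgroup_size_log2 = 8, .subgroup_size_log2 = 6, .block_rows = 14 },
//   };
//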
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGET_H_

View File

@ -0,0 +1,34 @@
// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
#define SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_
//
//
//
// clang-format off
#define RS_KEYVAL_DWORDS 2
#define RS_FILL_WORKGROUP_SIZE_LOG2 7
#define RS_FILL_BLOCK_ROWS 8
#define RS_HISTOGRAM_WORKGROUP_SIZE_LOG2 8
#define RS_HISTOGRAM_SUBGROUP_SIZE_LOG2 6
#define RS_HISTOGRAM_BLOCK_ROWS 14
#define RS_PREFIX_WORKGROUP_SIZE_LOG2 8
#define RS_PREFIX_SUBGROUP_SIZE_LOG2 6
#define RS_SCATTER_WORKGROUP_SIZE_LOG2 8
#define RS_SCATTER_SUBGROUP_SIZE_LOG2 6
#define RS_SCATTER_BLOCK_ROWS 14
// clang-format on
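//
// With this config, a histogram or scatter workgroup covers
// (1 << 8) * 14 == 3584 keyvals per block, and a fill workgroup covers
// (1 << 7) * 8 == 1024 dwords per block.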
//
//
//
#endif // SRC_GRAPHICS_LIB_COMPUTE_RADIX_SORT_PLATFORMS_VK_TARGETS_VENDORS_AMD_GCN3_U64_CONFIG_H_