radv: Implement device-side BVH building.

Same naive algorithm as the host build.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11078>
Authored by Bas Nieuwenhuizen on 2021-05-18 13:25:00 +02:00; committed by Marge Bot
parent d51a4b4c4b
commit 0dad88b469
4 changed files with 847 additions and 0 deletions

@@ -23,6 +23,9 @@
#include "radv_private.h"
#include "util/half_float.h"
#include "nir_builder.h"
#include "radv_cs.h"
#include "radv_meta.h"
struct radv_accel_struct_header {
uint32_t root_node_offset;
@@ -589,3 +592,830 @@ radv_BuildAccelerationStructuresKHR(
}
return result;
}
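/* Load the three vertex indices of triangle `id` from the index buffer at
 * `addr`, handling 16-bit, 32-bit and 8-bit index buffers; with
 * VK_INDEX_TYPE_NONE_KHR, consecutive indices derived from the triangle id
 * are used instead. */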
static nir_ssa_def *
get_indices(nir_builder *b, nir_ssa_def *addr, nir_ssa_def *type, nir_ssa_def *id)
{
const struct glsl_type *uvec3_type = glsl_vector_type(GLSL_TYPE_UINT, 3);
nir_variable *result =
nir_variable_create(b->shader, nir_var_shader_temp, uvec3_type, "indices");
nir_push_if(b, nir_ult(b, type, nir_imm_int(b, 2)));
nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_UINT16)));
{
nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 6));
nir_ssa_def *indices[3];
for (unsigned i = 0; i < 3; ++i) {
indices[i] = nir_build_load_global(
b, 1, 16, nir_iadd(b, addr, nir_u2u64(b, nir_iadd(b, index_id, nir_imm_int(b, 2 * i)))),
.align_mul = 2, .align_offset = 0);
}
nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);
}
nir_push_else(b, NULL);
{
nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 12));
nir_ssa_def *indices = nir_build_load_global(
b, 3, 32, nir_iadd(b, addr, nir_u2u64(b, index_id)), .align_mul = 4, .align_offset = 0);
nir_store_var(b, result, indices, 7);
}
nir_pop_if(b, NULL);
nir_push_else(b, NULL);
{
nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 3));
nir_ssa_def *indices[] = {
index_id,
nir_iadd(b, index_id, nir_imm_int(b, 1)),
nir_iadd(b, index_id, nir_imm_int(b, 2)),
};
nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_NONE_KHR)));
{
nir_store_var(b, result, nir_vec(b, indices, 3), 7);
}
nir_push_else(b, NULL);
{
for (unsigned i = 0; i < 3; ++i) {
indices[i] = nir_build_load_global(b, 1, 8, nir_iadd(b, addr, nir_u2u64(b, indices[i])),
.align_mul = 1, .align_offset = 0);
}
nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);
}
nir_pop_if(b, NULL);
}
nir_pop_if(b, NULL);
return nir_load_var(b, result);
}
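/* Load one vec3 position for each of the three per-vertex addresses,
 * converting from the vertex formats handled below (32-bit and 16-bit float,
 * with or without a fourth padding component). */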
static void
get_vertices(nir_builder *b, nir_ssa_def *addresses, nir_ssa_def *format, nir_ssa_def *positions[3])
{
const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
nir_variable *results[3] = {
nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex0"),
nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex1"),
nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex2")};
VkFormat formats[] = {
VK_FORMAT_R32G32B32_SFLOAT,
VK_FORMAT_R32G32B32A32_SFLOAT,
VK_FORMAT_R16G16B16_SFLOAT,
VK_FORMAT_R16G16B16A16_SFLOAT,
};
for (unsigned f = 0; f < ARRAY_SIZE(formats); ++f) {
if (f + 1 < ARRAY_SIZE(formats))
nir_push_if(b, nir_ieq(b, format, nir_imm_int(b, formats[f])));
for (unsigned i = 0; i < 3; ++i) {
switch (formats[f]) {
case VK_FORMAT_R32G32B32_SFLOAT:
case VK_FORMAT_R32G32B32A32_SFLOAT:
nir_store_var(b, results[i],
nir_build_load_global(b, 3, 32, nir_channel(b, addresses, i),
.align_mul = 4, .align_offset = 0),
7);
break;
case VK_FORMAT_R16G16B16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SFLOAT: {
nir_ssa_def *values[3];
nir_ssa_def *addr = nir_channel(b, addresses, i);
for (unsigned j = 0; j < 3; ++j)
values[j] =
nir_build_load_global(b, 1, 16, nir_iadd(b, addr, nir_imm_int64(b, j * 2)),
.align_mul = 2, .align_offset = 0);
nir_store_var(b, results[i], nir_f2f32(b, nir_vec(b, values, 3)), 7);
break;
}
default:
unreachable("Unhandled format");
}
}
if (f + 1 < ARRAY_SIZE(formats))
nir_push_else(b, NULL);
}
for (unsigned f = 1; f < ARRAY_SIZE(formats); ++f) {
nir_pop_if(b, NULL);
}
for (unsigned i = 0; i < 3; ++i)
positions[i] = nir_load_var(b, results[i]);
}
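/* Push constants for the leaf-node build shader; the union carries the
 * geometry-type specific parameters for triangles, instances and AABBs. */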
struct build_primitive_constants {
uint64_t node_dst_addr;
uint64_t scratch_addr;
uint32_t dst_offset;
uint32_t dst_scratch_offset;
uint32_t geometry_type;
uint32_t geometry_id;
union {
struct {
uint64_t vertex_addr;
uint64_t index_addr;
uint64_t transform_addr;
uint32_t vertex_stride;
uint32_t vertex_format;
uint32_t index_format;
};
struct {
uint64_t instance_data;
};
struct {
uint64_t aabb_addr;
uint32_t aabb_stride;
};
};
};
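/* Push constants for the internal-node build shader; fill_header packs the
 * source node count into bits 0-30 and the "write the header" flag into
 * bit 31. */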
struct build_internal_constants {
uint64_t node_dst_addr;
uint64_t scratch_addr;
uint32_t dst_offset;
uint32_t dst_scratch_offset;
uint32_t src_scratch_offset;
uint32_t fill_header;
};
/* This inverts a 3x3 matrix using cofactors, as in e.g.
* https://www.mathsisfun.com/algebra/matrix-inverse-minors-cofactors-adjugate.html */
static void
nir_invert_3x3(nir_builder *b, nir_ssa_def *in[3][3], nir_ssa_def *out[3][3])
{
nir_ssa_def *cofactors[3][3];
for (unsigned i = 0; i < 3; ++i) {
for (unsigned j = 0; j < 3; ++j) {
cofactors[i][j] =
nir_fsub(b, nir_fmul(b, in[(i + 1) % 3][(j + 1) % 3], in[(i + 2) % 3][(j + 2) % 3]),
nir_fmul(b, in[(i + 1) % 3][(j + 2) % 3], in[(i + 2) % 3][(j + 1) % 3]));
}
}
nir_ssa_def *det = NULL;
for (unsigned i = 0; i < 3; ++i) {
nir_ssa_def *det_part = nir_fmul(b, in[0][i], cofactors[0][i]);
det = det ? nir_fadd(b, det, det_part) : det_part;
}
nir_ssa_def *det_inv = nir_frcp(b, det);
for (unsigned i = 0; i < 3; ++i) {
for (unsigned j = 0; j < 3; ++j) {
out[i][j] = nir_fmul(b, cofactors[j][i], det_inv);
}
}
}
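/* Shader that writes one leaf node per invocation (64-byte triangle/AABB
 * nodes, 128-byte instance nodes) and stores the resulting node id into the
 * scratch buffer for the internal-node passes. */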
static nir_shader *
build_leaf_shader(struct radv_device *dev)
{
const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
nir_builder b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_leaf_shader");
b.shader->info.workgroup_size[0] = 64;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
nir_ssa_def *pconst0 =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
nir_ssa_def *pconst1 =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);
nir_ssa_def *pconst2 =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 32, .range = 16);
nir_ssa_def *pconst3 =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 48, .range = 16);
nir_ssa_def *pconst4 =
nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 64, .range = 4);
nir_ssa_def *geom_type = nir_channel(&b, pconst1, 2);
nir_ssa_def *node_dst_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));
nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));
nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);
nir_ssa_def *scratch_offset = nir_channel(&b, pconst1, 1);
nir_ssa_def *geometry_id = nir_channel(&b, pconst1, 3);
nir_ssa_def *global_id =
nir_iadd(&b,
nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),
nir_imm_int(&b, b.shader->info.workgroup_size[0])),
nir_channels(&b, nir_load_local_invocation_id(&b), 1));
scratch_addr = nir_iadd(
&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, scratch_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 4)))));
nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_TRIANGLES_KHR)));
{ /* Triangles */
nir_ssa_def *vertex_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));
nir_ssa_def *index_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 12));
nir_ssa_def *transform_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst3, 3));
nir_ssa_def *vertex_stride = nir_channel(&b, pconst3, 2);
nir_ssa_def *vertex_format = nir_channel(&b, pconst3, 3);
nir_ssa_def *index_format = nir_channel(&b, pconst4, 0);
unsigned repl_swizzle[4] = {0, 0, 0, 0};
nir_ssa_def *node_offset =
nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));
nir_ssa_def *triangle_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
nir_ssa_def *indices = get_indices(&b, index_addr, index_format, global_id);
nir_ssa_def *vertex_addresses = nir_iadd(
&b, nir_u2u64(&b, nir_imul(&b, indices, nir_swizzle(&b, vertex_stride, repl_swizzle, 3))),
nir_swizzle(&b, vertex_addr, repl_swizzle, 3));
nir_ssa_def *positions[3];
get_vertices(&b, vertex_addresses, vertex_format, positions);
nir_ssa_def *node_data[16];
memset(node_data, 0, sizeof(node_data));
nir_variable *transform[] = {
nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform0"),
nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform1"),
nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform2"),
};
nir_store_var(&b, transform[0], nir_imm_vec4(&b, 1.0, 0.0, 0.0, 0.0), 0xf);
nir_store_var(&b, transform[1], nir_imm_vec4(&b, 0.0, 1.0, 0.0, 0.0), 0xf);
nir_store_var(&b, transform[2], nir_imm_vec4(&b, 0.0, 0.0, 1.0, 0.0), 0xf);
nir_push_if(&b, nir_ine(&b, transform_addr, nir_imm_int64(&b, 0)));
nir_store_var(
&b, transform[0],
nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 0)),
.align_mul = 4, .align_offset = 0),
0xf);
nir_store_var(
&b, transform[1],
nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 16)),
.align_mul = 4, .align_offset = 0),
0xf);
nir_store_var(
&b, transform[2],
nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 32)),
.align_mul = 4, .align_offset = 0),
0xf);
nir_pop_if(&b, NULL);
for (unsigned i = 0; i < 3; ++i)
for (unsigned j = 0; j < 3; ++j)
node_data[i * 3 + j] = nir_fdph(&b, positions[i], nir_load_var(&b, transform[j]));
node_data[12] = global_id;
node_data[13] = geometry_id;
node_data[15] = nir_imm_int(&b, 9);
for (unsigned i = 0; i < ARRAY_SIZE(node_data); ++i)
if (!node_data[i])
node_data[i] = nir_imm_int(&b, 0);
for (unsigned i = 0; i < 4; ++i) {
nir_build_store_global(&b, nir_vec(&b, node_data + i * 4, 4),
nir_iadd(&b, triangle_node_dst_addr, nir_imm_int64(&b, i * 16)),
.write_mask = 15, .align_mul = 16, .align_offset = 0);
}
nir_ssa_def *node_id = nir_ushr(&b, node_offset, nir_imm_int(&b, 3));
nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,
.align_offset = 0);
}
nir_push_else(&b, NULL);
nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_AABBS_KHR)));
{ /* AABBs */
nir_ssa_def *aabb_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));
nir_ssa_def *aabb_stride = nir_channel(&b, pconst2, 2);
nir_ssa_def *node_offset =
nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));
nir_ssa_def *aabb_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
nir_ssa_def *node_id =
nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 7));
nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,
.align_offset = 0);
aabb_addr = nir_iadd(&b, aabb_addr, nir_u2u64(&b, nir_imul(&b, aabb_stride, global_id)));
nir_ssa_def *min_bound =
nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 0)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *max_bound =
nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 12)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *values[] = {nir_channel(&b, min_bound, 0),
nir_channel(&b, min_bound, 1),
nir_channel(&b, min_bound, 2),
nir_channel(&b, max_bound, 0),
nir_channel(&b, max_bound, 1),
nir_channel(&b, max_bound, 2),
global_id,
geometry_id};
nir_build_store_global(&b, nir_vec(&b, values + 0, 4),
nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 0)),
.write_mask = 15, .align_mul = 16, .align_offset = 0);
nir_build_store_global(&b, nir_vec(&b, values + 4, 4),
nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 16)),
.write_mask = 15, .align_mul = 16, .align_offset = 0);
}
nir_push_else(&b, NULL);
{ /* Instances */
nir_ssa_def *instance_addr =
nir_iadd(&b, nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3)),
nir_u2u64(&b, nir_imul(&b, global_id, nir_imm_int(&b, 64))));
nir_ssa_def *inst_transform[] = {
nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 0)),
.align_mul = 4, .align_offset = 0),
nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 16)),
.align_mul = 4, .align_offset = 0),
nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 32)),
.align_mul = 4, .align_offset = 0)};
nir_ssa_def *inst3 =
nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 48)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *node_offset =
nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 128)));
node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
nir_ssa_def *node_id =
nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 6));
nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,
.align_offset = 0);
nir_variable *bounds[2] = {
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
};
nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
nir_ssa_def *header_addr = nir_pack_64_2x32(&b, nir_channels(&b, inst3, 12));
nir_push_if(&b, nir_ine(&b, header_addr, nir_imm_int64(&b, 0)));
nir_ssa_def *header_root_offset =
nir_build_load_global(&b, 1, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 0)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *header_min =
nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 8)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *header_max =
nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 20)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *bound_defs[2][3];
for (unsigned i = 0; i < 3; ++i) {
bound_defs[0][i] = bound_defs[1][i] = nir_channel(&b, inst_transform[i], 3);
nir_ssa_def *mul_a = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_min);
nir_ssa_def *mul_b = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_max);
nir_ssa_def *mi = nir_fmin(&b, mul_a, mul_b);
nir_ssa_def *ma = nir_fmax(&b, mul_a, mul_b);
for (unsigned j = 0; j < 3; ++j) {
bound_defs[0][i] = nir_fadd(&b, bound_defs[0][i], nir_channel(&b, mi, j));
bound_defs[1][i] = nir_fadd(&b, bound_defs[1][i], nir_channel(&b, ma, j));
}
}
nir_store_var(&b, bounds[0], nir_vec(&b, bound_defs[0], 3), 7);
nir_store_var(&b, bounds[1], nir_vec(&b, bound_defs[1], 3), 7);
nir_ssa_def *m_in[3][3], *m_out[3][3], *m_vec[3][4];
for (unsigned i = 0; i < 3; ++i)
for (unsigned j = 0; j < 3; ++j)
m_in[i][j] = nir_channel(&b, inst_transform[i], j);
nir_invert_3x3(&b, m_in, m_out);
for (unsigned i = 0; i < 3; ++i) {
for (unsigned j = 0; j < 3; ++j)
m_vec[i][j] = m_out[i][j];
m_vec[i][3] = nir_channel(&b, inst_transform[i], 3);
}
for (unsigned i = 0; i < 3; ++i) {
nir_build_store_global(&b, nir_vec(&b, m_vec[i], 4),
nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 16 * i)),
.write_mask = 0xf, .align_mul = 4, .align_offset = 0);
}
nir_ssa_def *out0[4] = {
nir_ior(&b, nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 0), header_root_offset),
nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 1), nir_channel(&b, inst3, 0),
nir_channel(&b, inst3, 1)};
nir_build_store_global(&b, nir_vec(&b, out0, 4),
nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)), .write_mask = 0xf,
.align_mul = 4, .align_offset = 0);
nir_build_store_global(&b, global_id, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 88)),
.write_mask = 0x1, .align_mul = 4, .align_offset = 0);
nir_pop_if(&b, NULL);
nir_build_store_global(&b, nir_load_var(&b, bounds[0]),
nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 64)), .write_mask = 0x7,
.align_mul = 4, .align_offset = 0);
nir_build_store_global(&b, nir_load_var(&b, bounds[1]),
nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 76)), .write_mask = 0x7,
.align_mul = 4, .align_offset = 0);
}
nir_pop_if(&b, NULL);
nir_pop_if(&b, NULL);
return b.shader;
}
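/* Load the bounding box of an already written node. The node type lives in
 * the low 3 bits of the node id (0 = triangle, 5 = internal box, 6 = instance,
 * otherwise AABB leaf) and the remaining bits encode the byte offset / 8. */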
static void
determine_bounds(nir_builder *b, nir_ssa_def *node_addr, nir_ssa_def *node_id,
nir_variable *bounds_vars[2])
{
nir_ssa_def *node_type = nir_iand(b, node_id, nir_imm_int(b, 7));
node_addr = nir_iadd(
b, node_addr,
nir_u2u64(b, nir_ishl(b, nir_iand(b, node_id, nir_imm_int(b, ~7u)), nir_imm_int(b, 3))));
nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 0)));
{
nir_ssa_def *positions[3];
for (unsigned i = 0; i < 3; ++i)
positions[i] =
nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *bounds[] = {positions[0], positions[0]};
for (unsigned i = 1; i < 3; ++i) {
bounds[0] = nir_fmin(b, bounds[0], positions[i]);
bounds[1] = nir_fmax(b, bounds[1], positions[i]);
}
nir_store_var(b, bounds_vars[0], bounds[0], 7);
nir_store_var(b, bounds_vars[1], bounds[1], 7);
}
nir_push_else(b, NULL);
nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 5)));
{
nir_ssa_def *input_bounds[4][2];
for (unsigned i = 0; i < 4; ++i)
for (unsigned j = 0; j < 2; ++j)
input_bounds[i][j] = nir_build_load_global(
b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 16 + i * 24 + j * 12)),
.align_mul = 4, .align_offset = 0);
nir_ssa_def *bounds[] = {input_bounds[0][0], input_bounds[0][1]};
for (unsigned i = 1; i < 4; ++i) {
bounds[0] = nir_fmin(b, bounds[0], input_bounds[i][0]);
bounds[1] = nir_fmax(b, bounds[1], input_bounds[i][1]);
}
nir_store_var(b, bounds_vars[0], bounds[0], 7);
nir_store_var(b, bounds_vars[1], bounds[1], 7);
}
nir_push_else(b, NULL);
nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 6)));
{ /* Instances */
nir_ssa_def *bounds[2];
for (unsigned i = 0; i < 2; ++i)
bounds[i] =
nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 64 + i * 12)),
.align_mul = 4, .align_offset = 0);
nir_store_var(b, bounds_vars[0], bounds[0], 7);
nir_store_var(b, bounds_vars[1], bounds[1], 7);
}
nir_push_else(b, NULL);
{ /* AABBs */
nir_ssa_def *bounds[2];
for (unsigned i = 0; i < 2; ++i)
bounds[i] =
nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),
.align_mul = 4, .align_offset = 0);
nir_store_var(b, bounds_vars[0], bounds[0], 7);
nir_store_var(b, bounds_vars[1], bounds[1], 7);
}
nir_pop_if(b, NULL);
nir_pop_if(b, NULL);
nir_pop_if(b, NULL);
}
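/* Shader that collapses up to four source nodes into one 128-byte internal
 * box node per invocation, writes the per-child bounds, and on the final
 * pass also writes the root node id and total bounds into the acceleration
 * structure header. */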
static nir_shader *
build_internal_shader(struct radv_device *dev)
{
const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
nir_builder b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_internal_shader");
b.shader->info.workgroup_size[0] = 64;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
/*
* push constants:
* i32 x 2: node dst address
* i32 x 2: scratch address
* i32: dst offset
* i32: dst scratch offset
* i32: src scratch offset
* i32: src_node_count | (fill_header << 31)
*/
nir_ssa_def *pconst0 =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
nir_ssa_def *pconst1 =
nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);
nir_ssa_def *node_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));
nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));
nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);
nir_ssa_def *dst_scratch_offset = nir_channel(&b, pconst1, 1);
nir_ssa_def *src_scratch_offset = nir_channel(&b, pconst1, 2);
nir_ssa_def *src_node_count =
nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x7FFFFFFFU));
nir_ssa_def *fill_header =
nir_ine(&b, nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x80000000U)),
nir_imm_int(&b, 0));
nir_ssa_def *global_id =
nir_iadd(&b,
nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),
nir_imm_int(&b, b.shader->info.workgroup_size[0])),
nir_channels(&b, nir_load_local_invocation_id(&b), 1));
nir_ssa_def *src_idx = nir_imul(&b, global_id, nir_imm_int(&b, 4));
nir_ssa_def *src_count = nir_umin(&b, nir_imm_int(&b, 4), nir_isub(&b, src_node_count, src_idx));
nir_ssa_def *node_offset =
nir_iadd(&b, node_dst_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 7)));
nir_ssa_def *node_dst_addr = nir_iadd(&b, node_addr, nir_u2u64(&b, node_offset));
nir_ssa_def *src_nodes = nir_build_load_global(
&b, 4, 32,
nir_iadd(&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, src_scratch_offset,
nir_ishl(&b, global_id, nir_imm_int(&b, 4))))),
.align_mul = 4, .align_offset = 0);
nir_build_store_global(&b, src_nodes, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)),
.write_mask = 0xf, .align_mul = 4, .align_offset = 0);
nir_ssa_def *total_bounds[2] = {
nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),
nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),
};
for (unsigned i = 0; i < 4; ++i) {
nir_variable *bounds[2] = {
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
};
nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
nir_push_if(&b, nir_ilt(&b, nir_imm_int(&b, i), src_count));
determine_bounds(&b, node_addr, nir_channel(&b, src_nodes, i), bounds);
nir_pop_if(&b, NULL);
nir_build_store_global(&b, nir_load_var(&b, bounds[0]),
nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 24 * i)),
.write_mask = 0x7, .align_mul = 4, .align_offset = 0);
nir_build_store_global(&b, nir_load_var(&b, bounds[1]),
nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 28 + 24 * i)),
.write_mask = 0x7, .align_mul = 4, .align_offset = 0);
total_bounds[0] = nir_fmin(&b, total_bounds[0], nir_load_var(&b, bounds[0]));
total_bounds[1] = nir_fmax(&b, total_bounds[1], nir_load_var(&b, bounds[1]));
}
nir_ssa_def *node_id =
nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 5));
nir_ssa_def *dst_scratch_addr = nir_iadd(
&b, scratch_addr,
nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 2)))));
nir_build_store_global(&b, node_id, dst_scratch_addr, .write_mask = 1, .align_mul = 4,
.align_offset = 0);
nir_push_if(&b, fill_header);
nir_build_store_global(&b, node_id, node_addr, .write_mask = 1, .align_mul = 4,
.align_offset = 0);
nir_build_store_global(&b, total_bounds[0], nir_iadd(&b, node_addr, nir_imm_int64(&b, 8)),
.write_mask = 7, .align_mul = 4, .align_offset = 0);
nir_build_store_global(&b, total_bounds[1], nir_iadd(&b, node_addr, nir_imm_int64(&b, 20)),
.write_mask = 7, .align_mul = 4, .align_offset = 0);
nir_pop_if(&b, NULL);
return b.shader;
}
void
radv_device_finish_accel_struct_build_state(struct radv_device *device)
{
struct radv_meta_state *state = &device->meta_state;
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.internal_pipeline,
&state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline,
&state->alloc);
radv_DestroyPipelineLayout(radv_device_to_handle(device),
state->accel_struct_build.internal_p_layout, &state->alloc);
radv_DestroyPipelineLayout(radv_device_to_handle(device),
state->accel_struct_build.leaf_p_layout, &state->alloc);
}
VkResult
radv_device_init_accel_struct_build_state(struct radv_device *device)
{
VkResult result;
nir_shader *leaf_cs = build_leaf_shader(device);
nir_shader *internal_cs = build_internal_shader(device);
const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 0,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
sizeof(struct build_primitive_constants)},
};
result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,
&device->meta_state.alloc,
&device->meta_state.accel_struct_build.leaf_p_layout);
if (result != VK_SUCCESS)
goto fail;
VkPipelineShaderStageCreateInfo leaf_shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(leaf_cs),
.pName = "main",
.pSpecializationInfo = NULL,
};
VkComputePipelineCreateInfo leaf_pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = leaf_shader_stage,
.flags = 0,
.layout = device->meta_state.accel_struct_build.leaf_p_layout,
};
result = radv_CreateComputePipelines(
radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
&leaf_pipeline_info, NULL, &device->meta_state.accel_struct_build.leaf_pipeline);
if (result != VK_SUCCESS)
goto fail;
const VkPipelineLayoutCreateInfo internal_pl_create_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 0,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
sizeof(struct build_internal_constants)},
};
result = radv_CreatePipelineLayout(radv_device_to_handle(device), &internal_pl_create_info,
&device->meta_state.alloc,
&device->meta_state.accel_struct_build.internal_p_layout);
if (result != VK_SUCCESS)
goto fail;
VkPipelineShaderStageCreateInfo internal_shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(internal_cs),
.pName = "main",
.pSpecializationInfo = NULL,
};
VkComputePipelineCreateInfo internal_pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = internal_shader_stage,
.flags = 0,
.layout = device->meta_state.accel_struct_build.internal_p_layout,
};
result = radv_CreateComputePipelines(
radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
&internal_pipeline_info, NULL, &device->meta_state.accel_struct_build.internal_pipeline);
if (result != VK_SUCCESS)
goto fail;
return VK_SUCCESS;
fail:
radv_device_finish_accel_struct_build_state(device);
ralloc_free(internal_cs);
ralloc_free(leaf_cs);
return result;
}
struct bvh_state {
uint32_t node_offset;
uint32_t node_count;
uint32_t scratch_offset;
};
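/* Per-build tracking for the bottom-up passes: where the next internal nodes
 * are written, how many nodes the previous pass produced, and which scratch
 * region currently holds their node ids. */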
void
radv_CmdBuildAccelerationStructuresKHR(
VkCommandBuffer commandBuffer, uint32_t infoCount,
const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
struct radv_meta_saved_state saved_state;
radv_meta_save(
&saved_state, cmd_buffer,
RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state));
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
cmd_buffer->device->meta_state.accel_struct_build.leaf_pipeline);
for (uint32_t i = 0; i < infoCount; ++i) {
RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,
pInfos[i].dstAccelerationStructure);
struct build_primitive_constants prim_consts = {
.node_dst_addr = radv_accel_struct_get_va(accel_struct),
.scratch_addr = pInfos[i].scratchData.deviceAddress,
.dst_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64) + 128,
.dst_scratch_offset = 0,
};
for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) {
const VkAccelerationStructureGeometryKHR *geom =
pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j];
prim_consts.geometry_type = geom->geometryType;
prim_consts.geometry_id = j | (geom->flags << 28);
unsigned prim_size;
switch (geom->geometryType) {
case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
prim_consts.vertex_addr =
geom->geometry.triangles.vertexData.deviceAddress +
ppBuildRangeInfos[i][j].firstVertex * geom->geometry.triangles.vertexStride +
(geom->geometry.triangles.indexType != VK_INDEX_TYPE_NONE_KHR
? ppBuildRangeInfos[i][j].primitiveOffset
: 0);
prim_consts.index_addr = geom->geometry.triangles.indexData.deviceAddress +
ppBuildRangeInfos[i][j].primitiveOffset;
prim_consts.transform_addr = geom->geometry.triangles.transformData.deviceAddress +
ppBuildRangeInfos[i][j].transformOffset;
prim_consts.vertex_stride = geom->geometry.triangles.vertexStride;
prim_consts.vertex_format = geom->geometry.triangles.vertexFormat;
prim_consts.index_format = geom->geometry.triangles.indexType;
prim_size = 64;
break;
case VK_GEOMETRY_TYPE_AABBS_KHR:
prim_consts.aabb_addr =
geom->geometry.aabbs.data.deviceAddress + ppBuildRangeInfos[i][j].primitiveOffset;
prim_consts.aabb_stride = geom->geometry.aabbs.stride;
prim_size = 64;
break;
case VK_GEOMETRY_TYPE_INSTANCES_KHR:
prim_consts.instance_data = geom->geometry.instances.data.deviceAddress;
prim_size = 128;
break;
default:
unreachable("Unknown geometryType");
}
radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), &prim_consts);
radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1);
prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount;
prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount;
}
bvh_states[i].node_offset = prim_consts.dst_offset;
bvh_states[i].node_count = prim_consts.dst_scratch_offset / 4;
}
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
cmd_buffer->device->meta_state.accel_struct_build.internal_pipeline);
bool progress = true;
for (unsigned iter = 0; progress; ++iter) {
progress = false;
for (uint32_t i = 0; i < infoCount; ++i) {
RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,
pInfos[i].dstAccelerationStructure);
if (iter && bvh_states[i].node_count == 1)
continue;
if (!progress) {
cmd_buffer->state.flush_bits |=
RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
radv_src_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, NULL) |
radv_dst_access_flush(cmd_buffer,
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, NULL);
}
progress = true;
uint32_t dst_node_count = MAX2(1, DIV_ROUND_UP(bvh_states[i].node_count, 4));
bool final_iter = dst_node_count == 1;
uint32_t src_scratch_offset = bvh_states[i].scratch_offset;
uint32_t dst_scratch_offset = src_scratch_offset ? 0 : bvh_states[i].node_count * 4;
uint32_t dst_node_offset = bvh_states[i].node_offset;
if (final_iter)
dst_node_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64);
const struct build_internal_constants consts = {
.node_dst_addr = radv_accel_struct_get_va(accel_struct),
.scratch_addr = pInfos[i].scratchData.deviceAddress,
.dst_offset = dst_node_offset,
.dst_scratch_offset = dst_scratch_offset,
.src_scratch_offset = src_scratch_offset,
.fill_header = bvh_states[i].node_count | (final_iter ? 0x80000000U : 0),
};
radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
cmd_buffer->device->meta_state.accel_struct_build.internal_p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
radv_unaligned_dispatch(cmd_buffer, dst_node_count, 1, 1);
bvh_states[i].node_offset += dst_node_count * 128;
bvh_states[i].node_count = dst_node_count;
bvh_states[i].scratch_offset = dst_scratch_offset;
}
}
free(bvh_states);
radv_meta_restore(&saved_state, cmd_buffer);
}

@@ -474,8 +474,14 @@ radv_device_init_meta(struct radv_device *device)
if (result != VK_SUCCESS)
goto fail_fmask_expand;
result = radv_device_init_accel_struct_build_state(device);
if (result != VK_SUCCESS)
goto fail_accel_struct_build;
return VK_SUCCESS;
fail_accel_struct_build:
radv_device_finish_meta_fmask_expand_state(device);
fail_fmask_expand:
radv_device_finish_meta_resolve_fragment_state(device);
fail_resolve_fragment:
@@ -507,6 +513,7 @@ fail_clear:
void
radv_device_finish_meta(struct radv_device *device)
{
radv_device_finish_accel_struct_build_state(device);
radv_device_finish_meta_clear_state(device);
radv_device_finish_meta_resolve_state(device);
radv_device_finish_meta_blit_state(device);

@@ -133,6 +133,9 @@ void radv_device_finish_meta_dcc_retile_state(struct radv_device *device);
void radv_device_finish_meta_copy_vrs_htile_state(struct radv_device *device);
VkResult radv_device_init_accel_struct_build_state(struct radv_device *device);
void radv_device_finish_accel_struct_build_state(struct radv_device *device);
void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer,
uint32_t flags);

@@ -661,6 +661,13 @@ struct radv_meta_state {
VkPipelineLayout p_layout;
VkPipeline pipeline;
} dcc_retile;
struct {
VkPipelineLayout leaf_p_layout;
VkPipeline leaf_pipeline;
VkPipelineLayout internal_p_layout;
VkPipeline internal_pipeline;
} accel_struct_build;
};
/* queue types */