2022-05-26 12:20:36 +01:00
|
|
|
/*
|
|
|
|
* Copyright © 2022 Valve Corporation
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
* Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Timur Kristóf
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "nir.h"
|
|
|
|
#include "nir_builder.h"
|
|
|
|
#include "util/u_math.h"
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
uint32_t task_count_shared_addr;
|
|
|
|
} lower_task_nv_state;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
/* If true, lower all task_payload I/O to use shared memory. */
|
|
|
|
bool payload_in_shared;
|
|
|
|
/* Shared memory address where task_payload will be located. */
|
|
|
|
uint32_t payload_shared_addr;
|
|
|
|
} lower_task_state;
|
|
|
|
|
|
|
|
static bool
|
|
|
|
lower_nv_task_output(nir_builder *b,
|
|
|
|
nir_instr *instr,
|
|
|
|
void *state)
|
|
|
|
{
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
lower_task_nv_state *s = (lower_task_nv_state *) state;
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
|
|
|
|
|
|
switch (intrin->intrinsic) {
|
|
|
|
case nir_intrinsic_load_output: {
|
|
|
|
b->cursor = nir_after_instr(instr);
|
|
|
|
nir_ssa_def *load =
|
|
|
|
nir_load_shared(b, 1, 32, nir_imm_int(b, 0),
|
|
|
|
.base = s->task_count_shared_addr);
|
|
|
|
nir_ssa_def_rewrite_uses(&intrin->dest.ssa, load);
|
|
|
|
nir_instr_remove(instr);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
case nir_intrinsic_store_output: {
|
|
|
|
b->cursor = nir_after_instr(instr);
|
|
|
|
nir_ssa_def *store_val = intrin->src[0].ssa;
|
|
|
|
nir_store_shared(b, store_val, nir_imm_int(b, 0),
|
|
|
|
.base = s->task_count_shared_addr);
|
|
|
|
nir_instr_remove(instr);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
append_launch_mesh_workgroups_to_nv_task(nir_builder *b,
|
|
|
|
lower_task_nv_state *s)
|
|
|
|
{
|
|
|
|
/* At the beginning of the shader, write 0 to the task count.
|
|
|
|
* This ensures that 0 mesh workgroups are launched when the
|
|
|
|
* shader doesn't write the TASK_COUNT output.
|
|
|
|
*/
|
|
|
|
b->cursor = nir_before_cf_list(&b->impl->body);
|
|
|
|
nir_ssa_def *zero = nir_imm_int(b, 0);
|
|
|
|
nir_store_shared(b, zero, zero, .base = s->task_count_shared_addr);
|
|
|
|
|
|
|
|
/* At the end of the shader, read the task count from shared memory
|
|
|
|
* and emit launch_mesh_workgroups.
|
|
|
|
*/
|
|
|
|
b->cursor = nir_after_cf_list(&b->impl->body);
|
|
|
|
nir_ssa_def *task_count =
|
|
|
|
nir_load_shared(b, 1, 32, zero, .base = s->task_count_shared_addr);
|
|
|
|
|
|
|
|
/* NV_mesh_shader doesn't offer to choose which task_payload variable
|
|
|
|
* should be passed to mesh shaders, we just pass all.
|
|
|
|
*/
|
|
|
|
uint32_t range = b->shader->info.task_payload_size;
|
|
|
|
|
|
|
|
nir_ssa_def *one = nir_imm_int(b, 1);
|
|
|
|
nir_ssa_def *dispatch_3d = nir_vec3(b, task_count, one, one);
|
|
|
|
nir_launch_mesh_workgroups(b, dispatch_3d, .base = 0, .range = range);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* For NV_mesh_shader:
|
|
|
|
* Task shaders only have 1 output, TASK_COUNT which is a 32-bit
|
|
|
|
* unsigned int that contains the 1-dimensional mesh dispatch size.
|
|
|
|
* This output should behave like a shared variable.
|
|
|
|
*
|
|
|
|
* We lower this output to a shared variable and then we emit
|
|
|
|
* the new launch_mesh_workgroups intrinsic at the end of the shader.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
nir_lower_nv_task_count(nir_shader *shader)
|
|
|
|
{
|
|
|
|
lower_task_nv_state state = {
|
|
|
|
.task_count_shared_addr = ALIGN(shader->info.shared_size, 4),
|
|
|
|
};
|
|
|
|
|
|
|
|
shader->info.shared_size += 4;
|
|
|
|
nir_shader_instructions_pass(shader, lower_nv_task_output,
|
|
|
|
nir_metadata_none, &state);
|
|
|
|
|
|
|
|
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
|
|
nir_builder builder;
|
|
|
|
nir_builder_init(&builder, impl);
|
|
|
|
|
|
|
|
append_launch_mesh_workgroups_to_nv_task(&builder, &state);
|
|
|
|
nir_metadata_preserve(impl, nir_metadata_none);
|
|
|
|
}
|
|
|
|
|
|
|
|
static nir_intrinsic_op
|
|
|
|
shared_opcode_for_task_payload(nir_intrinsic_op task_payload_op)
|
|
|
|
{
|
|
|
|
switch (task_payload_op) {
|
|
|
|
#define OP(O) case nir_intrinsic_task_payload_##O: return nir_intrinsic_shared_##O;
|
|
|
|
OP(atomic_exchange)
|
|
|
|
OP(atomic_comp_swap)
|
|
|
|
OP(atomic_add)
|
|
|
|
OP(atomic_imin)
|
|
|
|
OP(atomic_umin)
|
|
|
|
OP(atomic_imax)
|
|
|
|
OP(atomic_umax)
|
|
|
|
OP(atomic_and)
|
|
|
|
OP(atomic_or)
|
|
|
|
OP(atomic_xor)
|
|
|
|
OP(atomic_fadd)
|
|
|
|
OP(atomic_fmin)
|
|
|
|
OP(atomic_fmax)
|
|
|
|
OP(atomic_fcomp_swap)
|
|
|
|
#undef OP
|
|
|
|
case nir_intrinsic_load_task_payload:
|
|
|
|
return nir_intrinsic_load_shared;
|
|
|
|
case nir_intrinsic_store_task_payload:
|
|
|
|
return nir_intrinsic_store_shared;
|
|
|
|
default:
|
|
|
|
unreachable("Invalid task payload atomic");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
lower_task_payload_to_shared(nir_builder *b,
|
|
|
|
nir_intrinsic_instr *intrin,
|
|
|
|
lower_task_state *s)
|
|
|
|
{
|
|
|
|
/* This assumes that shared and task_payload intrinsics
|
|
|
|
* have the same number of sources and same indices.
|
|
|
|
*/
|
|
|
|
unsigned base = nir_intrinsic_base(intrin);
|
|
|
|
intrin->intrinsic = shared_opcode_for_task_payload(intrin->intrinsic);
|
|
|
|
nir_intrinsic_set_base(intrin, base + s->payload_shared_addr);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
emit_shared_to_payload_copy(nir_builder *b,
|
|
|
|
uint32_t payload_addr,
|
|
|
|
uint32_t payload_size,
|
|
|
|
lower_task_state *s)
|
|
|
|
{
|
|
|
|
const unsigned invocations = b->shader->info.workgroup_size[0] *
|
|
|
|
b->shader->info.workgroup_size[1] *
|
|
|
|
b->shader->info.workgroup_size[2];
|
|
|
|
const unsigned bytes_per_copy = 16;
|
|
|
|
const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);
|
|
|
|
const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
|
|
|
|
const unsigned base_shared_addr = s->payload_shared_addr + payload_addr;
|
|
|
|
|
|
|
|
nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
|
|
|
|
nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy);
|
|
|
|
|
|
|
|
/* Wait for all previous shared stores to finish.
|
|
|
|
* This is necessary because we placed the payload in shared memory.
|
|
|
|
*/
|
|
|
|
nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
|
|
|
|
.memory_scope = NIR_SCOPE_WORKGROUP,
|
|
|
|
.memory_semantics = NIR_MEMORY_ACQ_REL,
|
|
|
|
.memory_modes = nir_var_mem_shared);
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < copies_per_invocation; ++i) {
|
|
|
|
unsigned const_off = bytes_per_copy * invocations * i;
|
|
|
|
|
|
|
|
/* Read from shared memory. */
|
|
|
|
nir_ssa_def *copy =
|
|
|
|
nir_load_shared(b, 4, 32, addr, .align_mul = 16,
|
|
|
|
.base = base_shared_addr + const_off);
|
|
|
|
|
|
|
|
/* Write to task payload memory. */
|
|
|
|
nir_store_task_payload(b, copy, addr, .base = const_off);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
lower_task_launch_mesh_workgroups(nir_builder *b,
|
|
|
|
nir_intrinsic_instr *intrin,
|
|
|
|
lower_task_state *s)
|
|
|
|
{
|
|
|
|
if (s->payload_in_shared) {
|
|
|
|
/* Copy the payload from shared memory.
|
|
|
|
* Because launch_mesh_workgroups may only occur in
|
|
|
|
* workgroup-uniform control flow, here we assume that
|
|
|
|
* all invocations in the workgroup are active and therefore
|
|
|
|
* they can all participate in the copy.
|
|
|
|
*
|
|
|
|
* TODO: Skip the copy when the mesh dispatch size is (0, 0, 0).
|
|
|
|
* This is problematic because the dispatch size can be divergent,
|
|
|
|
* and may differ accross subgroups.
|
|
|
|
*/
|
|
|
|
|
|
|
|
uint32_t payload_addr = nir_intrinsic_base(intrin);
|
|
|
|
uint32_t payload_size = nir_intrinsic_range(intrin);
|
|
|
|
|
|
|
|
b->cursor = nir_before_instr(&intrin->instr);
|
|
|
|
emit_shared_to_payload_copy(b, payload_addr, payload_size, s);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The launch_mesh_workgroups intrinsic is a terminating instruction,
|
|
|
|
* so let's delete everything after it.
|
|
|
|
*/
|
|
|
|
b->cursor = nir_after_instr(&intrin->instr);
|
|
|
|
nir_block *current_block = nir_cursor_current_block(b->cursor);
|
|
|
|
|
|
|
|
/* Delete following instructions in the current block. */
|
|
|
|
nir_foreach_instr_reverse_safe(instr, current_block) {
|
|
|
|
if (instr == &intrin->instr)
|
|
|
|
break;
|
|
|
|
nir_instr_remove(instr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Delete following CF at the same level. */
|
|
|
|
b->cursor = nir_after_instr(&intrin->instr);
|
|
|
|
nir_cf_list extracted;
|
|
|
|
nir_cf_node *end_node = ¤t_block->cf_node;
|
|
|
|
while (!nir_cf_node_is_last(end_node))
|
|
|
|
end_node = nir_cf_node_next(end_node);
|
|
|
|
nir_cf_extract(&extracted, b->cursor, nir_after_cf_node(end_node));
|
|
|
|
nir_cf_delete(&extracted);
|
|
|
|
|
|
|
|
/* Terminate the task shader. */
|
|
|
|
b->cursor = nir_after_instr(&intrin->instr);
|
|
|
|
nir_jump(b, nir_jump_return);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
lower_task_intrin(nir_builder *b,
|
|
|
|
nir_instr *instr,
|
|
|
|
void *state)
|
|
|
|
{
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
lower_task_state *s = (lower_task_state *) state;
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
|
|
|
|
|
|
switch (intrin->intrinsic) {
|
|
|
|
case nir_intrinsic_task_payload_atomic_add:
|
|
|
|
case nir_intrinsic_task_payload_atomic_imin:
|
|
|
|
case nir_intrinsic_task_payload_atomic_umin:
|
|
|
|
case nir_intrinsic_task_payload_atomic_imax:
|
|
|
|
case nir_intrinsic_task_payload_atomic_umax:
|
|
|
|
case nir_intrinsic_task_payload_atomic_and:
|
|
|
|
case nir_intrinsic_task_payload_atomic_or:
|
|
|
|
case nir_intrinsic_task_payload_atomic_xor:
|
|
|
|
case nir_intrinsic_task_payload_atomic_exchange:
|
|
|
|
case nir_intrinsic_task_payload_atomic_comp_swap:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fadd:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fmin:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fmax:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fcomp_swap:
|
|
|
|
case nir_intrinsic_store_task_payload:
|
|
|
|
case nir_intrinsic_load_task_payload:
|
|
|
|
if (s->payload_in_shared)
|
|
|
|
return lower_task_payload_to_shared(b, intrin, s);
|
2022-05-27 19:33:52 +01:00
|
|
|
return false;
|
2022-05-26 12:20:36 +01:00
|
|
|
case nir_intrinsic_launch_mesh_workgroups:
|
|
|
|
return lower_task_launch_mesh_workgroups(b, intrin, s);
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
uses_task_payload_atomics(nir_shader *shader)
|
|
|
|
{
|
|
|
|
nir_foreach_function(func, shader) {
|
|
|
|
if (!func->impl)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
nir_foreach_block(block, func->impl) {
|
|
|
|
nir_foreach_instr(instr, block) {
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
|
|
switch (intrin->intrinsic) {
|
|
|
|
case nir_intrinsic_task_payload_atomic_add:
|
|
|
|
case nir_intrinsic_task_payload_atomic_imin:
|
|
|
|
case nir_intrinsic_task_payload_atomic_umin:
|
|
|
|
case nir_intrinsic_task_payload_atomic_imax:
|
|
|
|
case nir_intrinsic_task_payload_atomic_umax:
|
|
|
|
case nir_intrinsic_task_payload_atomic_and:
|
|
|
|
case nir_intrinsic_task_payload_atomic_or:
|
|
|
|
case nir_intrinsic_task_payload_atomic_xor:
|
|
|
|
case nir_intrinsic_task_payload_atomic_exchange:
|
|
|
|
case nir_intrinsic_task_payload_atomic_comp_swap:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fadd:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fmin:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fmax:
|
|
|
|
case nir_intrinsic_task_payload_atomic_fcomp_swap:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Common Task Shader lowering to make the job of the backends easier.
|
|
|
|
*
|
|
|
|
* - Lowers NV_mesh_shader TASK_COUNT output to launch_mesh_workgroups.
|
|
|
|
* - Removes all code after launch_mesh_workgroups, enforcing the
|
|
|
|
* fact that it's a terminating instruction.
|
|
|
|
* - Ensures that task shaders always have at least one
|
|
|
|
* launch_mesh_workgroups instruction, so the backend doesn't
|
|
|
|
* need to implement a special case when the shader doesn't have it.
|
|
|
|
* - Optionally, implements task_payload using shared memory when
|
|
|
|
* task_payload atomics are used.
|
|
|
|
* This is useful when the backend is otherwise not capable of
|
|
|
|
* handling the same atomic features as it can for shared memory.
|
|
|
|
* If this is used, the backend only has to implement the basic
|
|
|
|
* load/store operations for task_payload.
|
|
|
|
*
|
|
|
|
* Note, this pass operates on lowered explicit I/O intrinsics, so
|
|
|
|
* it should be called after nir_lower_io + nir_lower_explicit_io.
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
nir_lower_task_shader(nir_shader *shader,
|
|
|
|
nir_lower_task_shader_options options)
|
|
|
|
{
|
|
|
|
if (shader->info.stage != MESA_SHADER_TASK)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
|
|
nir_builder builder;
|
|
|
|
nir_builder_init(&builder, impl);
|
|
|
|
|
|
|
|
if (shader->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_TASK_COUNT)) {
|
|
|
|
/* NV_mesh_shader:
|
|
|
|
* If the shader writes TASK_COUNT, lower that to emit
|
|
|
|
* the new launch_mesh_workgroups intrinsic instead.
|
|
|
|
*/
|
|
|
|
nir_lower_nv_task_count(shader);
|
|
|
|
} else {
|
|
|
|
/* To make sure that task shaders always have a code path that
|
|
|
|
* executes a launch_mesh_workgroups, let's add one at the end.
|
|
|
|
* If the shader already had a launch_mesh_workgroups by any chance,
|
|
|
|
* this will be removed.
|
|
|
|
*/
|
|
|
|
builder.cursor = nir_after_cf_list(&builder.impl->body);
|
|
|
|
nir_launch_mesh_workgroups(&builder, nir_imm_zero(&builder, 3, 32));
|
|
|
|
}
|
|
|
|
|
|
|
|
bool payload_in_shared = options.payload_to_shared_for_atomics &&
|
|
|
|
uses_task_payload_atomics(shader);
|
|
|
|
|
|
|
|
lower_task_state state = {
|
|
|
|
.payload_shared_addr = ALIGN(shader->info.shared_size, 16),
|
|
|
|
.payload_in_shared = payload_in_shared,
|
|
|
|
};
|
|
|
|
|
|
|
|
if (payload_in_shared)
|
|
|
|
shader->info.shared_size =
|
|
|
|
state.payload_shared_addr + shader->info.task_payload_size;
|
|
|
|
|
|
|
|
nir_shader_instructions_pass(shader, lower_task_intrin,
|
|
|
|
nir_metadata_none, &state);
|
|
|
|
|
|
|
|
/* Delete all code that potentially can't be reached due to
|
|
|
|
* launch_mesh_workgroups being a terminating instruction.
|
|
|
|
*/
|
|
|
|
nir_lower_returns(shader);
|
|
|
|
bool progress;
|
|
|
|
do {
|
|
|
|
progress = false;
|
|
|
|
NIR_PASS(progress, shader, nir_opt_dead_cf);
|
|
|
|
NIR_PASS(progress, shader, nir_opt_dce);
|
|
|
|
} while (progress);
|
|
|
|
return true;
|
|
|
|
}
|