mesa/src/amd/common/ac_nir.c

/*
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "ac_nir.h"
#include "nir_builder.h"
nir_ssa_def *
ac_nir_load_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg)
{
unsigned num_components = ac_args->args[arg.arg_index].size;
if (ac_args->args[arg.arg_index].file == AC_ARG_SGPR)
return nir_load_scalar_arg_amd(b, num_components, .base = arg.arg_index);
else
return nir_load_vector_arg_amd(b, num_components, .base = arg.arg_index);
}
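
/* Hypothetical usage sketch (not part of this file; the "args" pointer is
 * illustrative): load the base-vertex SGPR that was declared during
 * ac_shader_args setup. SGPR args come back as scalar loads, VGPR args as
 * vector loads, with the component count taken from the arg declaration.
 *
 *    nir_ssa_def *base_vertex =
 *       ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
 */
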
/**
 * This function takes an I/O intrinsic like load/store_input,
 * and emits a sequence that calculates the full offset of that instruction,
 * including a stride to the base and component offsets.
 */
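/* Put differently, the value returned is (a sketch; "off" stands for the
 * intrinsic's indirect offset source):
 *
 *    base_stride * (map_io ? map_io(location) : base)
 *       + base_stride * off
 *       + nir_intrinsic_component(intrin) * component_stride
 */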
nir_ssa_def *
ac_nir_calc_io_offset(nir_builder *b,
                      nir_intrinsic_instr *intrin,
                      nir_ssa_def *base_stride,
                      unsigned component_stride,
                      ac_nir_map_io_driver_location map_io)
{
   unsigned base = nir_intrinsic_base(intrin);
   unsigned semantic = nir_intrinsic_io_semantics(intrin).location;
   unsigned mapped_driver_location = map_io ? map_io(semantic) : base;

   /* base is the driver_location, which is in slots (1 slot = 4x4 bytes) */
   nir_ssa_def *base_op = nir_imul_imm(b, base_stride, mapped_driver_location);

   /* offset should be interpreted in relation to the base,
    * so the instruction effectively reads/writes another input/output
    * when it has an offset
    */
   nir_ssa_def *offset_op = nir_imul(b, base_stride, nir_ssa_for_src(b, *nir_get_io_offset_src(intrin), 1));

   /* The constant component offset is in bytes: component index * component_stride. */
   unsigned const_op = nir_intrinsic_component(intrin) * component_stride;

   return nir_iadd_imm_nuw(b, nir_iadd_nuw(b, base_op, offset_op), const_op);
}

bool
ac_nir_lower_indirect_derefs(nir_shader *shader,
                             enum amd_gfx_level gfx_level)
{
   bool progress = false;

   /* Lower large variables to scratch first so that we won't bloat the
    * shader by generating large if-ladders for them. We later lower
    * scratch to alloca's, assuming LLVM won't generate VGPR indexing.
    */
   NIR_PASS(progress, shader, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
            glsl_get_natural_size_align_bytes);

   /* LLVM doesn't support VGPR indexing on GFX9. */
   bool llvm_has_working_vgpr_indexing = gfx_level != GFX9;

   /* TODO: Indirect indexing of GS inputs is unimplemented.
    *
    * TCS and TES load inputs directly from LDS or offchip memory, so
    * indirect indexing is trivial.
    */
   nir_variable_mode indirect_mask = 0;
   if (shader->info.stage == MESA_SHADER_GEOMETRY ||
       (shader->info.stage != MESA_SHADER_TESS_CTRL && shader->info.stage != MESA_SHADER_TESS_EVAL &&
        !llvm_has_working_vgpr_indexing)) {
      indirect_mask |= nir_var_shader_in;
   }
   if (!llvm_has_working_vgpr_indexing && shader->info.stage != MESA_SHADER_TESS_CTRL)
      indirect_mask |= nir_var_shader_out;

   /* TODO: We shouldn't need to do this, however LLVM isn't currently smart
    * enough to handle indirects without excess spilling, which causes the GPU
    * to hang.
    *
    * See the following thread for more details of the problem:
    * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html
    */
   indirect_mask |= nir_var_function_temp;

   NIR_PASS(progress, shader, nir_lower_indirect_derefs, indirect_mask, UINT32_MAX);
   return progress;
}
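
/* Store the gathered output values for the given vertex stream to the bound
 * streamout (transform feedback) buffers. Only invocations whose subgroup
 * invocation ID is below the streamout vertex count perform the stores; the
 * per-buffer strides and dst_offsets come from pipe_stream_output_info.
 */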
static void
emit_streamout(nir_builder *b, const struct pipe_stream_output_info *info, unsigned stream,
               nir_ssa_def *const outputs[64][4])
{
   nir_ssa_def *so_vtx_count = nir_ubfe_imm(b, nir_load_streamout_config_amd(b), 16, 7);
   nir_ssa_def *tid = nir_load_subgroup_invocation(b);

   nir_push_if(b, nir_ilt(b, tid, so_vtx_count));
   nir_ssa_def *so_write_index = nir_load_streamout_write_index_amd(b);

   nir_ssa_def *so_buffers[PIPE_MAX_SO_BUFFERS];
   nir_ssa_def *so_write_offset[PIPE_MAX_SO_BUFFERS];
   for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      uint16_t stride = info->stride[i];
      if (!stride)
         continue;

      so_buffers[i] = nir_load_streamout_buffer_amd(b, i);

      nir_ssa_def *offset = nir_load_streamout_offset_amd(b, i);
      offset = nir_iadd(b, nir_imul_imm(b, nir_iadd(b, so_write_index, tid), stride * 4),
                        nir_imul_imm(b, offset, 4));
      so_write_offset[i] = offset;
   }

   nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      if (stream != output->stream)
         continue;

      nir_ssa_def *vec[4] = {undef, undef, undef, undef};
      uint8_t mask = 0;
      for (unsigned j = 0; j < output->num_components; j++) {
         if (outputs[output->register_index][output->start_component + j]) {
            vec[j] = outputs[output->register_index][output->start_component + j];
            mask |= 1 << j;
         }
      }

      if (!mask)
         continue;

      unsigned buffer = output->output_buffer;
      nir_ssa_def *data = nir_vec(b, vec, output->num_components);
      nir_ssa_def *zero = nir_imm_int(b, 0);
      nir_store_buffer_amd(b, data, so_buffers[buffer], so_write_offset[buffer], zero, zero,
                           .base = output->dst_offset * 4, .slc_amd = true, .write_mask = mask,
                           .access = ACCESS_COHERENT);
   }

   nir_pop_if(b, NULL);
}
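
/* Build a legacy GS copy shader: a separate hardware VS that reads the GS
 * outputs back from the GSVS ring buffer, performs streamout for each vertex
 * stream when requested, and exports the stream-0 outputs as vertex outputs.
 */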
nir_shader *
ac_nir_create_gs_copy_shader(const nir_shader *gs_nir,
                             const struct pipe_stream_output_info *so_info, size_t num_outputs,
                             const uint8_t *output_usage_mask, const uint8_t *output_streams,
                             const uint8_t *output_semantics,
                             const uint8_t num_stream_output_components[4])
{
   assert(num_outputs <= 64);

   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_VERTEX, gs_nir->options, "gs_copy");

   nir_foreach_shader_out_variable(var, gs_nir)
      nir_shader_add_variable(b.shader, nir_variable_clone(var, b.shader));

   nir_ssa_def *gsvs_ring = nir_load_ring_gsvs_amd(&b);

   nir_ssa_def *stream_id = NULL;
   if (so_info->num_outputs)
      stream_id = nir_ubfe_imm(&b, nir_load_streamout_config_amd(&b), 24, 2);

   nir_ssa_def *vtx_offset = nir_imul_imm(&b, nir_load_vertex_id_zero_base(&b), 4);
   nir_ssa_def *zero = nir_imm_zero(&b, 1, 32);

   for (unsigned stream = 0; stream < 4; stream++) {
      if (stream > 0 && (!stream_id || !num_stream_output_components[stream]))
         continue;

      if (stream_id)
         nir_push_if(&b, nir_ieq_imm(&b, stream_id, stream));

      uint32_t offset = 0;
      uint64_t output_mask = 0;
      nir_ssa_def *outputs[64][4] = {{0}};

      for (unsigned i = 0; i < num_outputs; i++) {
         unsigned mask = output_usage_mask[i];
         if (!mask)
            continue;

         u_foreach_bit (j, mask) {
            if (((output_streams[i] >> (j * 2)) & 0x3) != stream)
               continue;

            outputs[i][j] = nir_load_buffer_amd(&b, 1, 32, gsvs_ring, vtx_offset, zero, zero,
                                                .base = offset, .is_swizzled = false,
                                                .slc_amd = true, .access = ACCESS_COHERENT);
            offset += gs_nir->info.gs.vertices_out * 16 * 4;
         }

         output_mask |= 1ull << i;
      }

      if (stream_id)
         emit_streamout(&b, so_info, stream, outputs);

      if (stream == 0) {
         u_foreach_bit64 (i, output_mask) {
            gl_varying_slot location = output_semantics ? output_semantics[i] : i;

            for (unsigned j = 0; j < 4; j++) {
               if (outputs[i][j]) {
                  nir_store_output(&b, outputs[i][j], zero,
                                   .base = i,
                                   .component = j,
                                   .write_mask = 1,
                                   .src_type = nir_type_uint32,
                                   .io_semantics = {.location = location, .num_slots = 1});
               }
            }
         }

         nir_export_vertex_amd(&b);
      }

      if (stream_id)
         nir_push_else(&b, NULL);
   }

   b.shader->info.clip_distance_array_size = gs_nir->info.clip_distance_array_size;
   b.shader->info.cull_distance_array_size = gs_nir->info.cull_distance_array_size;

   return b.shader;
}
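
/* Collect the values written by constant-offset store_output intrinsics into
 * the outputs[slot][component] array, so they can be fed to emit_streamout().
 */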
static void
gather_outputs(nir_builder *b, nir_function_impl *impl, nir_ssa_def *outputs[64][4])
{
   /* Assume:
    * - the shader used nir_lower_io_to_temporaries
    * - 64-bit outputs are lowered
    * - no indirect indexing is present
    */
   nir_foreach_block(block, impl) {
      nir_foreach_instr (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_output)
            continue;

         assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));

         unsigned slot = nir_intrinsic_base(intrin);
         u_foreach_bit (i, nir_intrinsic_write_mask(intrin)) {
            unsigned comp = nir_intrinsic_component(intrin) + i;
            outputs[slot][comp] = nir_channel(b, intrin->src[0].ssa, i);
         }
      }
   }
}
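
/* Append the epilogue of a legacy (non-NGG) hardware VS: optionally export
 * the primitive ID for the fragment shader, emit streamout stores for stream 0
 * if transform feedback is used, and finally export the vertex.
 */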
void
ac_nir_lower_legacy_vs(nir_shader *nir, int primitive_id_location,
                       const struct pipe_stream_output_info *so_info)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_metadata preserved = nir_metadata_block_index | nir_metadata_dominance;

   nir_builder b;
   nir_builder_init(&b, impl);
   b.cursor = nir_after_cf_list(&impl->body);

   if (primitive_id_location >= 0) {
      /* When the primitive ID is read by FS, we must ensure that it's exported by the previous
       * vertex stage because it's implicit for VS or TES (but required by the Vulkan spec for GS
       * or MS).
       */
      nir_variable *var = nir_variable_create(nir, nir_var_shader_out, glsl_int_type(), NULL);
      var->data.location = VARYING_SLOT_PRIMITIVE_ID;
      var->data.interpolation = INTERP_MODE_NONE;
      var->data.driver_location = primitive_id_location;

      nir_store_output(
         &b, nir_load_primitive_id(&b), nir_imm_int(&b, 0), .base = primitive_id_location,
         .src_type = nir_type_int32,
         .io_semantics = (nir_io_semantics){.location = var->data.location, .num_slots = 1});

      /* Update outputs_written to reflect that the pass added a new output. */
      nir->info.outputs_written |= BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_ID);
   }

   if (so_info && so_info->num_outputs) {
      /* 26.1. Transform Feedback of Vulkan 1.3.229 spec:
       * > The size of each component of an output variable must be at least 32-bits.
       * We lower 64-bit outputs.
       */
      nir_ssa_def *outputs[64][4] = {{0}};
      gather_outputs(&b, impl, outputs);

      emit_streamout(&b, so_info, 0, outputs);
      preserved = nir_metadata_none;
   }

   nir_export_vertex_amd(&b);
   nir_metadata_preserve(impl, preserved);
}
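
/* Hypothetical call site (a sketch; "prim_id_location" and "so_info" are
 * illustrative driver-side variables, not part of this file):
 *
 *    NIR_PASS_V(nir, ac_nir_lower_legacy_vs, prim_id_location, &so_info);
 */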