ac/nir: use shorter builder names

This makes a lot of lines shorter.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14455>
Author: Rhys Perry, 2022-01-06 19:07:37 +00:00 (committed by Marge Bot)
Parent: 533118413b
Commit: 9e171b6d49
3 changed files with 165 additions and 165 deletions
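
The pattern throughout this diff is purely mechanical: every call to a generated nir_build_* builder helper is replaced by the equivalent nir_* short form with identical arguments. Below is a minimal sketch of the before/after shape, using calls that appear verbatim in the hunks that follow; the emit_example() wrapper is hypothetical and exists only to give the snippet a valid C context, and the #define alias mentioned in the comment is an assumption about how the short names are provided, not something introduced by this commit.

#include "nir_builder.h" /* assumed include for the nir_* builder helpers */

/* Sketch only: illustrates the rename pattern, not code taken from the diff. */
static void
emit_example(nir_builder *b, nir_ssa_def *prim_id, nir_ssa_def *addr)
{
   /* Before this commit: the longer generated builder names. */
   nir_ssa_def *ccw_before = nir_build_load_cull_ccw_amd(b);
   nir_build_store_shared(b, prim_id, addr);

   /* After this commit: the shorter names, same arguments, same behaviour. */
   nir_ssa_def *ccw_after = nir_load_cull_ccw_amd(b);
   nir_store_shared(b, prim_id, addr);

   /* Assumption: the short names resolve to the same generated builder
    * helpers, roughly `#define nir_load_cull_ccw_amd nir_build_load_cull_ccw_amd`,
    * so only the call sites get shorter.
    */
   (void)ccw_before;
   (void)ccw_after;
}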

Changed file 1 of 3:

@ -70,10 +70,10 @@ cull_face(nir_builder *b, nir_ssa_def *pos[3][4], const position_w_info *w_info)
nir_ssa_def *front_facing_cw = nir_flt(b, det, nir_imm_float(b, 0.0f));
nir_ssa_def *front_facing_ccw = nir_flt(b, nir_imm_float(b, 0.0f), det);
nir_ssa_def *ccw = nir_build_load_cull_ccw_amd(b);
nir_ssa_def *ccw = nir_load_cull_ccw_amd(b);
nir_ssa_def *front_facing = nir_bcsel(b, ccw, front_facing_ccw, front_facing_cw);
nir_ssa_def *cull_front = nir_build_load_cull_front_face_enabled_amd(b);
nir_ssa_def *cull_back = nir_build_load_cull_back_face_enabled_amd(b);
nir_ssa_def *cull_front = nir_load_cull_front_face_enabled_amd(b);
nir_ssa_def *cull_back = nir_load_cull_back_face_enabled_amd(b);
nir_ssa_def *face_culled = nir_bcsel(b, front_facing, cull_front, cull_back);
@ -100,8 +100,8 @@ cull_bbox(nir_builder *b, nir_ssa_def *pos[3][4], nir_ssa_def *accepted, const p
bbox_max[chan] = nir_fmax(b, pos[0][chan], nir_fmax(b, pos[1][chan], pos[2][chan]));
}
nir_ssa_def *vp_scale[2] = { nir_build_load_viewport_x_scale(b), nir_build_load_viewport_y_scale(b), };
nir_ssa_def *vp_translate[2] = { nir_build_load_viewport_x_offset(b), nir_build_load_viewport_y_offset(b), };
nir_ssa_def *vp_scale[2] = { nir_load_viewport_x_scale(b), nir_load_viewport_y_scale(b), };
nir_ssa_def *vp_translate[2] = { nir_load_viewport_x_offset(b), nir_load_viewport_y_offset(b), };
nir_ssa_def *prim_outside_view = nir_imm_false(b);
/* Frustum culling - eliminate triangles that are fully outside the view. */
@ -114,9 +114,9 @@ cull_bbox(nir_builder *b, nir_ssa_def *pos[3][4], nir_ssa_def *accepted, const p
nir_ssa_def *prim_is_small_else = nir_imm_false(b);
/* Small primitive filter - eliminate triangles that are too small to affect a sample. */
nir_if *if_cull_small_prims = nir_push_if(b, nir_build_load_cull_small_primitives_enabled_amd(b));
nir_if *if_cull_small_prims = nir_push_if(b, nir_load_cull_small_primitives_enabled_amd(b));
{
nir_ssa_def *small_prim_precision = nir_build_load_cull_small_prim_precision_amd(b);
nir_ssa_def *small_prim_precision = nir_load_cull_small_prim_precision_amd(b);
prim_is_small = nir_imm_false(b);
for (unsigned chan = 0; chan < 2; ++chan) {

Changed file 2 of 3:

@ -189,7 +189,7 @@ summarize_repack(nir_builder *b, nir_ssa_def *packed_counts, unsigned num_lds_dw
nir_ssa_def *dot_op = !use_dot ? NULL : nir_ushr(b, nir_ushr(b, nir_imm_int(b, 0x01010101), shift), shift);
/* Broadcast the packed data we read from LDS (to the first 16 lanes, but we only care up to num_waves). */
nir_ssa_def *packed = nir_build_lane_permute_16_amd(b, packed_counts, nir_imm_int(b, 0), nir_imm_int(b, 0));
nir_ssa_def *packed = nir_lane_permute_16_amd(b, packed_counts, nir_imm_int(b, 0), nir_imm_int(b, 0));
/* Horizontally add the packed bytes. */
if (use_dot) {
@ -202,8 +202,8 @@ summarize_repack(nir_builder *b, nir_ssa_def *packed_counts, unsigned num_lds_dw
nir_ssa_def *dot_op = !use_dot ? NULL : nir_ushr(b, nir_ushr(b, nir_imm_int64(b, 0x0101010101010101), shift), shift);
/* Broadcast the packed data we read from LDS (to the first 16 lanes, but we only care up to num_waves). */
nir_ssa_def *packed_dw0 = nir_build_lane_permute_16_amd(b, nir_unpack_64_2x32_split_x(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0));
nir_ssa_def *packed_dw1 = nir_build_lane_permute_16_amd(b, nir_unpack_64_2x32_split_y(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0));
nir_ssa_def *packed_dw0 = nir_lane_permute_16_amd(b, nir_unpack_64_2x32_split_x(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0));
nir_ssa_def *packed_dw1 = nir_lane_permute_16_amd(b, nir_unpack_64_2x32_split_y(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0));
/* Horizontally add the packed bytes. */
if (use_dot) {
@ -238,14 +238,14 @@ repack_invocations_in_workgroup(nir_builder *b, nir_ssa_def *input_bool,
* Implemented by a scalar instruction that simply counts the number of bits set in a 32/64-bit mask.
*/
nir_ssa_def *input_mask = nir_build_ballot(b, 1, wave_size, input_bool);
nir_ssa_def *input_mask = nir_ballot(b, 1, wave_size, input_bool);
nir_ssa_def *surviving_invocations_in_current_wave = nir_bit_count(b, input_mask);
/* If we know at compile time that the workgroup has only 1 wave, no further steps are necessary. */
if (max_num_waves == 1) {
wg_repack_result r = {
.num_repacked_invocations = surviving_invocations_in_current_wave,
.repacked_invocation_index = nir_build_mbcnt_amd(b, input_mask, nir_imm_int(b, 0)),
.repacked_invocation_index = nir_mbcnt_amd(b, input_mask, nir_imm_int(b, 0)),
};
return r;
}
@ -263,16 +263,16 @@ repack_invocations_in_workgroup(nir_builder *b, nir_ssa_def *input_bool,
const unsigned num_lds_dwords = DIV_ROUND_UP(max_num_waves, 4);
assert(num_lds_dwords <= 2);
nir_ssa_def *wave_id = nir_build_load_subgroup_id(b);
nir_ssa_def *wave_id = nir_load_subgroup_id(b);
nir_ssa_def *dont_care = nir_ssa_undef(b, 1, num_lds_dwords * 32);
nir_if *if_first_lane = nir_push_if(b, nir_build_elect(b, 1));
nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
nir_build_store_shared(b, nir_u2u8(b, surviving_invocations_in_current_wave), wave_id, .base = lds_addr_base);
nir_store_shared(b, nir_u2u8(b, surviving_invocations_in_current_wave), wave_id, .base = lds_addr_base);
nir_scoped_barrier(b, .execution_scope=NIR_SCOPE_WORKGROUP, .memory_scope=NIR_SCOPE_WORKGROUP,
.memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
nir_ssa_def *packed_counts = nir_build_load_shared(b, 1, num_lds_dwords * 32, nir_imm_int(b, 0), .base = lds_addr_base, .align_mul = 8u);
nir_ssa_def *packed_counts = nir_load_shared(b, 1, num_lds_dwords * 32, nir_imm_int(b, 0), .base = lds_addr_base, .align_mul = 8u);
nir_pop_if(b, if_first_lane);
@ -293,12 +293,12 @@ repack_invocations_in_workgroup(nir_builder *b, nir_ssa_def *input_bool,
* This is the total number of surviving invocations in the workgroup.
*/
nir_ssa_def *num_waves = nir_build_load_num_subgroups(b);
nir_ssa_def *num_waves = nir_load_num_subgroups(b);
nir_ssa_def *sum = summarize_repack(b, packed_counts, num_lds_dwords);
nir_ssa_def *wg_repacked_index_base = nir_build_read_invocation(b, sum, wave_id);
nir_ssa_def *wg_num_repacked_invocations = nir_build_read_invocation(b, sum, num_waves);
nir_ssa_def *wg_repacked_index = nir_build_mbcnt_amd(b, input_mask, wg_repacked_index_base);
nir_ssa_def *wg_repacked_index_base = nir_read_invocation(b, sum, wave_id);
nir_ssa_def *wg_num_repacked_invocations = nir_read_invocation(b, sum, num_waves);
nir_ssa_def *wg_repacked_index = nir_mbcnt_amd(b, input_mask, wg_repacked_index_base);
wg_repack_result r = {
.num_repacked_invocations = wg_num_repacked_invocations,
@ -320,7 +320,7 @@ emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
bool use_edgeflags)
{
nir_ssa_def *arg = use_edgeflags
? nir_build_load_initial_edgeflags_amd(b)
? nir_load_initial_edgeflags_amd(b)
: nir_imm_int(b, 0);
for (unsigned i = 0; i < num_vertices_per_primitives; ++i) {
@ -341,7 +341,7 @@ emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
static nir_ssa_def *
ngg_input_primitive_vertex_index(nir_builder *b, unsigned vertex)
{
return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u),
return nir_ubfe(b, nir_load_gs_vertex_offset_amd(b, .base = vertex / 2u),
nir_imm_int(b, (vertex & 1u) * 16u), nir_imm_int(b, 16u));
}
@ -350,7 +350,7 @@ emit_ngg_nogs_prim_exp_arg(nir_builder *b, lower_ngg_nogs_state *st)
{
if (st->passthrough) {
assert(!st->export_prim_id || b->shader->info.stage != MESA_SHADER_VERTEX);
return nir_build_load_packed_passthrough_primitive_amd(b);
return nir_load_packed_passthrough_primitive_amd(b);
} else {
nir_ssa_def *vtx_idx[3] = {0};
@ -371,7 +371,7 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def
{
nir_ssa_def *gs_thread = st->gs_accepted_var
? nir_load_var(b, st->gs_accepted_var)
: nir_build_has_input_primitive_amd(b);
: nir_has_input_primitive_amd(b);
nir_if *if_gs_thread = nir_push_if(b, gs_thread);
{
@ -380,14 +380,14 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def
if (st->export_prim_id && b->shader->info.stage == MESA_SHADER_VERTEX) {
/* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
nir_ssa_def *prim_id = nir_build_load_primitive_id(b);
nir_ssa_def *prim_id = nir_load_primitive_id(b);
nir_ssa_def *provoking_vtx_idx = ngg_input_primitive_vertex_index(b, st->provoking_vtx_idx);
nir_ssa_def *addr = pervertex_lds_addr(b, provoking_vtx_idx, 4u);
nir_build_store_shared(b, prim_id, addr);
nir_store_shared(b, prim_id, addr);
}
nir_build_export_primitive_amd(b, arg);
nir_export_primitive_amd(b, arg);
}
nir_pop_if(b, if_gs_thread);
}
@ -403,14 +403,14 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b)
.memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_shared);
/* LDS address where the primitive ID is stored */
nir_ssa_def *thread_id_in_threadgroup = nir_build_load_local_invocation_index(b);
nir_ssa_def *thread_id_in_threadgroup = nir_load_local_invocation_index(b);
nir_ssa_def *addr = pervertex_lds_addr(b, thread_id_in_threadgroup, 4u);
/* Load primitive ID from LDS */
prim_id = nir_build_load_shared(b, 1, 32, addr);
prim_id = nir_load_shared(b, 1, 32, addr);
} else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) {
/* Just use tess eval primitive ID, which is the same as the patch ID. */
prim_id = nir_build_load_primitive_id(b);
prim_id = nir_load_primitive_id(b);
}
nir_io_semantics io_sem = {
@ -418,9 +418,9 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b)
.num_slots = 1,
};
nir_build_store_output(b, prim_id, nir_imm_zero(b, 1, 32),
.base = io_sem.location,
.src_type = nir_type_uint32, .io_semantics = io_sem);
nir_store_output(b, prim_id, nir_imm_zero(b, 1, 32),
.base = io_sem.location,
.src_type = nir_type_uint32, .io_semantics = io_sem);
}
static bool
@ -715,16 +715,16 @@ compact_vertices_after_culling(nir_builder *b,
nir_ssa_def *exporter_addr = pervertex_lds_addr(b, es_exporter_tid, pervertex_lds_bytes);
/* Store the exporter thread's index to the LDS space of the current thread so GS threads can load it */
nir_build_store_shared(b, nir_u2u8(b, es_exporter_tid), es_vertex_lds_addr, .base = lds_es_exporter_tid);
nir_store_shared(b, nir_u2u8(b, es_exporter_tid), es_vertex_lds_addr, .base = lds_es_exporter_tid);
/* Store the current thread's position output to the exporter thread's LDS space */
nir_ssa_def *pos = nir_load_var(b, position_value_var);
nir_build_store_shared(b, pos, exporter_addr, .base = lds_es_pos_x);
nir_store_shared(b, pos, exporter_addr, .base = lds_es_pos_x);
/* Store the current thread's repackable arguments to the exporter thread's LDS space */
for (unsigned i = 0; i < max_exported_args; ++i) {
nir_ssa_def *arg_val = nir_load_var(b, repacked_arg_vars[i]);
nir_intrinsic_instr *store = nir_build_store_shared(b, arg_val, exporter_addr, .base = lds_es_arg_0 + 4u * i);
nir_intrinsic_instr *store = nir_store_shared(b, arg_val, exporter_addr, .base = lds_es_arg_0 + 4u * i);
nogs_state->compact_arg_stores[i] = &store->instr;
}
@ -742,12 +742,12 @@ compact_vertices_after_culling(nir_builder *b,
nir_if *if_packed_es_thread = nir_push_if(b, es_survived);
{
/* Read position from the current ES thread's LDS space (written by the exported vertex's ES thread) */
nir_ssa_def *exported_pos = nir_build_load_shared(b, 4, 32, es_vertex_lds_addr, .base = lds_es_pos_x);
nir_ssa_def *exported_pos = nir_load_shared(b, 4, 32, es_vertex_lds_addr, .base = lds_es_pos_x);
nir_store_var(b, position_value_var, exported_pos, 0xfu);
/* Read the repacked arguments */
for (unsigned i = 0; i < max_exported_args; ++i) {
nir_ssa_def *arg_val = nir_build_load_shared(b, 1, 32, es_vertex_lds_addr, .base = lds_es_arg_0 + 4u * i);
nir_ssa_def *arg_val = nir_load_shared(b, 1, 32, es_vertex_lds_addr, .base = lds_es_arg_0 + 4u * i);
nir_store_var(b, repacked_arg_vars[i], arg_val, 0x1u);
}
}
@ -766,7 +766,7 @@ compact_vertices_after_culling(nir_builder *b,
/* Load the index of the ES threads that will export the current GS thread's vertices */
for (unsigned v = 0; v < 3; ++v) {
nir_ssa_def *vtx_addr = nir_load_var(b, gs_vtxaddr_vars[v]);
nir_ssa_def *exporter_vtx_idx = nir_build_load_shared(b, 1, 8, vtx_addr, .base = lds_es_exporter_tid);
nir_ssa_def *exporter_vtx_idx = nir_load_shared(b, 1, 8, vtx_addr, .base = lds_es_exporter_tid);
exporter_vtx_indices[v] = nir_u2u32(b, exporter_vtx_idx);
}
@ -776,7 +776,7 @@ compact_vertices_after_culling(nir_builder *b,
nir_pop_if(b, if_gs_accepted);
nir_store_var(b, es_accepted_var, es_survived, 0x1u);
nir_store_var(b, gs_accepted_var, nir_bcsel(b, fully_culled, nir_imm_false(b), nir_build_has_input_primitive_amd(b)), 0x1u);
nir_store_var(b, gs_accepted_var, nir_bcsel(b, fully_culled, nir_imm_false(b), nir_has_input_primitive_amd(b)), 0x1u);
}
static void
@ -1081,7 +1081,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
b->cursor = nir_before_cf_list(&impl->body);
nir_ssa_def *es_thread = nir_build_has_input_vertex_amd(b);
nir_ssa_def *es_thread = nir_has_input_vertex_amd(b);
nir_if *if_es_thread = nir_push_if(b, es_thread);
{
/* Initialize the position output variable to zeroes, in case not all VS/TES invocations store the output.
@ -1097,16 +1097,16 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
/* Remember the current thread's shader arguments */
if (b->shader->info.stage == MESA_SHADER_VERTEX) {
nir_store_var(b, repacked_arg_vars[0], nir_build_load_vertex_id_zero_base(b), 0x1u);
nir_store_var(b, repacked_arg_vars[0], nir_load_vertex_id_zero_base(b), 0x1u);
if (uses_instance_id)
nir_store_var(b, repacked_arg_vars[1], nir_build_load_instance_id(b), 0x1u);
nir_store_var(b, repacked_arg_vars[1], nir_load_instance_id(b), 0x1u);
} else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) {
nir_ssa_def *tess_coord = nir_build_load_tess_coord(b);
nir_ssa_def *tess_coord = nir_load_tess_coord(b);
nir_store_var(b, repacked_arg_vars[0], nir_channel(b, tess_coord, 0), 0x1u);
nir_store_var(b, repacked_arg_vars[1], nir_channel(b, tess_coord, 1), 0x1u);
nir_store_var(b, repacked_arg_vars[2], nir_build_load_tess_rel_patch_id_amd(b), 0x1u);
nir_store_var(b, repacked_arg_vars[2], nir_load_tess_rel_patch_id_amd(b), 0x1u);
if (uses_tess_primitive_id)
nir_store_var(b, repacked_arg_vars[3], nir_build_load_primitive_id(b), 0x1u);
nir_store_var(b, repacked_arg_vars[3], nir_load_primitive_id(b), 0x1u);
} else {
unreachable("Should be VS or TES.");
}
@ -1114,7 +1114,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
nir_pop_if(b, if_es_thread);
nir_store_var(b, es_accepted_var, es_thread, 0x1u);
nir_store_var(b, gs_accepted_var, nir_build_has_input_primitive_amd(b), 0x1u);
nir_store_var(b, gs_accepted_var, nir_has_input_primitive_amd(b), 0x1u);
/* Remove all non-position outputs, and put the position output into the variable. */
nir_metadata_preserve(impl, nir_metadata_none);
@ -1128,24 +1128,24 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
* by the following NIR intrinsic.
*/
nir_if *if_cull_en = nir_push_if(b, nir_build_load_cull_any_enabled_amd(b));
nir_if *if_cull_en = nir_push_if(b, nir_load_cull_any_enabled_amd(b));
{
nir_ssa_def *invocation_index = nir_build_load_local_invocation_index(b);
nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
nir_ssa_def *es_vertex_lds_addr = pervertex_lds_addr(b, invocation_index, pervertex_lds_bytes);
/* ES invocations store their vertex data to LDS for GS threads to read. */
if_es_thread = nir_push_if(b, nir_build_has_input_vertex_amd(b));
if_es_thread = nir_push_if(b, nir_has_input_vertex_amd(b));
{
/* Store position components that are relevant to culling in LDS */
nir_ssa_def *pre_cull_pos = nir_load_var(b, position_value_var);
nir_ssa_def *pre_cull_w = nir_channel(b, pre_cull_pos, 3);
nir_build_store_shared(b, pre_cull_w, es_vertex_lds_addr, .base = lds_es_pos_w);
nir_store_shared(b, pre_cull_w, es_vertex_lds_addr, .base = lds_es_pos_w);
nir_ssa_def *pre_cull_x_div_w = nir_fdiv(b, nir_channel(b, pre_cull_pos, 0), pre_cull_w);
nir_ssa_def *pre_cull_y_div_w = nir_fdiv(b, nir_channel(b, pre_cull_pos, 1), pre_cull_w);
nir_build_store_shared(b, nir_vec2(b, pre_cull_x_div_w, pre_cull_y_div_w), es_vertex_lds_addr, .base = lds_es_pos_x);
nir_store_shared(b, nir_vec2(b, pre_cull_x_div_w, pre_cull_y_div_w), es_vertex_lds_addr, .base = lds_es_pos_x);
/* Clear out the ES accepted flag in LDS */
nir_build_store_shared(b, nir_imm_zero(b, 1, 8), es_vertex_lds_addr, .align_mul = 4, .base = lds_es_vertex_accepted);
nir_store_shared(b, nir_imm_zero(b, 1, 8), es_vertex_lds_addr, .align_mul = 4, .base = lds_es_vertex_accepted);
}
nir_pop_if(b, if_es_thread);
@ -1156,7 +1156,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
nir_store_var(b, prim_exp_arg_var, nir_imm_int(b, 1u << 31), 0x1u);
/* GS invocations load the vertex data and perform the culling. */
nir_if *if_gs_thread = nir_push_if(b, nir_build_has_input_primitive_amd(b));
nir_if *if_gs_thread = nir_push_if(b, nir_has_input_primitive_amd(b));
{
/* Load vertex indices from input VGPRs */
nir_ssa_def *vtx_idx[3] = {0};
@ -1169,13 +1169,13 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
/* Load W positions of vertices first because the culling code will use these first */
for (unsigned vtx = 0; vtx < 3; ++vtx) {
vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes);
pos[vtx][3] = nir_build_load_shared(b, 1, 32, vtx_addr[vtx], .base = lds_es_pos_w);
pos[vtx][3] = nir_load_shared(b, 1, 32, vtx_addr[vtx], .base = lds_es_pos_w);
nir_store_var(b, gs_vtxaddr_vars[vtx], vtx_addr[vtx], 0x1u);
}
/* Load the X/W, Y/W positions of vertices */
for (unsigned vtx = 0; vtx < 3; ++vtx) {
nir_ssa_def *xy = nir_build_load_shared(b, 2, 32, vtx_addr[vtx], .base = lds_es_pos_x);
nir_ssa_def *xy = nir_load_shared(b, 2, 32, vtx_addr[vtx], .base = lds_es_pos_x);
pos[vtx][0] = nir_channel(b, xy, 0);
pos[vtx][1] = nir_channel(b, xy, 1);
}
@ -1188,7 +1188,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
{
/* Store the accepted state to LDS for ES threads */
for (unsigned vtx = 0; vtx < 3; ++vtx)
nir_build_store_shared(b, nir_imm_intN_t(b, 0xff, 8), vtx_addr[vtx], .base = lds_es_vertex_accepted, .align_mul = 4u);
nir_store_shared(b, nir_imm_intN_t(b, 0xff, 8), vtx_addr[vtx], .base = lds_es_vertex_accepted, .align_mul = 4u);
}
nir_pop_if(b, if_gs_accepted);
}
@ -1200,9 +1200,9 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
nir_store_var(b, es_accepted_var, nir_imm_bool(b, false), 0x1u);
/* ES invocations load their accepted flag from LDS. */
if_es_thread = nir_push_if(b, nir_build_has_input_vertex_amd(b));
if_es_thread = nir_push_if(b, nir_has_input_vertex_amd(b));
{
nir_ssa_def *accepted = nir_build_load_shared(b, 1, 8u, es_vertex_lds_addr, .base = lds_es_vertex_accepted, .align_mul = 4u);
nir_ssa_def *accepted = nir_load_shared(b, 1, 8u, es_vertex_lds_addr, .base = lds_es_vertex_accepted, .align_mul = 4u);
nir_ssa_def *accepted_bool = nir_ine(b, accepted, nir_imm_intN_t(b, 0, 8));
nir_store_var(b, es_accepted_var, accepted_bool, 0x1u);
}
@ -1217,14 +1217,14 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
nir_ssa_def *es_exporter_tid = rep.repacked_invocation_index;
/* If all vertices are culled, set primitive count to 0 as well. */
nir_ssa_def *num_exported_prims = nir_build_load_workgroup_num_input_primitives_amd(b);
nir_ssa_def *num_exported_prims = nir_load_workgroup_num_input_primitives_amd(b);
nir_ssa_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), num_exported_prims);
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0)));
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b), nir_imm_int(b, 0)));
{
/* Tell the final vertex and primitive count to the HW. */
nir_build_alloc_vertices_and_primitives_amd(b, num_live_vertices_in_workgroup, num_exported_prims);
nir_alloc_vertices_and_primitives_amd(b, num_live_vertices_in_workgroup, num_exported_prims);
}
nir_pop_if(b, if_wave_0);
@ -1238,11 +1238,11 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
nir_push_else(b, if_cull_en);
{
/* When culling is disabled, we do the same as we would without culling. */
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0)));
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b), nir_imm_int(b, 0)));
{
nir_ssa_def *vtx_cnt = nir_build_load_workgroup_num_input_vertices_amd(b);
nir_ssa_def *prim_cnt = nir_build_load_workgroup_num_input_primitives_amd(b);
nir_build_alloc_vertices_and_primitives_amd(b, vtx_cnt, prim_cnt);
nir_ssa_def *vtx_cnt = nir_load_workgroup_num_input_vertices_amd(b);
nir_ssa_def *prim_cnt = nir_load_workgroup_num_input_primitives_amd(b);
nir_alloc_vertices_and_primitives_amd(b, vtx_cnt, prim_cnt);
}
nir_pop_if(b, if_wave_0);
nir_store_var(b, prim_exp_arg_var, emit_ngg_nogs_prim_exp_arg(b, nogs_state), 0x1u);
@ -1268,11 +1268,11 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
if (b->shader->info.stage == MESA_SHADER_VERTEX)
nogs_state->overwrite_args =
nir_build_overwrite_vs_arguments_amd(b,
nir_overwrite_vs_arguments_amd(b,
nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1]));
else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL)
nogs_state->overwrite_args =
nir_build_overwrite_tes_arguments_amd(b,
nir_overwrite_tes_arguments_amd(b,
nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1]),
nir_load_var(b, repacked_arg_vars[2]), nir_load_var(b, repacked_arg_vars[3]));
else
@ -1341,11 +1341,11 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
if (!can_cull) {
/* Allocate export space on wave 0 - confirm to the HW that we want to use all possible space */
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0)));
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b), nir_imm_int(b, 0)));
{
nir_ssa_def *vtx_cnt = nir_build_load_workgroup_num_input_vertices_amd(b);
nir_ssa_def *prim_cnt = nir_build_load_workgroup_num_input_primitives_amd(b);
nir_build_alloc_vertices_and_primitives_amd(b, vtx_cnt, prim_cnt);
nir_ssa_def *vtx_cnt = nir_load_workgroup_num_input_vertices_amd(b);
nir_ssa_def *prim_cnt = nir_load_workgroup_num_input_primitives_amd(b);
nir_alloc_vertices_and_primitives_amd(b, vtx_cnt, prim_cnt);
}
nir_pop_if(b, if_wave_0);
@ -1363,7 +1363,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
}
nir_intrinsic_instr *export_vertex_instr;
nir_ssa_def *es_thread = can_cull ? nir_load_var(b, es_accepted_var) : nir_build_has_input_vertex_amd(b);
nir_ssa_def *es_thread = can_cull ? nir_load_var(b, es_accepted_var) : nir_has_input_vertex_amd(b);
nir_if *if_es_thread = nir_push_if(b, es_thread);
{
@ -1372,7 +1372,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
/* Export all vertex attributes (except primitive ID) */
export_vertex_instr = nir_build_export_vertex_amd(b);
export_vertex_instr = nir_export_vertex_amd(b);
/* Export primitive ID (in case of early primitive export or TES) */
if (state.export_prim_id && (state.early_prim_export || shader->info.stage != MESA_SHADER_VERTEX))
@ -1384,7 +1384,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
if (!state.early_prim_export) {
emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
if (state.export_prim_id && shader->info.stage == MESA_SHADER_VERTEX) {
if_es_thread = nir_push_if(b, can_cull ? es_thread : nir_build_has_input_vertex_amd(b));
if_es_thread = nir_push_if(b, can_cull ? es_thread : nir_has_input_vertex_amd(b));
emit_store_ngg_nogs_es_primitive_id(b);
nir_pop_if(b, if_es_thread);
}
@ -1405,7 +1405,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var);
nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 };
nir_build_store_output(b, pos_val, nir_imm_int(b, 0), .base = VARYING_SLOT_POS, .component = 0, .io_semantics = io_sem);
nir_store_output(b, pos_val, nir_imm_int(b, 0), .base = VARYING_SLOT_POS, .component = 0, .io_semantics = io_sem);
}
nir_metadata_preserve(impl, nir_metadata_none);
@ -1457,7 +1457,7 @@ ngg_gs_out_vertex_addr(nir_builder *b, nir_ssa_def *out_vtx_idx, lower_ngg_gs_st
static nir_ssa_def *
ngg_gs_emit_vertex_addr(nir_builder *b, nir_ssa_def *gs_vtx_idx, lower_ngg_gs_state *s)
{
nir_ssa_def *tid_in_tg = nir_build_load_local_invocation_index(b);
nir_ssa_def *tid_in_tg = nir_load_local_invocation_index(b);
nir_ssa_def *gs_out_vtx_base = nir_imul_imm(b, tid_in_tg, b->shader->info.gs.vertices_out);
nir_ssa_def *out_vtx_idx = nir_iadd_nuw(b, gs_out_vtx_base, gs_vtx_idx);
@ -1480,7 +1480,7 @@ ngg_gs_clear_primflags(nir_builder *b, nir_ssa_def *num_vertices, unsigned strea
nir_push_else(b, if_break);
{
nir_ssa_def *emit_vtx_addr = ngg_gs_emit_vertex_addr(b, current_clear_primflag_idx, s);
nir_build_store_shared(b, zero_u8, emit_vtx_addr, .base = s->lds_offs_primflags + stream);
nir_store_shared(b, zero_u8, emit_vtx_addr, .base = s->lds_offs_primflags + stream);
nir_store_var(b, s->current_clear_primflag_idx_var, nir_iadd_imm_nuw(b, current_clear_primflag_idx, 1), 0x1u);
}
nir_pop_if(b, if_break);
@ -1491,7 +1491,7 @@ ngg_gs_clear_primflags(nir_builder *b, nir_ssa_def *num_vertices, unsigned strea
static void
ngg_gs_shader_query(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg_gs_state *s)
{
nir_if *if_shader_query = nir_push_if(b, nir_build_load_shader_query_enabled_amd(b));
nir_if *if_shader_query = nir_push_if(b, nir_load_shader_query_enabled_amd(b));
nir_ssa_def *num_prims_in_wave = NULL;
/* Calculate the "real" number of emitted primitives from the emitted GS vertices and primitives.
@ -1502,19 +1502,19 @@ ngg_gs_shader_query(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg_gs_st
unsigned gs_vtx_cnt = nir_src_as_uint(intrin->src[0]);
unsigned gs_prm_cnt = nir_src_as_uint(intrin->src[1]);
unsigned total_prm_cnt = gs_vtx_cnt - gs_prm_cnt * (s->num_vertices_per_primitive - 1u);
nir_ssa_def *num_threads = nir_bit_count(b, nir_build_ballot(b, 1, s->wave_size, nir_imm_bool(b, true)));
nir_ssa_def *num_threads = nir_bit_count(b, nir_ballot(b, 1, s->wave_size, nir_imm_bool(b, true)));
num_prims_in_wave = nir_imul_imm(b, num_threads, total_prm_cnt);
} else {
nir_ssa_def *gs_vtx_cnt = intrin->src[0].ssa;
nir_ssa_def *prm_cnt = intrin->src[1].ssa;
if (s->num_vertices_per_primitive > 1)
prm_cnt = nir_iadd_nuw(b, nir_imul_imm(b, prm_cnt, -1u * (s->num_vertices_per_primitive - 1)), gs_vtx_cnt);
num_prims_in_wave = nir_build_reduce(b, prm_cnt, .reduction_op = nir_op_iadd);
num_prims_in_wave = nir_reduce(b, prm_cnt, .reduction_op = nir_op_iadd);
}
/* Store the query result to GDS using an atomic add. */
nir_if *if_first_lane = nir_push_if(b, nir_build_elect(b, 1));
nir_build_gds_atomic_add_amd(b, 32, num_prims_in_wave, nir_imm_int(b, 0), nir_imm_int(b, 0x100));
nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
nir_gds_atomic_add_amd(b, 32, num_prims_in_wave, nir_imm_int(b, 0), nir_imm_int(b, 0x100));
nir_pop_if(b, if_first_lane);
nir_pop_if(b, if_shader_query);
@ -1604,7 +1604,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
if (info->bit_size != 32)
out_val = nir_u2u(b, out_val, info->bit_size);
nir_build_store_shared(b, out_val, gs_emit_vtx_addr, .base = packed_location * 16 + comp * 4);
nir_store_shared(b, out_val, gs_emit_vtx_addr, .base = packed_location * 16 + comp * 4);
/* Clear the variable that holds the output */
nir_store_var(b, s->output_vars[slot][comp], nir_ssa_undef(b, 1, 32), 0x1u);
@ -1625,7 +1625,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
prim_flag = nir_iadd_nuw(b, prim_flag, nir_ishl(b, odd, nir_imm_int(b, 1)));
}
nir_build_store_shared(b, nir_u2u8(b, prim_flag), gs_emit_vtx_addr, .base = s->lds_offs_primflags + stream, .align_mul = 4u);
nir_store_shared(b, nir_u2u8(b, prim_flag), gs_emit_vtx_addr, .base = s->lds_offs_primflags + stream, .align_mul = 4u);
nir_instr_remove(&intrin->instr);
return true;
}
@ -1724,7 +1724,7 @@ ngg_gs_export_primitives(nir_builder *b, nir_ssa_def *max_num_out_prims, nir_ssa
}
nir_ssa_def *arg = emit_pack_ngg_prim_exp_arg(b, s->num_vertices_per_primitive, vtx_indices, is_null_prim, false);
nir_build_export_primitive_amd(b, arg);
nir_export_primitive_amd(b, arg);
nir_pop_if(b, if_prim_export_thread);
}
@ -1740,7 +1740,7 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
* The current thread will export a vertex that was live in another invocation.
* Load the index of the vertex that the current thread will have to export.
*/
nir_ssa_def *exported_vtx_idx = nir_build_load_shared(b, 1, 8, out_vtx_lds_addr, .base = s->lds_offs_primflags + 1);
nir_ssa_def *exported_vtx_idx = nir_load_shared(b, 1, 8, out_vtx_lds_addr, .base = s->lds_offs_primflags + 1);
exported_out_vtx_lds_addr = ngg_gs_out_vertex_addr(b, nir_u2u32(b, exported_vtx_idx), s);
}
@ -1756,12 +1756,12 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
if (info->stream != 0 || info->bit_size == 0)
continue;
nir_ssa_def *load = nir_build_load_shared(b, 1, info->bit_size, exported_out_vtx_lds_addr, .base = packed_location * 16u + comp * 4u, .align_mul = 4u);
nir_build_store_output(b, load, nir_imm_int(b, 0), .base = slot, .component = comp, .io_semantics = io_sem);
nir_ssa_def *load = nir_load_shared(b, 1, info->bit_size, exported_out_vtx_lds_addr, .base = packed_location * 16u + comp * 4u, .align_mul = 4u);
nir_store_output(b, load, nir_imm_int(b, 0), .base = slot, .component = comp, .io_semantics = io_sem);
}
}
nir_build_export_vertex_amd(b);
nir_export_vertex_amd(b);
nir_pop_if(b, if_vtx_export_thread);
}
@ -1779,7 +1779,7 @@ ngg_gs_setup_vertex_compaction(nir_builder *b, nir_ssa_def *vertex_live, nir_ssa
nir_ssa_def *exporter_lds_addr = ngg_gs_out_vertex_addr(b, exporter_tid_in_tg, s);
nir_ssa_def *tid_in_tg_u8 = nir_u2u8(b, tid_in_tg);
nir_build_store_shared(b, tid_in_tg_u8, exporter_lds_addr, .base = s->lds_offs_primflags + 1);
nir_store_shared(b, tid_in_tg_u8, exporter_lds_addr, .base = s->lds_offs_primflags + 1);
}
nir_pop_if(b, if_vertex_live);
}
@ -1791,7 +1791,7 @@ ngg_gs_load_out_vtx_primflag_0(nir_builder *b, nir_ssa_def *tid_in_tg, nir_ssa_d
nir_ssa_def *zero = nir_imm_int(b, 0);
nir_if *if_outvtx_thread = nir_push_if(b, nir_ilt(b, tid_in_tg, max_num_out_vtx));
nir_ssa_def *primflag_0 = nir_build_load_shared(b, 1, 8, vtx_lds_addr, .base = s->lds_offs_primflags, .align_mul = 4u);
nir_ssa_def *primflag_0 = nir_load_shared(b, 1, 8, vtx_lds_addr, .base = s->lds_offs_primflags, .align_mul = 4u);
primflag_0 = nir_u2u32(b, primflag_0);
nir_pop_if(b, if_outvtx_thread);
@ -1801,8 +1801,8 @@ ngg_gs_load_out_vtx_primflag_0(nir_builder *b, nir_ssa_def *tid_in_tg, nir_ssa_d
static void
ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s)
{
nir_ssa_def *tid_in_tg = nir_build_load_local_invocation_index(b);
nir_ssa_def *max_vtxcnt = nir_build_load_workgroup_num_input_vertices_amd(b);
nir_ssa_def *tid_in_tg = nir_load_local_invocation_index(b);
nir_ssa_def *max_vtxcnt = nir_load_workgroup_num_input_vertices_amd(b);
nir_ssa_def *max_prmcnt = max_vtxcnt; /* They are currently practically the same; both RADV and RadeonSI do this. */
nir_ssa_def *out_vtx_lds_addr = ngg_gs_out_vertex_addr(b, tid_in_tg, s);
@ -1810,8 +1810,8 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s)
/* When the output is compile-time known, the GS writes all possible vertices and primitives it can.
* The gs_alloc_req needs to happen on one wave only, otherwise the HW hangs.
*/
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_zero(b, 1, 32)));
nir_build_alloc_vertices_and_primitives_amd(b, max_vtxcnt, max_prmcnt);
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b), nir_imm_zero(b, 1, 32)));
nir_alloc_vertices_and_primitives_amd(b, max_vtxcnt, max_prmcnt);
nir_pop_if(b, if_wave_0);
}
@ -1843,8 +1843,8 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s)
max_prmcnt = nir_bcsel(b, any_output, max_prmcnt, nir_imm_int(b, 0));
/* Allocate export space. We currently don't compact primitives, just use the maximum number. */
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_zero(b, 1, 32)));
nir_build_alloc_vertices_and_primitives_amd(b, workgroup_num_vertices, max_prmcnt);
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_load_subgroup_id(b), nir_imm_zero(b, 1, 32)));
nir_alloc_vertices_and_primitives_amd(b, workgroup_num_vertices, max_prmcnt);
nir_pop_if(b, if_wave_0);
/* Vertex compaction. This makes sure there are no gaps between threads that export vertices. */
@ -1914,7 +1914,7 @@ ac_nir_lower_ngg_gs(nir_shader *shader,
.memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
/* Wrap the GS control flow. */
nir_if *if_gs_thread = nir_push_if(b, nir_build_has_input_primitive_amd(b));
nir_if *if_gs_thread = nir_push_if(b, nir_has_input_primitive_amd(b));
/* Create and initialize output variables */
for (unsigned slot = 0; slot < VARYING_SLOT_MAX; ++slot) {
@ -1968,7 +1968,7 @@ lower_ms_store_output(nir_builder *b,
assert(base == 0);
nir_ssa_def *addr = nir_imm_int(b, 0);
nir_build_store_shared(b, nir_u2u32(b, store_val), addr, .base = s->numprims_lds_addr);
nir_store_shared(b, nir_u2u32(b, store_val), addr, .base = s->numprims_lds_addr);
} else if (io_sem.location == VARYING_SLOT_PRIMITIVE_INDICES) {
/* Contrary to the name, these are not primitive indices, but
* vertex indices for each vertex of the output primitives.
@ -1976,8 +1976,8 @@ lower_ms_store_output(nir_builder *b,
*/
nir_ssa_def *offset_src = nir_get_io_offset_src(intrin)->ssa;
nir_build_store_shared(b, nir_u2u8(b, store_val), offset_src,
.base = s->prim_vtx_indices_addr + base);
nir_store_shared(b, nir_u2u8(b, store_val), offset_src,
.base = s->prim_vtx_indices_addr + base);
} else {
unreachable("Invalid mesh shader output");
}
@ -2002,10 +2002,10 @@ lower_ms_load_output(nir_builder *b,
assert(base == 0);
nir_ssa_def *addr = nir_imm_int(b, 0);
return nir_build_load_shared(b, 1, 32, addr, .base = s->numprims_lds_addr);
return nir_load_shared(b, 1, 32, addr, .base = s->numprims_lds_addr);
} else if (io_sem.location == VARYING_SLOT_PRIMITIVE_INDICES) {
nir_ssa_def *offset_src = nir_get_io_offset_src(intrin)->ssa;
nir_ssa_def *index = nir_build_load_shared(b, 1, 8, offset_src,
nir_ssa_def *index = nir_load_shared(b, 1, 8, offset_src,
.base = s->prim_vtx_indices_addr + base);
return nir_u2u(b, index, intrin->dest.ssa.bit_size);
}
@ -2117,9 +2117,9 @@ ms_store_arrayed_output_intrin(nir_builder *b,
unsigned const_off = base_shared_addr + component_offset * 4;
nir_build_store_shared(b, store_val, addr, .base = const_off,
.write_mask = write_mask, .align_mul = 16,
.align_offset = const_off % 16);
nir_store_shared(b, store_val, addr, .base = const_off,
.write_mask = write_mask, .align_mul = 16,
.align_offset = const_off % 16);
}
static nir_ssa_def *
@ -2139,9 +2139,9 @@ ms_load_arrayed_output(nir_builder *b,
nir_ssa_def *base_addr_off = nir_imul_imm(b, base_offset, 16);
nir_ssa_def *addr = nir_iadd_nuw(b, base_addr, base_addr_off);
return nir_build_load_shared(b, num_components, load_bit_size, addr, .align_mul = 16,
.align_offset = component_addr_off % 16,
.base = base_shared_addr + component_addr_off);
return nir_load_shared(b, num_components, load_bit_size, addr, .align_mul = 16,
.align_offset = component_addr_off % 16,
.base = base_shared_addr + component_addr_off);
}
static nir_ssa_def *
@ -2304,8 +2304,8 @@ ms_emit_arrayed_outputs(nir_builder *b,
ms_load_arrayed_output(b, invocation_index, zero, driver_location, start_comp,
num_components, 32, num_arrayed_outputs, lds_base_addr);
nir_build_store_output(b, load, nir_imm_int(b, 0), .base = slot, .component = start_comp,
.io_semantics = io_sem);
nir_store_output(b, load, nir_imm_int(b, 0), .base = slot, .component = start_comp,
.io_semantics = io_sem);
}
}
}
@ -2342,27 +2342,27 @@ emit_ms_finale(nir_shader *shader, lower_ngg_ms_state *s)
nir_ssa_def *loaded_num_prm;
nir_ssa_def *zero = nir_imm_int(b, 0);
nir_ssa_def *dont_care = nir_ssa_undef(b, 1, 32);
nir_if *if_elected = nir_push_if(b, nir_build_elect(b, 1));
nir_if *if_elected = nir_push_if(b, nir_elect(b, 1));
{
loaded_num_prm = nir_build_load_shared(b, 1, 32, zero, .base = s->numprims_lds_addr);
loaded_num_prm = nir_load_shared(b, 1, 32, zero, .base = s->numprims_lds_addr);
}
nir_pop_if(b, if_elected);
loaded_num_prm = nir_if_phi(b, loaded_num_prm, dont_care);
nir_ssa_def *num_prm = nir_build_read_first_invocation(b, loaded_num_prm);
nir_ssa_def *num_prm = nir_read_first_invocation(b, loaded_num_prm);
nir_ssa_def *num_vtx = nir_imm_int(b, shader->info.mesh.max_vertices_out);
/* If the shader doesn't actually create any primitives, don't allocate any output. */
num_vtx = nir_bcsel(b, nir_ieq_imm(b, num_prm, 0), nir_imm_int(b, 0), num_vtx);
/* Emit GS_ALLOC_REQ on Wave 0 to let the HW know the output size. */
nir_ssa_def *wave_id = nir_build_load_subgroup_id(b);
nir_ssa_def *wave_id = nir_load_subgroup_id(b);
nir_if *if_wave_0 = nir_push_if(b, nir_ieq_imm(b, wave_id, 0));
{
nir_build_alloc_vertices_and_primitives_amd(b, num_vtx, num_prm);
nir_alloc_vertices_and_primitives_amd(b, num_vtx, num_prm);
}
nir_pop_if(b, if_wave_0);
nir_ssa_def *invocation_index = nir_build_load_local_invocation_index(b);
nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
/* Load vertex/primitive attributes from shared memory and
* emit store_output intrinsics for them.
@ -2379,7 +2379,7 @@ emit_ms_finale(nir_shader *shader, lower_ngg_ms_state *s)
/* All per-vertex attributes. */
ms_emit_arrayed_outputs(b, invocation_index, s->per_vertex_outputs,
s->num_per_vertex_outputs, s->vertex_attr_lds_addr, s);
nir_build_export_vertex_amd(b);
nir_export_vertex_amd(b);
}
nir_pop_if(b, if_has_output_vertex);
@ -2393,14 +2393,14 @@ emit_ms_finale(nir_shader *shader, lower_ngg_ms_state *s)
/* Primitive connectivity data: describes which vertices the primitive uses. */
nir_ssa_def *prim_idx_addr = nir_imul_imm(b, invocation_index, s->vertices_per_prim);
nir_ssa_def *indices_loaded = nir_build_load_shared(b, s->vertices_per_prim, 8, prim_idx_addr, .base = s->prim_vtx_indices_addr);
nir_ssa_def *indices_loaded = nir_load_shared(b, s->vertices_per_prim, 8, prim_idx_addr, .base = s->prim_vtx_indices_addr);
nir_ssa_def *indices[3];
indices[0] = nir_u2u32(b, nir_channel(b, indices_loaded, 0));
indices[1] = s->vertices_per_prim > 1 ? nir_u2u32(b, nir_channel(b, indices_loaded, 1)) : NULL;
indices[2] = s->vertices_per_prim > 2 ? nir_u2u32(b, nir_channel(b, indices_loaded, 2)) : NULL;
nir_ssa_def *prim_exp_arg = emit_pack_ngg_prim_exp_arg(b, s->vertices_per_prim, indices, NULL, false);
nir_build_export_primitive_amd(b, prim_exp_arg);
nir_export_primitive_amd(b, prim_exp_arg);
}
nir_pop_if(b, if_has_output_primitive);
}
@ -2472,7 +2472,7 @@ ac_nir_lower_ngg_ms(nir_shader *shader,
unsigned num_ms_invocations = b->shader->info.workgroup_size[0] *
b->shader->info.workgroup_size[1] *
b->shader->info.workgroup_size[2];
nir_ssa_def *invocation_index = nir_build_load_local_invocation_index(b);
nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
nir_ssa_def *has_ms_invocation = nir_ult(b, invocation_index, nir_imm_int(b, num_ms_invocations));
nir_if *if_has_ms_invocation = nir_push_if(b, has_ms_invocation);
nir_cf_reinsert(&extracted, b->cursor);

Changed file 3 of 3:

@ -215,15 +215,15 @@ lower_ls_output_store(nir_builder *b,
b->cursor = nir_before_instr(instr);
nir_ssa_def *vertex_idx = nir_build_load_local_invocation_index(b);
nir_ssa_def *vertex_idx = nir_load_local_invocation_index(b);
nir_ssa_def *base_off_var = nir_imul_imm(b, vertex_idx, st->tcs_num_reserved_inputs * 16u);
nir_ssa_def *io_off = nir_build_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u);
unsigned write_mask = nir_intrinsic_write_mask(intrin);
nir_ssa_def *off = nir_iadd_nuw(b, base_off_var, io_off);
nir_build_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
/* NOTE: don't remove the store_output intrinsic on GFX9+ when tcs_in_out_eq,
* it will be used by same-invocation TCS input loads.
@ -269,8 +269,8 @@ hs_per_vertex_input_lds_offset(nir_builder *b,
nir_intrinsic_instr *instr)
{
unsigned tcs_in_vertex_stride = st->tcs_num_reserved_inputs * 16u;
nir_ssa_def *tcs_in_vtxcnt = nir_build_load_patch_vertices_in(b);
nir_ssa_def *rel_patch_id = nir_build_load_tess_rel_patch_id_amd(b);
nir_ssa_def *tcs_in_vtxcnt = nir_load_patch_vertices_in(b);
nir_ssa_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
nir_ssa_def *tcs_in_patch_stride = nir_imul_imm(b, tcs_in_vtxcnt, tcs_in_vertex_stride);
nir_ssa_def *tcs_in_current_patch_offset = nir_imul(b, rel_patch_id, tcs_in_patch_stride);
@ -296,8 +296,8 @@ hs_output_lds_offset(nir_builder *b,
unsigned pervertex_output_patch_size = b->shader->info.tess.tcs_vertices_out * output_vertex_size;
unsigned output_patch_stride = pervertex_output_patch_size + st->tcs_num_reserved_patch_outputs * 16u;
nir_ssa_def *tcs_in_vtxcnt = nir_build_load_patch_vertices_in(b);
nir_ssa_def *tcs_num_patches = nir_build_load_tcs_num_patches_amd(b);
nir_ssa_def *tcs_in_vtxcnt = nir_load_patch_vertices_in(b);
nir_ssa_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
nir_ssa_def *input_patch_size = nir_imul_imm(b, tcs_in_vtxcnt, st->tcs_num_reserved_inputs * 16u);
nir_ssa_def *output_patch0_offset = nir_imul(b, input_patch_size, tcs_num_patches);
@ -305,7 +305,7 @@ hs_output_lds_offset(nir_builder *b,
? nir_build_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u)
: nir_imm_int(b, 0);
nir_ssa_def *rel_patch_id = nir_build_load_tess_rel_patch_id_amd(b);
nir_ssa_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
nir_ssa_def *patch_offset = nir_imul_imm(b, rel_patch_id, output_patch_stride);
nir_ssa_def *output_patch_offset = nir_iadd_nuw(b, patch_offset, output_patch0_offset);
@ -328,13 +328,13 @@ hs_per_vertex_output_vmem_offset(nir_builder *b,
{
nir_ssa_def *out_vertices_per_patch = b->shader->info.stage == MESA_SHADER_TESS_CTRL
? nir_imm_int(b, b->shader->info.tess.tcs_vertices_out)
: nir_build_load_patch_vertices_in(b);
: nir_load_patch_vertices_in(b);
nir_ssa_def *tcs_num_patches = nir_build_load_tcs_num_patches_amd(b);
nir_ssa_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
nir_ssa_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u));
nir_ssa_def *io_offset = nir_build_calc_io_offset(b, intrin, attr_stride, 4u);
nir_ssa_def *rel_patch_id = nir_build_load_tess_rel_patch_id_amd(b);
nir_ssa_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
nir_ssa_def *patch_offset = nir_imul(b, rel_patch_id, nir_imul_imm(b, out_vertices_per_patch, 16u));
nir_ssa_def *vertex_index = nir_ssa_for_src(b, *nir_get_io_arrayed_index_src(intrin), 1);
@ -351,9 +351,9 @@ hs_per_patch_output_vmem_offset(nir_builder *b,
{
nir_ssa_def *out_vertices_per_patch = b->shader->info.stage == MESA_SHADER_TESS_CTRL
? nir_imm_int(b, b->shader->info.tess.tcs_vertices_out)
: nir_build_load_patch_vertices_in(b);
: nir_load_patch_vertices_in(b);
nir_ssa_def *tcs_num_patches = nir_build_load_tcs_num_patches_amd(b);
nir_ssa_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
nir_ssa_def *per_vertex_output_patch_size = nir_imul_imm(b, out_vertices_per_patch, st->tcs_num_reserved_outputs * 16u);
nir_ssa_def *per_patch_data_offset = nir_imul(b, tcs_num_patches, per_vertex_output_patch_size);
@ -364,7 +364,7 @@ hs_per_patch_output_vmem_offset(nir_builder *b,
if (const_base_offset)
off = nir_iadd_nuw(b, off, nir_imul_imm(b, tcs_num_patches, const_base_offset));
nir_ssa_def *rel_patch_id = nir_build_load_tess_rel_patch_id_amd(b);
nir_ssa_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
nir_ssa_def *patch_offset = nir_imul_imm(b, rel_patch_id, 16u);
off = nir_iadd_nuw(b, off, per_patch_data_offset);
return nir_iadd_nuw(b, off, patch_offset);
@ -379,8 +379,8 @@ lower_hs_per_vertex_input_load(nir_builder *b,
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
nir_ssa_def *off = hs_per_vertex_input_lds_offset(b, st, intrin);
return nir_build_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
}
static void
@ -404,9 +404,9 @@ lower_hs_output_store(nir_builder *b,
? hs_per_vertex_output_vmem_offset(b, st, intrin)
: hs_per_patch_output_vmem_offset(b, st, intrin, 0);
nir_ssa_def *hs_ring_tess_offchip = nir_build_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_build_load_ring_tess_offchip_offset_amd(b);
nir_build_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset, .write_mask = write_mask, .memory_modes = nir_var_shader_out);
nir_ssa_def *hs_ring_tess_offchip = nir_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
nir_store_buffer_amd(b, store_val, hs_ring_tess_offchip, vmem_off, offchip_offset, .write_mask = write_mask, .memory_modes = nir_var_shader_out);
}
if (write_to_lds) {
@ -417,8 +417,8 @@ lower_hs_output_store(nir_builder *b,
st->tcs_tess_lvl_out_loc = nir_intrinsic_base(intrin) * 16u;
nir_ssa_def *lds_off = hs_output_lds_offset(b, st, intrin);
nir_build_store_shared(b, store_val, lds_off, .write_mask = write_mask,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
nir_store_shared(b, store_val, lds_off, .write_mask = write_mask,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
}
}
@ -428,8 +428,8 @@ lower_hs_output_load(nir_builder *b,
lower_tess_io_state *st)
{
nir_ssa_def *off = hs_output_lds_offset(b, st, intrin);
return nir_build_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
}
static void
@ -507,27 +507,27 @@ hs_emit_write_tess_factors(nir_shader *shader,
nir_scoped_barrier(b, .execution_scope=NIR_SCOPE_WORKGROUP, .memory_scope=NIR_SCOPE_WORKGROUP,
.memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_shader_out|nir_var_mem_shared);
nir_ssa_def *invocation_id = nir_build_load_invocation_id(b);
nir_ssa_def *invocation_id = nir_load_invocation_id(b);
/* Only the 1st invocation of each patch needs to do this. */
nir_if *invocation_id_zero = nir_push_if(b, nir_ieq_imm(b, invocation_id, 0));
/* The descriptor where tess factors have to be stored by the shader. */
nir_ssa_def *tessfactor_ring = nir_build_load_ring_tess_factors_amd(b);
nir_ssa_def *tessfactor_ring = nir_load_ring_tess_factors_amd(b);
/* Base LDS address of per-patch outputs in the current patch. */
nir_ssa_def *lds_base = hs_output_lds_offset(b, st, NULL);
/* Load all tessellation factors (aka. tess levels) from LDS. */
nir_ssa_def *tessfactors_outer = nir_build_load_shared(b, outer_comps, 32, lds_base, .base = st->tcs_tess_lvl_out_loc,
.align_mul = 16u, .align_offset = st->tcs_tess_lvl_out_loc % 16u);
nir_ssa_def *tessfactors_outer = nir_load_shared(b, outer_comps, 32, lds_base, .base = st->tcs_tess_lvl_out_loc,
.align_mul = 16u, .align_offset = st->tcs_tess_lvl_out_loc % 16u);
nir_ssa_def *tessfactors_inner = inner_comps
? nir_build_load_shared(b, inner_comps, 32, lds_base, .base = st->tcs_tess_lvl_in_loc,
.align_mul = 16u, .align_offset = st->tcs_tess_lvl_in_loc % 16u)
? nir_load_shared(b, inner_comps, 32, lds_base, .base = st->tcs_tess_lvl_in_loc,
.align_mul = 16u, .align_offset = st->tcs_tess_lvl_in_loc % 16u)
: NULL;
nir_ssa_def *rel_patch_id = nir_build_load_tess_rel_patch_id_amd(b);
nir_ssa_def *tess_factors_base = nir_build_load_ring_tess_factors_offset_amd(b);
nir_ssa_def *rel_patch_id = nir_load_tess_rel_patch_id_amd(b);
nir_ssa_def *tess_factors_base = nir_load_ring_tess_factors_offset_amd(b);
nir_ssa_def *tess_factors_offset = nir_imul_imm(b, rel_patch_id, (inner_comps + outer_comps) * 4u);
unsigned tess_factors_const_offset = 0;
@ -535,7 +535,7 @@ hs_emit_write_tess_factors(nir_shader *shader,
/* Store the dynamic HS control word. */
nir_if *rel_patch_id_zero = nir_push_if(b, nir_ieq_imm(b, rel_patch_id, 0));
nir_ssa_def *ctrlw = nir_imm_int(b, 0x80000000u);
nir_build_store_buffer_amd(b, ctrlw, tessfactor_ring, nir_imm_zero(b, 1, 32), tess_factors_base);
nir_store_buffer_amd(b, ctrlw, tessfactor_ring, nir_imm_zero(b, 1, 32), tess_factors_base);
tess_factors_const_offset += 4;
nir_pop_if(b, rel_patch_id_zero);
}
@ -544,27 +544,27 @@ hs_emit_write_tess_factors(nir_shader *shader,
if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
/* LINES reversal */
nir_ssa_def *t = nir_vec2(b, nir_channel(b, tessfactors_outer, 1), nir_channel(b, tessfactors_outer, 0));
nir_build_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
} else if (shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
nir_ssa_def *t = nir_vec4(b, nir_channel(b, tessfactors_outer, 0), nir_channel(b, tessfactors_outer, 1),
nir_channel(b, tessfactors_outer, 2), nir_channel(b, tessfactors_inner, 0));
nir_build_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
nir_store_buffer_amd(b, t, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
} else {
nir_build_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
nir_build_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset + 4u * outer_comps);
nir_store_buffer_amd(b, tessfactors_outer, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset);
nir_store_buffer_amd(b, tessfactors_inner, tessfactor_ring, tess_factors_offset, tess_factors_base, .base = tess_factors_const_offset + 4u * outer_comps);
}
if (st->tes_reads_tessfactors) {
/* Store to offchip for TES to read - only if TES actually reads them */
nir_ssa_def *hs_ring_tess_offchip = nir_build_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_build_load_ring_tess_offchip_offset_amd(b);
nir_ssa_def *hs_ring_tess_offchip = nir_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
nir_ssa_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_out_loc);
nir_build_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset, .memory_modes = nir_var_shader_out);
nir_store_buffer_amd(b, tessfactors_outer, hs_ring_tess_offchip, vmem_off_outer, offchip_offset, .memory_modes = nir_var_shader_out);
if (inner_comps) {
nir_ssa_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, NULL, st->tcs_tess_lvl_in_loc);
nir_build_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset, .memory_modes = nir_var_shader_out);
nir_store_buffer_amd(b, tessfactors_inner, hs_ring_tess_offchip, vmem_off_inner, offchip_offset, .memory_modes = nir_var_shader_out);
}
}
@ -581,13 +581,13 @@ lower_tes_input_load(nir_builder *b,
lower_tess_io_state *st = (lower_tess_io_state *) state;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
nir_ssa_def *offchip_ring = nir_build_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_build_load_ring_tess_offchip_offset_amd(b);
nir_ssa_def *offchip_ring = nir_load_ring_tess_offchip_amd(b);
nir_ssa_def *offchip_offset = nir_load_ring_tess_offchip_offset_amd(b);
nir_ssa_def *off = intrin->intrinsic == nir_intrinsic_load_per_vertex_input
? hs_per_vertex_output_vmem_offset(b, st, intrin)
: hs_per_patch_output_vmem_offset(b, st, intrin, 0);
return nir_build_load_buffer_amd(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, offchip_ring, off, offchip_offset);
return nir_load_buffer_amd(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, offchip_ring, off, offchip_offset);
}
static bool