radeonsi: don't execute LDS stores for TCS outputs that are never read
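This is a per-component version of the previous mechanism: instead of skipping LDS stores for a whole category of outputs (per-vertex, per-patch, or tess factors) via the reads_*_outputs flags, each output component is now tracked in output_readmask and its LDS store is skipped when that component is never read.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6340>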
This commit is contained in:
parent
08ee72100f
commit
562b8c1a47
|
@ -328,6 +328,7 @@ struct si_shader_info {
|
||||||
ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
|
ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
|
||||||
ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
|
ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
|
||||||
ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
|
ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
|
||||||
|
ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
|
||||||
ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
|
ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
|
||||||
|
|
||||||
ubyte color_interpolate[2];
|
ubyte color_interpolate[2];
|
||||||
|
@ -342,13 +343,6 @@ struct si_shader_info {
|
||||||
|
|
||||||
uint num_memory_instructions; /**< sampler, buffer, and image instructions */
|
uint num_memory_instructions; /**< sampler, buffer, and image instructions */
|
||||||
|
|
||||||
/**
|
|
||||||
* If a tessellation control shader reads outputs, this describes which ones.
|
|
||||||
*/
|
|
||||||
bool reads_pervertex_outputs;
|
|
||||||
bool reads_perpatch_outputs;
|
|
||||||
bool reads_tessfactor_outputs;
|
|
||||||
|
|
||||||
ubyte colors_read; /**< which color components are read by the FS */
|
ubyte colors_read; /**< which color components are read by the FS */
|
||||||
ubyte colors_written;
|
ubyte colors_written;
|
||||||
bool reads_samplemask; /**< does fragment shader read sample mask? */
|
bool reads_samplemask; /**< does fragment shader read sample mask? */
|
||||||
|
|
|
@ -518,7 +518,6 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_
|
||||||
LLVMValueRef dw_addr, stride;
|
LLVMValueRef dw_addr, stride;
|
||||||
LLVMValueRef buffer, base, addr;
|
LLVMValueRef buffer, base, addr;
|
||||||
LLVMValueRef values[8];
|
LLVMValueRef values[8];
|
||||||
bool skip_lds_store;
|
|
||||||
bool is_tess_factor = false, is_tess_inner = false;
|
bool is_tess_factor = false, is_tess_inner = false;
|
||||||
|
|
||||||
driver_location = driver_location / 4;
|
driver_location = driver_location / 4;
|
||||||
|
@ -541,23 +540,16 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_
|
||||||
dw_addr = get_tcs_out_current_patch_offset(ctx);
|
dw_addr = get_tcs_out_current_patch_offset(ctx);
|
||||||
dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
|
dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
|
||||||
name, index);
|
name, index);
|
||||||
|
|
||||||
skip_lds_store = !info->reads_pervertex_outputs;
|
|
||||||
} else {
|
} else {
|
||||||
dw_addr = get_tcs_out_current_patch_data_offset(ctx);
|
dw_addr = get_tcs_out_current_patch_data_offset(ctx);
|
||||||
dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
|
dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
|
||||||
name, index);
|
name, index);
|
||||||
|
|
||||||
skip_lds_store = !info->reads_perpatch_outputs;
|
|
||||||
|
|
||||||
if (is_const && const_index == 0) {
|
if (is_const && const_index == 0) {
|
||||||
int name = info->output_semantic_name[driver_location];
|
int name = info->output_semantic_name[driver_location];
|
||||||
|
|
||||||
/* Always write tess factors into LDS for the TCS epilog. */
|
/* Always write tess factors into LDS for the TCS epilog. */
|
||||||
if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) {
|
if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) {
|
||||||
/* The epilog doesn't read LDS if invocation 0 defines tess factors. */
|
|
||||||
skip_lds_store = !info->reads_tessfactor_outputs &&
|
|
||||||
ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
|
|
||||||
is_tess_factor = true;
|
is_tess_factor = true;
|
||||||
is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
|
is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
|
||||||
}
|
}
|
||||||
|
@ -585,7 +577,10 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Skip LDS stores if there is no LDS read of this output. */
|
/* Skip LDS stores if there is no LDS read of this output. */
|
||||||
if (!skip_lds_store)
|
if (info->output_readmask[driver_location + chan / 4] & (1 << (chan % 4)) ||
|
||||||
|
/* The epilog reads LDS if invocation 0 doesn't define tess factors. */
|
||||||
|
(is_tess_factor &&
|
||||||
|
!ctx->shader->selector->info.tessfactors_are_def_in_all_invocs))
|
||||||
lshs_lds_store(ctx, chan, dw_addr, value);
|
lshs_lds_store(ctx, chan, dw_addr, value);
|
||||||
|
|
||||||
value = ac_to_integer(&ctx->ac, value);
|
value = ac_to_integer(&ctx->ac, value);
|
||||||
|
|
|
@ -64,16 +64,18 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned mask, bit_size;
|
unsigned mask, bit_size;
|
||||||
bool dual_slot;
|
bool dual_slot, is_output_load;
|
||||||
|
|
||||||
if (nir_intrinsic_infos[intr->intrinsic].index_map[NIR_INTRINSIC_WRMASK] > 0) {
|
if (nir_intrinsic_infos[intr->intrinsic].index_map[NIR_INTRINSIC_WRMASK] > 0) {
|
||||||
mask = nir_intrinsic_write_mask(intr); /* store */
|
mask = nir_intrinsic_write_mask(intr); /* store */
|
||||||
bit_size = nir_src_bit_size(intr->src[0]);
|
bit_size = nir_src_bit_size(intr->src[0]);
|
||||||
dual_slot = bit_size == 64 && nir_src_num_components(intr->src[0]) >= 3;
|
dual_slot = bit_size == 64 && nir_src_num_components(intr->src[0]) >= 3;
|
||||||
|
is_output_load = false;
|
||||||
} else {
|
} else {
|
||||||
mask = nir_ssa_def_components_read(&intr->dest.ssa); /* load */
|
mask = nir_ssa_def_components_read(&intr->dest.ssa); /* load */
|
||||||
bit_size = intr->dest.ssa.bit_size;
|
bit_size = intr->dest.ssa.bit_size;
|
||||||
dual_slot = bit_size == 64 && intr->dest.ssa.num_components >= 3;
|
dual_slot = bit_size == 64 && intr->dest.ssa.num_components >= 3;
|
||||||
|
is_output_load = !is_input;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convert the 64-bit component mask to a 32-bit component mask. */
|
/* Convert the 64-bit component mask to a 32-bit component mask. */
|
||||||
|
@ -152,7 +154,15 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
|
||||||
info->output_semantic_name[loc] = name;
|
info->output_semantic_name[loc] = name;
|
||||||
info->output_semantic_index[loc] = index + i;
|
info->output_semantic_index[loc] = index + i;
|
||||||
|
|
||||||
if (slot_mask) {
|
if (is_output_load) {
|
||||||
|
/* Output loads have only a few things that we need to track. */
|
||||||
|
info->output_readmask[loc] |= slot_mask;
|
||||||
|
|
||||||
|
if (info->processor == PIPE_SHADER_FRAGMENT &&
|
||||||
|
nir_intrinsic_io_semantics(intr).fb_fetch_output)
|
||||||
|
info->uses_fbfetch = true;
|
||||||
|
} else if (slot_mask) {
|
||||||
|
/* Output stores. */
|
||||||
if (info->processor == PIPE_SHADER_GEOMETRY) {
|
if (info->processor == PIPE_SHADER_GEOMETRY) {
|
||||||
unsigned gs_streams = (uint32_t)nir_intrinsic_io_semantics(intr).gs_streams <<
|
unsigned gs_streams = (uint32_t)nir_intrinsic_io_semantics(intr).gs_streams <<
|
||||||
(nir_intrinsic_component(intr) * 2);
|
(nir_intrinsic_component(intr) * 2);
|
||||||
|
@ -418,28 +428,12 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info
|
||||||
case nir_intrinsic_load_interpolated_input:
|
case nir_intrinsic_load_interpolated_input:
|
||||||
scan_io_usage(info, intr, true);
|
scan_io_usage(info, intr, true);
|
||||||
break;
|
break;
|
||||||
|
case nir_intrinsic_load_output:
|
||||||
|
case nir_intrinsic_load_per_vertex_output:
|
||||||
case nir_intrinsic_store_output:
|
case nir_intrinsic_store_output:
|
||||||
case nir_intrinsic_store_per_vertex_output:
|
case nir_intrinsic_store_per_vertex_output:
|
||||||
scan_io_usage(info, intr, false);
|
scan_io_usage(info, intr, false);
|
||||||
break;
|
break;
|
||||||
case nir_intrinsic_load_output: {
|
|
||||||
unsigned location = nir_intrinsic_io_semantics(intr).location;
|
|
||||||
|
|
||||||
if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
|
||||||
if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
|
|
||||||
location == VARYING_SLOT_TESS_LEVEL_OUTER)
|
|
||||||
info->reads_tessfactor_outputs = true;
|
|
||||||
else
|
|
||||||
info->reads_perpatch_outputs = true;
|
|
||||||
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
|
|
||||||
if (nir_intrinsic_io_semantics(intr).fb_fetch_output)
|
|
||||||
info->uses_fbfetch = true;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case nir_intrinsic_load_per_vertex_output:
|
|
||||||
info->reads_pervertex_outputs = true;
|
|
||||||
break;
|
|
||||||
case nir_intrinsic_load_deref:
|
case nir_intrinsic_load_deref:
|
||||||
case nir_intrinsic_store_deref:
|
case nir_intrinsic_store_deref:
|
||||||
case nir_intrinsic_interp_deref_at_centroid:
|
case nir_intrinsic_interp_deref_at_centroid:
|
||||||
|
@ -576,6 +570,10 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Trim output read masks based on write masks. */
|
||||||
|
for (unsigned i = 0; i < info->num_outputs; i++)
|
||||||
|
info->output_readmask[i] &= info->output_usagemask[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
static void si_nir_opts(struct nir_shader *nir, bool first)
|
static void si_nir_opts(struct nir_shader *nir, bool first)
|
||||||
|
|
Loading…
Reference in New Issue