radeonsi: optimize out the loop in si_get_ps_input_cntl

Use a lookup table (output_semantic_to_slot) that maps each output semantic
to its slot index instead of linearly searching output_semantic[] for it.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6340>
This commit is contained in:
Marek Olšák 2020-08-15 04:39:30 -04:00
parent 6ecb8b6899
commit 98e866c669
3 changed files with 37 additions and 33 deletions

View File

@ -327,6 +327,7 @@ struct si_shader_info {
ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS];
char output_semantic_to_slot[VARYING_SLOT_TESS_MAX];
ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];

View File

@ -143,12 +143,14 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
} else {
/* Outputs. */
assert(driver_location + num_slots <= ARRAY_SIZE(info->output_usagemask));
assert(semantic + num_slots < ARRAY_SIZE(info->output_semantic_to_slot));
for (unsigned i = 0; i < num_slots; i++) {
unsigned loc = driver_location + i;
unsigned slot_mask = (dual_slot && i % 2 ? mask >> 4 : mask) & 0xf;
info->output_semantic[loc] = semantic + i;
info->output_semantic_to_slot[semantic + i] = loc;
if (is_output_load) {
/* Output loads have only a few things that we need to track. */
@ -556,6 +558,8 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
info->tessfactors_are_def_in_all_invocs = ac_are_tessfactors_def_in_all_invocs(nir);
}
memset(info->output_semantic_to_slot, -1, sizeof(info->output_semantic_to_slot));
func = (struct nir_function *)exec_list_get_head_const(&nir->functions);
nir_foreach_block (block, func->impl) {
nir_foreach_instr (instr, block)

View File

@ -3171,7 +3171,7 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *
unsigned semantic, enum glsl_interp_mode interpolate)
{
struct si_shader_info *vsinfo = &vs->selector->info;
unsigned j, offset, ps_input_cntl = 0;
unsigned offset, ps_input_cntl = 0;
if (interpolate == INTERP_MODE_FLAT ||
(interpolate == INTERP_MODE_COLOR && sctx->flatshade) ||
@ -3184,43 +3184,42 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, struct si_shader *
ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
}
/* TODO: This search can be removed if we add a lookup table from semantic to index. */
for (j = 0; j < vsinfo->num_outputs; j++) {
if (semantic == vsinfo->output_semantic[j]) {
offset = vs->info.vs_output_param_offset[j];
int vs_slot = vsinfo->output_semantic_to_slot[semantic];
if (vs_slot >= 0) {
offset = vs->info.vs_output_param_offset[vs_slot];
if (offset <= AC_EXP_PARAM_OFFSET_31) {
/* The input is loaded from parameter memory. */
ps_input_cntl |= S_028644_OFFSET(offset);
} else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
if (offset == AC_EXP_PARAM_UNDEFINED) {
/* This can happen with depth-only rendering. */
offset = 0;
} else {
/* The input is a DEFAULT_VAL constant. */
assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
}
ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
if (offset <= AC_EXP_PARAM_OFFSET_31) {
/* The input is loaded from parameter memory. */
ps_input_cntl |= S_028644_OFFSET(offset);
} else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
if (offset == AC_EXP_PARAM_UNDEFINED) {
/* This can happen with depth-only rendering. */
offset = 0;
} else {
/* The input is a DEFAULT_VAL constant. */
assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
}
break;
ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
}
} else {
/* VS output not found. */
if (semantic == VARYING_SLOT_PRIMITIVE_ID) {
/* PrimID is written after the last output when HW VS is used. */
ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
} else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
/* No corresponding output found, load defaults into input.
* Don't set any other bits.
* (FLAT_SHADE=1 completely changes behavior) */
ps_input_cntl = S_028644_OFFSET(0x20);
/* D3D 9 behaviour. GL is undefined */
if (semantic == VARYING_SLOT_COL0)
ps_input_cntl |= S_028644_DEFAULT_VAL(3);
}
}
if (j == vsinfo->num_outputs && semantic == VARYING_SLOT_PRIMITIVE_ID)
/* PrimID is written after the last output when HW VS is used. */
ps_input_cntl |= S_028644_OFFSET(vs->info.vs_output_param_offset[vsinfo->num_outputs]);
else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(ps_input_cntl)) {
/* No corresponding output found, load defaults into input.
* Don't set any other bits.
* (FLAT_SHADE=1 completely changes behavior) */
ps_input_cntl = S_028644_OFFSET(0x20);
/* D3D 9 behaviour. GL is undefined */
if (semantic == VARYING_SLOT_COL0)
ps_input_cntl |= S_028644_DEFAULT_VAL(3);
}
return ps_input_cntl;
}