freedreno/ir3: Replace our custom vec4 UBO intrinsic with the shared lowering.

This leaves us with fewer comparisons to optimize back out of the
shaders, and it reduces backend code.

total instructions in shared programs: 11547270 -> 7219930 (-37.48%)
total full in shared programs: 334268 -> 319602 (-4.39%)

Reviewed-by: Kristian H. Kristensen <hoegsberg@google.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6378>
Eric Anholt 2020-08-18 15:45:02 -07:00
parent 73616598bd
commit 2b25240993
5 changed files with 7 additions and 91 deletions
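
A note on units before the diffs: NIR's plain load_ubo takes a byte offset, while the ldc instruction takes its offset in vec4 (16-byte) units and bakes the starting component, counted in dwords, into the instruction. A minimal standalone sketch (plain C for illustration, not Mesa code) of the address split that both the removed load_ubo_ir3 path and the shared nir_lower_ubo_vec4 lowering come down to when the load is known to be dword-sized and suitably aligned:

#include <assert.h>

/* Standalone model of the vec4 UBO address split; not Mesa code. */
struct vec4_addr {
   unsigned vec4_offset; /* offset in units of four dwords (16 bytes) */
   unsigned component;   /* starting dword within that vec4 */
};

static struct vec4_addr
split_ubo_offset(unsigned byte_offset)
{
   struct vec4_addr a;
   a.vec4_offset = byte_offset >> 4;       /* byte offset -> vec4 units */
   a.component   = (byte_offset >> 2) & 3; /* dword within the vec4 */
   return a;
}

int
main(void)
{
   /* A load at byte offset 20 reads vec4 #1 starting at its .y component. */
   struct vec4_addr a = split_ubo_offset(20);
   assert(a.vec4_offset == 1 && a.component == 1);
   return 0;
}

That starting component is what ends up in the ldc instruction's cat6.d field in the backend diff below.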


@@ -815,12 +815,6 @@ intrinsic("ssbo_atomic_xor_ir3", src_comp=[1, 1, 1, 1], dest_comp=1, i
 intrinsic("ssbo_atomic_exchange_ir3", src_comp=[1, 1, 1, 1], dest_comp=1, indices=[ACCESS])
 intrinsic("ssbo_atomic_comp_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1, indices=[ACCESS])

-# IR3-specific instruction for UBO loads using the ldc instruction. The second
-# source is the indirect offset, in units of four dwords. The base is a
-# component offset, in dword units.
-intrinsic("load_ubo_ir3", src_comp=[1, 1], bit_sizes=[32], dest_comp=0, indices=[BASE],
-          flags=[CAN_REORDER, CAN_ELIMINATE])
-
 # System values for freedreno geometry shaders.
 system_value("vs_primitive_stride_ir3", 1)
 system_value("vs_vertex_stride_ir3", 1)


@@ -754,7 +754,7 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
    struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
    ldc->regs[0]->wrmask = MASK(ncomp);
    ldc->cat6.iim_val = ncomp;
-   ldc->cat6.d = nir_intrinsic_base(intr);
+   ldc->cat6.d = nir_intrinsic_component(intr);
    ldc->cat6.type = TYPE_U32;
    ir3_handle_bindless_cat6(ldc, intr->src[0]);

@@ -1647,7 +1647,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    case nir_intrinsic_load_ubo:
       emit_intrinsic_load_ubo(ctx, intr, dst);
       break;
-   case nir_intrinsic_load_ubo_ir3:
+   case nir_intrinsic_load_ubo_vec4:
       emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
       break;
    case nir_intrinsic_load_frag_coord:
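
As a rough worked example of the fields set in the hunk above (the numbers are assumed for illustration, not taken from the patch): a 2-component 32-bit load at UBO byte offset 20 reaches the backend as load_ubo_vec4 with an offset source of 1, in vec4 units, and nir_intrinsic_component() == 1, so the ldc writes channels .xy starting one dword into that vec4.

#include <assert.h>

int
main(void)
{
   unsigned byte_offset = 20;                     /* original load_ubo byte offset   */
   unsigned ncomp       = 2;                      /* intr->num_components            */
   unsigned vec4_offset = byte_offset >> 4;       /* load_ubo_vec4 offset source     */
   unsigned component   = (byte_offset >> 2) & 3; /* nir_intrinsic_component(intr)   */
   unsigned wrmask      = (1u << ncomp) - 1;      /* MASK(ncomp): low ncomp bits set */

   assert(vec4_offset == 1); /* ldc offset source, in units of four dwords */
   assert(component == 1);   /* ldc->cat6.d: start one dword into the vec4 */
   assert(wrmask == 0x3);    /* ldc->regs[0]->wrmask: writes .x and .y     */
   /* ldc->cat6.iim_val is simply ncomp, the number of components fetched. */
   return 0;
}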


@@ -461,6 +461,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
    /* UBO offset lowering has to come after we've decided what will
     * be left as load_ubo
     */
+   if (so->shader->compiler->gpu_id >= 600)
+      OPT_V(s, nir_lower_ubo_vec4);
+
    OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);

    if (progress)


@@ -327,8 +327,8 @@ instr_is_load_ubo(nir_instr *instr)
    nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;

-   /* ir3_nir_lower_io_offsets happens after this pass. */
-   assert(op != nir_intrinsic_load_ubo_ir3);
+   /* nir_lower_ubo_vec4 happens after this pass. */
+   assert(op != nir_intrinsic_load_ubo_vec4);

    return op == nir_intrinsic_load_ubo;
 }


@@ -253,81 +253,6 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
    return true;
 }

-static bool
-lower_offset_for_ubo(nir_intrinsic_instr *intrinsic, nir_builder *b, int gpu_id)
-{
-   /* We only need to lower offset if using LDC, which takes an offset in
-    * vec4 units and has the start component baked into the instruction.
-    */
-   if (gpu_id < 600)
-      return false;
-
-   /* TODO handle other bitsizes, including non-dword-aligned loads */
-   assert(intrinsic->dest.ssa.bit_size == 32);
-
-   b->cursor = nir_before_instr(&intrinsic->instr);
-
-   nir_intrinsic_instr *new_intrinsic =
-      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_ir3);
-
-   debug_assert(intrinsic->dest.is_ssa);
-
-   new_intrinsic->src[0] = intrinsic->src[0];
-
-   nir_ssa_def *offset = intrinsic->src[1].ssa;
-   nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -4);
-   if (!new_offset)
-      new_offset = nir_ushr(b, offset, nir_imm_int(b, 4));
-
-   new_intrinsic->src[1] = nir_src_for_ssa(new_offset);
-
-   unsigned align_mul = nir_intrinsic_align_mul(intrinsic);
-   unsigned align_offset = nir_intrinsic_align_offset(intrinsic);
-   unsigned components = intrinsic->num_components;
-
-   if (align_mul % 16 != 0)
-      components = 4;
-
-   new_intrinsic->num_components = components;
-   nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
-                     components, 32, NULL);
-   nir_builder_instr_insert(b, &new_intrinsic->instr);
-
-   nir_ssa_def *new_dest;
-   if (align_mul % 16 == 0) {
-      /* We know that the low 4 bits of the offset are constant and equal to
-       * align_offset. Use the component offset.
-       */
-      unsigned component = align_offset / 4;
-      nir_intrinsic_set_base(new_intrinsic, component);
-      new_dest = &new_intrinsic->dest.ssa;
-   } else {
-      /* We have to assume it isn't aligned, and extract the components
-       * dynamically.
-       */
-      nir_intrinsic_set_base(new_intrinsic, 0);
-      nir_ssa_def *component =
-         nir_iand(b, nir_ushr(b, offset, nir_imm_int(b, 2)), nir_imm_int(b, 3));
-      nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
-      for (unsigned i = 0; i < intrinsic->num_components; i++) {
-         nir_ssa_def *idx = nir_iadd(b, nir_imm_int(b, i), component);
-         channels[i] = nir_vector_extract(b, &new_intrinsic->dest.ssa, idx);
-      }
-      new_dest = nir_vec(b, channels, intrinsic->num_components);
-   }
-
-   nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
-                            nir_src_for_ssa(new_dest));
-   nir_instr_remove(&intrinsic->instr);
-
-   return true;
-}
 static bool
 lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id)
 {

@@ -339,12 +264,6 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_
       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

-      /* UBO */
-      if (intr->intrinsic == nir_intrinsic_load_ubo) {
-         progress |= lower_offset_for_ubo(intr, b, gpu_id);
-         continue;
-      }
-
       /* SSBO */
       int ir3_intrinsic;
       uint8_t offset_src_idx;
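
The removed lower_offset_for_ubo above is also the source of the extra comparisons the commit message says had to be optimized back out: whenever align_mul was not a multiple of 16 it fell back to loading a whole vec4 and pulling each channel out with nir_vector_extract on a computed index, and a vector extract with a non-constant index behaves like a compare-and-select chain. A simplified standalone model of that expansion (plain C with assumed semantics, not Mesa code):

#include <assert.h>

/* Roughly what one dynamically indexed component extract expands to:
 * one comparison and one select per candidate channel of the vec4. */
static unsigned
extract_component(const unsigned vec4[4], unsigned idx)
{
   unsigned result = vec4[3];
   for (int i = 2; i >= 0; i--)
      result = (idx == (unsigned)i) ? vec4[i] : result;
   return result;
}

int
main(void)
{
   const unsigned v[4] = { 10, 11, 12, 13 };
   assert(extract_component(v, 2) == 12);
   assert(extract_component(v, 0) == 10);
   return 0;
}

Handing this lowering to the shared nir_lower_ubo_vec4 pass instead of the driver-specific fallback is what the shader-db deltas in the commit message summarize.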