nir/opt_load_store_vectorize: improve handling of swizzles

Previously (for simplicity), the pass could skip vectorization when
swizzles were involved in the offset calculation.
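
Illustrative sketch only (standalone C, not the pass's actual code): the core
change is keying offsets on a (definition, component) pair - essentially what
nir_ssa_scalar carries - instead of a bare nir_ssa_def pointer, so an offset
built from a swizzled component such as v.y can still be tracked and matched:

/* Hedged, hypothetical stand-ins for nir_ssa_def / nir_ssa_scalar. */
#include <stdbool.h>
#include <stdio.h>

struct def { unsigned index; };                     /* stand-in for nir_ssa_def */
struct scalar { struct def *def; unsigned comp; };  /* stand-in for nir_ssa_scalar */

static bool scalar_equals(struct scalar a, struct scalar b)
{
   /* Two offsets only match if they read the same component of the same def. */
   return a.def == b.def && a.comp == b.comp;
}

int main(void)
{
   struct def v = { .index = 7 };
   struct scalar x = { .def = &v, .comp = 0 };  /* v.x */
   struct scalar y = { .def = &v, .comp = 1 };  /* v.y, i.e. a swizzle */

   /* With def-only keys these would wrongly look identical; with (def, comp)
    * keys the pass can tell them apart and still vectorize adjacent loads. */
   printf("x matches y? %s\n", scalar_equals(x, y) ? "yes" : "no");
   return 0;
}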

fossil-db (GFX10.3):
Totals from 498 (0.36% of 139391) affected shaders:
SGPRs: 25328 -> 26608 (+5.05%); split: -1.36%, +6.41%
VGPRs: 9988 -> 9996 (+0.08%)
SpillSGPRs: 40 -> 65 (+62.50%)
CodeSize: 1410188 -> 1385584 (-1.74%); split: -1.76%, +0.02%
Instrs: 257149 -> 250579 (-2.55%); split: -2.57%, +0.01%
Cycles: 1096892 -> 1070600 (-2.40%); split: -2.41%, +0.01%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10384>
Authored by Rhys Perry on 2020-09-23 16:17:12 +01:00; committed by Marge Bot
parent 4df3654c79
commit 6ca11b4a66
1 changed file with 55 additions and 51 deletions

@@ -159,7 +159,7 @@ struct entry_key {
    nir_ssa_def *resource;
    nir_variable *var;
    unsigned offset_def_count;
-   nir_ssa_def **offset_defs;
+   nir_ssa_scalar *offset_defs;
    uint64_t *offset_defs_mul;
 };
@@ -207,8 +207,10 @@ static uint32_t hash_entry_key(const void *key_)
       hash = XXH32(&mode, sizeof(mode), hash);
    }
-   for (unsigned i = 0; i < key->offset_def_count; i++)
-      hash = XXH32(&key->offset_defs[i]->index, sizeof(key->offset_defs[i]->index), hash);
+   for (unsigned i = 0; i < key->offset_def_count; i++) {
+      hash = XXH32(&key->offset_defs[i].def->index, sizeof(key->offset_defs[i].def->index), hash);
+      hash = XXH32(&key->offset_defs[i].comp, sizeof(key->offset_defs[i].comp), hash);
+   }
    hash = XXH32(key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t), hash);
@@ -226,11 +228,15 @@ static bool entry_key_equals(const void *a_, const void *b_)
    if (a->offset_def_count != b->offset_def_count)
       return false;
-   size_t offset_def_size = a->offset_def_count * sizeof(nir_ssa_def *);
+   for (unsigned i = 0; i < a->offset_def_count; i++) {
+      if (a->offset_defs[i].def != b->offset_defs[i].def ||
+          a->offset_defs[i].comp != b->offset_defs[i].comp)
+         return false;
+   }
    size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
    if (a->offset_def_count &&
-       (memcmp(a->offset_defs, b->offset_defs, offset_def_size) ||
-        memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size)))
+       memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size))
       return false;
    return true;
@@ -268,23 +274,19 @@ get_bit_size(struct entry *entry)
  * sources is a constant, update "def" to be the non-constant source, fill "c"
  * with the constant and return true. */
 static bool
-parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
+parse_alu(nir_ssa_scalar *def, nir_op op, uint64_t *c)
 {
-   nir_ssa_scalar scalar;
-   scalar.def = *def;
-   scalar.comp = 0;
-   if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != op)
+   if (!nir_ssa_scalar_is_alu(*def) || nir_ssa_scalar_alu_op(*def) != op)
       return false;
-   nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
-   nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
-   if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0) && src1.comp == 0) {
+   nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(*def, 0);
+   nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(*def, 1);
+   if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0)) {
       *c = nir_ssa_scalar_as_uint(src0);
-      *def = src1.def;
-   } else if (nir_ssa_scalar_is_const(src1) && src0.comp == 0) {
+      *def = src1;
+   } else if (nir_ssa_scalar_is_const(src1)) {
       *c = nir_ssa_scalar_as_uint(src1);
-      *def = src0.def;
+      *def = src0;
    } else {
       return false;
    }
@@ -293,11 +295,11 @@ parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
 /* Parses an offset expression such as "a * 16 + 4" and "(a * 16 + 4) * 64 + 32". */
 static void
-parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
+parse_offset(nir_ssa_scalar *base, uint64_t *base_mul, uint64_t *offset)
 {
-   if ((*base)->parent_instr->type == nir_instr_type_load_const) {
-      *offset = nir_src_comp_as_uint(nir_src_for_ssa(*base), 0);
-      *base = NULL;
+   if (nir_ssa_scalar_is_const(*base)) {
+      *offset = nir_ssa_scalar_as_uint(*base);
+      base->def = NULL;
       return;
    }
@@ -316,6 +318,11 @@ parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
       progress |= parse_alu(base, nir_op_iadd, &add2);
       add += add2 * mul;
+      if (nir_ssa_scalar_is_alu(*base) && nir_ssa_scalar_alu_op(*base) == nir_op_mov) {
+         *base = nir_ssa_scalar_chase_alu_src(*base, 0);
+         progress = true;
+      }
    } while (progress);
    *base_mul = mul;
@@ -337,22 +344,23 @@ mask_sign_extend(uint64_t val, unsigned bit_size)
 }
 static unsigned
-add_to_entry_key(nir_ssa_def **offset_defs, uint64_t *offset_defs_mul,
-                 unsigned offset_def_count, nir_ssa_def *def, uint64_t mul)
+add_to_entry_key(nir_ssa_scalar *offset_defs, uint64_t *offset_defs_mul,
+                 unsigned offset_def_count, nir_ssa_scalar def, uint64_t mul)
 {
-   mul = mask_sign_extend(mul, def->bit_size);
+   mul = mask_sign_extend(mul, def.def->bit_size);
    for (unsigned i = 0; i <= offset_def_count; i++) {
-      if (i == offset_def_count || def->index > offset_defs[i]->index) {
+      if (i == offset_def_count || def.def->index > offset_defs[i].def->index) {
          /* insert before i */
          memmove(offset_defs + i + 1, offset_defs + i,
-                 (offset_def_count - i) * sizeof(nir_ssa_def *));
+                 (offset_def_count - i) * sizeof(nir_ssa_scalar));
          memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
                  (offset_def_count - i) * sizeof(uint64_t));
         offset_defs[i] = def;
         offset_defs_mul[i] = mul;
         return 1;
-      } else if (def->index == offset_defs[i]->index) {
+      } else if (def.def == offset_defs[i].def &&
+                 def.comp == offset_defs[i].comp) {
         /* merge with offset_def at i */
         offset_defs_mul[i] += mul;
         return 0;
@@ -372,12 +380,12 @@ create_entry_key_from_deref(void *mem_ctx,
    while (path->path[path_len])
       path_len++;
-   nir_ssa_def *offset_defs_stack[32];
+   nir_ssa_scalar offset_defs_stack[32];
    uint64_t offset_defs_mul_stack[32];
-   nir_ssa_def **offset_defs = offset_defs_stack;
+   nir_ssa_scalar *offset_defs = offset_defs_stack;
    uint64_t *offset_defs_mul = offset_defs_mul_stack;
    if (path_len > 32) {
-      offset_defs = malloc(path_len * sizeof(nir_ssa_def *));
+      offset_defs = malloc(path_len * sizeof(nir_ssa_scalar));
       offset_defs_mul = malloc(path_len * sizeof(uint64_t));
    }
    unsigned offset_def_count = 0;
@@ -403,13 +411,13 @@ create_entry_key_from_deref(void *mem_ctx,
          nir_ssa_def *index = deref->arr.index.ssa;
          uint32_t stride = nir_deref_instr_array_stride(deref);
-         nir_ssa_def *base = index;
+         nir_ssa_scalar base = {.def=index, .comp=0};
          uint64_t offset = 0, base_mul = 1;
          parse_offset(&base, &base_mul, &offset);
          offset = mask_sign_extend(offset, index->bit_size);
          *offset_base += offset * stride;
-         if (base) {
+         if (base.def) {
             offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
                                                  offset_def_count,
                                                  base, base_mul * stride);
@@ -433,9 +441,9 @@ create_entry_key_from_deref(void *mem_ctx,
    }
    key->offset_def_count = offset_def_count;
-   key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, offset_def_count);
+   key->offset_defs = ralloc_array(mem_ctx, nir_ssa_scalar, offset_def_count);
    key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
-   memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_def *));
+   memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_scalar));
    memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));
    if (offset_defs != offset_defs_stack)
@@ -448,14 +456,14 @@ create_entry_key_from_deref(void *mem_ctx,
 static unsigned
 parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
-                            nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
+                            nir_ssa_scalar base, uint64_t base_mul, uint64_t *offset)
 {
    uint64_t new_mul;
    uint64_t new_offset;
    parse_offset(&base, &new_mul, &new_offset);
    *offset += new_offset * base_mul;
-   if (!base)
+   if (!base.def)
       return 0;
    base_mul *= new_mul;
@@ -463,19 +471,14 @@ parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
    assert(left >= 1);
    if (left >= 2) {
-      nir_ssa_scalar scalar;
-      scalar.def = base;
-      scalar.comp = 0;
-      if (nir_ssa_scalar_is_alu(scalar) && nir_ssa_scalar_alu_op(scalar) == nir_op_iadd) {
-         nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
-         nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
-         if (src0.comp == 0 && src1.comp == 0) {
-            unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0.def, base_mul, offset);
-            amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1.def, base_mul, offset);
-            return amount;
-         }
+      if (nir_ssa_scalar_is_alu(base) && nir_ssa_scalar_alu_op(base) == nir_op_iadd) {
+         nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(base, 0);
+         nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(base, 1);
+         unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0, base_mul, offset);
+         amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1, base_mul, offset);
+         return amount;
       }
    }
    return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
 }
@@ -487,16 +490,17 @@ create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul
    key->resource = NULL;
    key->var = NULL;
    if (base) {
-      nir_ssa_def *offset_defs[32];
+      nir_ssa_scalar offset_defs[32];
       uint64_t offset_defs_mul[32];
       key->offset_defs = offset_defs;
       key->offset_defs_mul = offset_defs_mul;
-      key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, base, base_mul, offset);
+      nir_ssa_scalar scalar = {.def=base, .comp=0};
+      key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, scalar, base_mul, offset);
-      key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, key->offset_def_count);
+      key->offset_defs = ralloc_array(mem_ctx, nir_ssa_scalar, key->offset_def_count);
       key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
-      memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_def *));
+      memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_scalar));
       memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
    } else {
       key->offset_def_count = 0;