nir/opt_load_store_vectorize: improve handling of swizzles
Previously, for simplicity, the pass could skip vectorization when swizzles were involved.

fossil-db (GFX10.3):
Totals from 498 (0.36% of 139391) affected shaders:
SGPRs: 25328 -> 26608 (+5.05%); split: -1.36%, +6.41%
VGPRs: 9988 -> 9996 (+0.08%)
SpillSGPRs: 40 -> 65 (+62.50%)
CodeSize: 1410188 -> 1385584 (-1.74%); split: -1.76%, +0.02%
Instrs: 257149 -> 250579 (-2.55%); split: -2.57%, +0.01%
Cycles: 1096892 -> 1070600 (-2.40%); split: -2.41%, +0.01%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10384>
This commit is contained in:
parent
4df3654c79
commit
6ca11b4a66
|
@ -159,7 +159,7 @@ struct entry_key {
|
||||||
nir_ssa_def *resource;
|
nir_ssa_def *resource;
|
||||||
nir_variable *var;
|
nir_variable *var;
|
||||||
unsigned offset_def_count;
|
unsigned offset_def_count;
|
||||||
nir_ssa_def **offset_defs;
|
nir_ssa_scalar *offset_defs;
|
||||||
uint64_t *offset_defs_mul;
|
uint64_t *offset_defs_mul;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -207,8 +207,10 @@ static uint32_t hash_entry_key(const void *key_)
|
||||||
hash = XXH32(&mode, sizeof(mode), hash);
|
hash = XXH32(&mode, sizeof(mode), hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (unsigned i = 0; i < key->offset_def_count; i++)
|
for (unsigned i = 0; i < key->offset_def_count; i++) {
|
||||||
hash = XXH32(&key->offset_defs[i]->index, sizeof(key->offset_defs[i]->index), hash);
|
hash = XXH32(&key->offset_defs[i].def->index, sizeof(key->offset_defs[i].def->index), hash);
|
||||||
|
hash = XXH32(&key->offset_defs[i].comp, sizeof(key->offset_defs[i].comp), hash);
|
||||||
|
}
|
||||||
|
|
||||||
hash = XXH32(key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t), hash);
|
hash = XXH32(key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t), hash);
|
||||||
|
|
||||||
|
@ -226,11 +228,15 @@ static bool entry_key_equals(const void *a_, const void *b_)
|
||||||
if (a->offset_def_count != b->offset_def_count)
|
if (a->offset_def_count != b->offset_def_count)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
size_t offset_def_size = a->offset_def_count * sizeof(nir_ssa_def *);
|
for (unsigned i = 0; i < a->offset_def_count; i++) {
|
||||||
|
if (a->offset_defs[i].def != b->offset_defs[i].def ||
|
||||||
|
a->offset_defs[i].comp != b->offset_defs[i].comp)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
|
size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
|
||||||
if (a->offset_def_count &&
|
if (a->offset_def_count &&
|
||||||
(memcmp(a->offset_defs, b->offset_defs, offset_def_size) ||
|
memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size))
|
||||||
memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size)))
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -268,23 +274,19 @@ get_bit_size(struct entry *entry)
|
||||||
* sources is a constant, update "def" to be the non-constant source, fill "c"
|
* sources is a constant, update "def" to be the non-constant source, fill "c"
|
||||||
* with the constant and return true. */
|
* with the constant and return true. */
|
||||||
static bool
|
static bool
|
||||||
parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
|
parse_alu(nir_ssa_scalar *def, nir_op op, uint64_t *c)
|
||||||
{
|
{
|
||||||
nir_ssa_scalar scalar;
|
if (!nir_ssa_scalar_is_alu(*def) || nir_ssa_scalar_alu_op(*def) != op)
|
||||||
scalar.def = *def;
|
|
||||||
scalar.comp = 0;
|
|
||||||
|
|
||||||
if (!nir_ssa_scalar_is_alu(scalar) || nir_ssa_scalar_alu_op(scalar) != op)
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
|
nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(*def, 0);
|
||||||
nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
|
nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(*def, 1);
|
||||||
if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0) && src1.comp == 0) {
|
if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0)) {
|
||||||
*c = nir_ssa_scalar_as_uint(src0);
|
*c = nir_ssa_scalar_as_uint(src0);
|
||||||
*def = src1.def;
|
*def = src1;
|
||||||
} else if (nir_ssa_scalar_is_const(src1) && src0.comp == 0) {
|
} else if (nir_ssa_scalar_is_const(src1)) {
|
||||||
*c = nir_ssa_scalar_as_uint(src1);
|
*c = nir_ssa_scalar_as_uint(src1);
|
||||||
*def = src0.def;
|
*def = src0;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -293,11 +295,11 @@ parse_alu(nir_ssa_def **def, nir_op op, uint64_t *c)
|
||||||
|
|
||||||
/* Parses an offset expression such as "a * 16 + 4" and "(a * 16 + 4) * 64 + 32". */
|
/* Parses an offset expression such as "a * 16 + 4" and "(a * 16 + 4) * 64 + 32". */
|
||||||
static void
|
static void
|
||||||
parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
|
parse_offset(nir_ssa_scalar *base, uint64_t *base_mul, uint64_t *offset)
|
||||||
{
|
{
|
||||||
if ((*base)->parent_instr->type == nir_instr_type_load_const) {
|
if (nir_ssa_scalar_is_const(*base)) {
|
||||||
*offset = nir_src_comp_as_uint(nir_src_for_ssa(*base), 0);
|
*offset = nir_ssa_scalar_as_uint(*base);
|
||||||
*base = NULL;
|
base->def = NULL;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -316,6 +318,11 @@ parse_offset(nir_ssa_def **base, uint64_t *base_mul, uint64_t *offset)
|
||||||
|
|
||||||
progress |= parse_alu(base, nir_op_iadd, &add2);
|
progress |= parse_alu(base, nir_op_iadd, &add2);
|
||||||
add += add2 * mul;
|
add += add2 * mul;
|
||||||
|
|
||||||
|
if (nir_ssa_scalar_is_alu(*base) && nir_ssa_scalar_alu_op(*base) == nir_op_mov) {
|
||||||
|
*base = nir_ssa_scalar_chase_alu_src(*base, 0);
|
||||||
|
progress = true;
|
||||||
|
}
|
||||||
} while (progress);
|
} while (progress);
|
||||||
|
|
||||||
*base_mul = mul;
|
*base_mul = mul;
|
||||||
|
@ -337,22 +344,23 @@ mask_sign_extend(uint64_t val, unsigned bit_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned
|
static unsigned
|
||||||
add_to_entry_key(nir_ssa_def **offset_defs, uint64_t *offset_defs_mul,
|
add_to_entry_key(nir_ssa_scalar *offset_defs, uint64_t *offset_defs_mul,
|
||||||
unsigned offset_def_count, nir_ssa_def *def, uint64_t mul)
|
unsigned offset_def_count, nir_ssa_scalar def, uint64_t mul)
|
||||||
{
|
{
|
||||||
mul = mask_sign_extend(mul, def->bit_size);
|
mul = mask_sign_extend(mul, def.def->bit_size);
|
||||||
|
|
||||||
for (unsigned i = 0; i <= offset_def_count; i++) {
|
for (unsigned i = 0; i <= offset_def_count; i++) {
|
||||||
if (i == offset_def_count || def->index > offset_defs[i]->index) {
|
if (i == offset_def_count || def.def->index > offset_defs[i].def->index) {
|
||||||
/* insert before i */
|
/* insert before i */
|
||||||
memmove(offset_defs + i + 1, offset_defs + i,
|
memmove(offset_defs + i + 1, offset_defs + i,
|
||||||
(offset_def_count - i) * sizeof(nir_ssa_def *));
|
(offset_def_count - i) * sizeof(nir_ssa_scalar));
|
||||||
memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
|
memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
|
||||||
(offset_def_count - i) * sizeof(uint64_t));
|
(offset_def_count - i) * sizeof(uint64_t));
|
||||||
offset_defs[i] = def;
|
offset_defs[i] = def;
|
||||||
offset_defs_mul[i] = mul;
|
offset_defs_mul[i] = mul;
|
||||||
return 1;
|
return 1;
|
||||||
} else if (def->index == offset_defs[i]->index) {
|
} else if (def.def == offset_defs[i].def &&
|
||||||
|
def.comp == offset_defs[i].comp) {
|
||||||
/* merge with offset_def at i */
|
/* merge with offset_def at i */
|
||||||
offset_defs_mul[i] += mul;
|
offset_defs_mul[i] += mul;
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -372,12 +380,12 @@ create_entry_key_from_deref(void *mem_ctx,
|
||||||
while (path->path[path_len])
|
while (path->path[path_len])
|
||||||
path_len++;
|
path_len++;
|
||||||
|
|
||||||
nir_ssa_def *offset_defs_stack[32];
|
nir_ssa_scalar offset_defs_stack[32];
|
||||||
uint64_t offset_defs_mul_stack[32];
|
uint64_t offset_defs_mul_stack[32];
|
||||||
nir_ssa_def **offset_defs = offset_defs_stack;
|
nir_ssa_scalar *offset_defs = offset_defs_stack;
|
||||||
uint64_t *offset_defs_mul = offset_defs_mul_stack;
|
uint64_t *offset_defs_mul = offset_defs_mul_stack;
|
||||||
if (path_len > 32) {
|
if (path_len > 32) {
|
||||||
offset_defs = malloc(path_len * sizeof(nir_ssa_def *));
|
offset_defs = malloc(path_len * sizeof(nir_ssa_scalar));
|
||||||
offset_defs_mul = malloc(path_len * sizeof(uint64_t));
|
offset_defs_mul = malloc(path_len * sizeof(uint64_t));
|
||||||
}
|
}
|
||||||
unsigned offset_def_count = 0;
|
unsigned offset_def_count = 0;
|
||||||
|
@ -403,13 +411,13 @@ create_entry_key_from_deref(void *mem_ctx,
|
||||||
nir_ssa_def *index = deref->arr.index.ssa;
|
nir_ssa_def *index = deref->arr.index.ssa;
|
||||||
uint32_t stride = nir_deref_instr_array_stride(deref);
|
uint32_t stride = nir_deref_instr_array_stride(deref);
|
||||||
|
|
||||||
nir_ssa_def *base = index;
|
nir_ssa_scalar base = {.def=index, .comp=0};
|
||||||
uint64_t offset = 0, base_mul = 1;
|
uint64_t offset = 0, base_mul = 1;
|
||||||
parse_offset(&base, &base_mul, &offset);
|
parse_offset(&base, &base_mul, &offset);
|
||||||
offset = mask_sign_extend(offset, index->bit_size);
|
offset = mask_sign_extend(offset, index->bit_size);
|
||||||
|
|
||||||
*offset_base += offset * stride;
|
*offset_base += offset * stride;
|
||||||
if (base) {
|
if (base.def) {
|
||||||
offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
|
offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
|
||||||
offset_def_count,
|
offset_def_count,
|
||||||
base, base_mul * stride);
|
base, base_mul * stride);
|
||||||
|
@ -433,9 +441,9 @@ create_entry_key_from_deref(void *mem_ctx,
|
||||||
}
|
}
|
||||||
|
|
||||||
key->offset_def_count = offset_def_count;
|
key->offset_def_count = offset_def_count;
|
||||||
key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, offset_def_count);
|
key->offset_defs = ralloc_array(mem_ctx, nir_ssa_scalar, offset_def_count);
|
||||||
key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
|
key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
|
||||||
memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_def *));
|
memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_scalar));
|
||||||
memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));
|
memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));
|
||||||
|
|
||||||
if (offset_defs != offset_defs_stack)
|
if (offset_defs != offset_defs_stack)
|
||||||
|
@ -448,14 +456,14 @@ create_entry_key_from_deref(void *mem_ctx,
|
||||||
|
|
||||||
static unsigned
|
static unsigned
|
||||||
parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
|
parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
|
||||||
nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
|
nir_ssa_scalar base, uint64_t base_mul, uint64_t *offset)
|
||||||
{
|
{
|
||||||
uint64_t new_mul;
|
uint64_t new_mul;
|
||||||
uint64_t new_offset;
|
uint64_t new_offset;
|
||||||
parse_offset(&base, &new_mul, &new_offset);
|
parse_offset(&base, &new_mul, &new_offset);
|
||||||
*offset += new_offset * base_mul;
|
*offset += new_offset * base_mul;
|
||||||
|
|
||||||
if (!base)
|
if (!base.def)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
base_mul *= new_mul;
|
base_mul *= new_mul;
|
||||||
|
@ -463,19 +471,14 @@ parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
|
||||||
assert(left >= 1);
|
assert(left >= 1);
|
||||||
|
|
||||||
if (left >= 2) {
|
if (left >= 2) {
|
||||||
nir_ssa_scalar scalar;
|
if (nir_ssa_scalar_is_alu(base) && nir_ssa_scalar_alu_op(base) == nir_op_iadd) {
|
||||||
scalar.def = base;
|
nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(base, 0);
|
||||||
scalar.comp = 0;
|
nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(base, 1);
|
||||||
if (nir_ssa_scalar_is_alu(scalar) && nir_ssa_scalar_alu_op(scalar) == nir_op_iadd) {
|
unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0, base_mul, offset);
|
||||||
nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
|
amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1, base_mul, offset);
|
||||||
nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
|
|
||||||
if (src0.comp == 0 && src1.comp == 0) {
|
|
||||||
unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0.def, base_mul, offset);
|
|
||||||
amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1.def, base_mul, offset);
|
|
||||||
return amount;
|
return amount;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
|
return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
|
||||||
}
|
}
|
||||||
|
@ -487,16 +490,17 @@ create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul
|
||||||
key->resource = NULL;
|
key->resource = NULL;
|
||||||
key->var = NULL;
|
key->var = NULL;
|
||||||
if (base) {
|
if (base) {
|
||||||
nir_ssa_def *offset_defs[32];
|
nir_ssa_scalar offset_defs[32];
|
||||||
uint64_t offset_defs_mul[32];
|
uint64_t offset_defs_mul[32];
|
||||||
key->offset_defs = offset_defs;
|
key->offset_defs = offset_defs;
|
||||||
key->offset_defs_mul = offset_defs_mul;
|
key->offset_defs_mul = offset_defs_mul;
|
||||||
|
|
||||||
key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, base, base_mul, offset);
|
nir_ssa_scalar scalar = {.def=base, .comp=0};
|
||||||
|
key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, scalar, base_mul, offset);
|
||||||
|
|
||||||
key->offset_defs = ralloc_array(mem_ctx, nir_ssa_def *, key->offset_def_count);
|
key->offset_defs = ralloc_array(mem_ctx, nir_ssa_scalar, key->offset_def_count);
|
||||||
key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
|
key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
|
||||||
memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_def *));
|
memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_scalar));
|
||||||
memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
|
memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
|
||||||
} else {
|
} else {
|
||||||
key->offset_def_count = 0;
|
key->offset_def_count = 0;
|
||||||
|
|
Loading…
Reference in New Issue