From 0c96b03fcf90ad3167e156068a5662feed7b7e19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Tue, 2 Apr 2024 10:25:29 +0200 Subject: [PATCH] r300: better packing for immediates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit How does this work? First we check which immediates are used as vectors, i.e., have any reads that are using 2 or more channels. Such immediates will be placed in free slots (but only the specific channels that are used in the vector). This way we don't have to worry about swizzling restrictions. The remaining scalar immediates will be checked for duplicates and placed in free slots, including any empty slots in previously placed vector immediates (any swizzle is valid for scalars). RV410: total instructions in shared programs: 98883 -> 98905 (0.02%) instructions in affected programs: 15414 -> 15436 (0.14%) helped: 100 HURT: 102 total presub in shared programs: 2235 -> 2235 (0.00%) presub in affected programs: 608 -> 608 (0.00%) helped: 51 HURT: 72 total omod in shared programs: 419 -> 418 (-0.24%) omod in affected programs: 15 -> 14 (-6.67%) helped: 3 HURT: 3 total temps in shared programs: 15698 -> 15692 (-0.04%) temps in affected programs: 952 -> 946 (-0.63%) helped: 46 HURT: 37 total consts in shared programs: 84458 -> 83856 (-0.71%) consts in affected programs: 14648 -> 14046 (-4.11%) helped: 499 HURT: 0 total cycles in shared programs: 156476 -> 156493 (0.01%) cycles in affected programs: 22532 -> 22549 (0.08%) helped: 100 HURT: 102 LOST: shaders/ck2/157.shader_test FS GAINED: shaders/ck2/160.shader_test FS GAINED: shaders/tesseract/395.shader_test FS RV530: total instructions in shared programs: 119543 -> 119612 (0.06%) instructions in affected programs: 27435 -> 27504 (0.25%) helped: 118 HURT: 183 total presub in shared programs: 7257 -> 7111 (-2.01%) presub in affected programs: 1856 -> 1710 (-7.87%) helped: 121 HURT: 48 total omod in shared programs: 426 -> 427 (0.23%) 
omod in affected programs: 5 -> 6 (20.00%) helped: 1 HURT: 2 total temps in shared programs: 16784 -> 16779 (-0.03%) temps in affected programs: 392 -> 387 (-1.28%) helped: 29 HURT: 17 total consts in shared programs: 93198 -> 92667 (-0.57%) consts in affected programs: 14577 -> 14046 (-3.64%) helped: 451 HURT: 0 total cycles in shared programs: 186649 -> 186590 (-0.03%) cycles in affected programs: 26306 -> 26247 (-0.22%) helped: 125 HURT: 111 Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Part-of: --- .../r300/compiler/radeon_remove_constants.c | 66 ++++++++++++++++++- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/radeon_remove_constants.c b/src/gallium/drivers/r300/compiler/radeon_remove_constants.c index ef65925271496..dc3e782e8add1 100644 --- a/src/gallium/drivers/r300/compiler/radeon_remove_constants.c +++ b/src/gallium/drivers/r300/compiler/radeon_remove_constants.c @@ -18,6 +18,8 @@ struct const_remap_state { struct rc_constant *constants; /* New constant layout. */ struct rc_constant_list new_constants; + /* Marks immediates that are used as a vector. Those will be just copied. 
*/ + uint8_t *is_used_as_vector; bool has_rel_addr; bool are_externals_remapped; bool is_identity; @@ -48,6 +50,7 @@ static void mark_used(void * userdata, struct rc_instruction * inst, struct const_remap_state* d = userdata; if (src->File == RC_FILE_CONSTANT) { + uint8_t mask = 0; if (src->RelAddr) { d->has_rel_addr = true; } else { @@ -55,9 +58,14 @@ static void mark_used(void * userdata, struct rc_instruction * inst, char swz = GET_SWZ(src->Swizzle, chan); if (swz > RC_SWIZZLE_W) continue; - d->constants[src->Index].UseMask |= 1 << swz; + mask |= 1 << swz; } } + d->constants[src->Index].UseMask |= mask; + if (d->constants[src->Index].Type == RC_CONSTANT_IMMEDIATE && + util_bitcount(mask) > 1) { + d->is_used_as_vector[src->Index] |= mask; + } } } @@ -82,6 +90,26 @@ static void place_constant_in_free_slot(struct const_remap_state *s, unsigned i) s->new_constants.Count++; } +static void place_immediate_in_free_slot(struct const_remap_state *s, unsigned i) +{ + assert(util_bitcount(s->is_used_as_vector[i]) > 1); + + unsigned count = s->new_constants.Count; + + s->new_constants.Constants[count] = s->constants[i]; + s->new_constants.Constants[count].UseMask = s->is_used_as_vector[i]; + for (unsigned chan = 0; chan < 4; chan++) { + if (s->constants[i].UseMask & 1 << chan & s->is_used_as_vector[i]) { + s->inv_remap_table[i].index[chan] = count; + s->inv_remap_table[i].swizzle[chan] = chan; + } + } + if (count != i) { + s->is_identity = false; + } + s->new_constants.Count++; +} + static void try_merge_constants_external(struct const_remap_state *s, unsigned i) { assert(util_bitcount(s->constants[i].UseMask) == 1); @@ -110,10 +138,12 @@ static void try_merge_constants_external(struct const_remap_state *s, unsigned i static void init_constant_remap_state(struct radeon_compiler *c, struct const_remap_state *s) { s->is_identity = true; + s->is_used_as_vector = malloc(c->Program.Constants.Count); s->new_constants.Constants = malloc(sizeof(struct rc_constant) * 
c->Program.Constants.Count); s->new_constants._Reserved = c->Program.Constants.Count; s->constants = c->Program.Constants.Constants; + memset(s->is_used_as_vector, 0, c->Program.Constants.Count); s->remap_table = malloc(c->Program.Constants.Count * sizeof(struct const_remap)); s->inv_remap_table = @@ -179,9 +209,39 @@ void rc_remove_unused_constants(struct radeon_compiler *c, void *user) try_merge_constants_external(s, i); } - /* Now put the immediates and state constants. */ + /* Now put immediates which are used as vectors. */ for (unsigned i = 0; i < c->Program.Constants.Count; i++) { - if (constants[i].Type == RC_CONSTANT_EXTERNAL) + if (constants[i].Type == RC_CONSTANT_IMMEDIATE && + util_bitcount(s->constants[i].UseMask) > 0 && + util_bitcount(s->is_used_as_vector[i]) > 0) { + place_immediate_in_free_slot(s, i); + } + } + + /* Now walk over scalar immediates and try to: + * a) check for duplicates, + * b) find free slot. + * All of this is already done by rc_constants_add_immediate_scalar, + * so just use it. + */ + for (unsigned i = 0; i < c->Program.Constants.Count; i++) { + if (constants[i].Type != RC_CONSTANT_IMMEDIATE) + continue; + for (unsigned chan = 0; chan < 4; chan++) { + if ((s->constants[i].UseMask) & (1 << chan) && + (~(s->is_used_as_vector[i]) & (1 << chan))) { + unsigned swz; + s->inv_remap_table[i].index[chan] = + rc_constants_add_immediate_scalar(&s->new_constants, constants[i].u.Immediate[chan], &swz); + s->inv_remap_table[i].swizzle[chan] = GET_SWZ(swz, 0); + s->is_identity = false; + } + } + } + + /* Finally place state constants. */ + for (unsigned i = 0; i < c->Program.Constants.Count; i++) { + if (constants[i].Type != RC_CONSTANT_STATE) continue; if (util_bitcount(s->constants[i].UseMask) > 0) { place_constant_in_free_slot(s, i);