util: add a util_bitcount variant that selects POPCNT through C++ template arg

Moved from radeonsi. st/mesa will use it.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13512>
This commit is contained in:
Marek Olšák 2021-10-23 23:23:15 -04:00 committed by Marge Bot
parent e1c640c3a4
commit 81d35c8d48
2 changed files with 29 additions and 24 deletions

View File

@@ -1728,41 +1728,24 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex
#endif
/* util_bitcount has large measurable overhead (~2% difference in viewperf), so we use
* the POPCNT x86 instruction via inline assembly if the CPU supports it.
*/
/* NOTE(review): radeonsi-local POPCNT selector being REMOVED by this commit;
 * it is superseded by the identical `util_popcnt` enum added to util/bitscan.h
 * (see the second hunk below), so st/mesa can share it. */
enum si_has_popcnt {
POPCNT_NO,
POPCNT_YES,
};
/* NOTE(review): helper being REMOVED by this commit. It selected, at
 * compile time via the template argument, between the POPCNT inline-asm
 * popcount and the generic util_bitcount() fallback. Its replacement,
 * util_bitcount_fast<>() in util/bitscan.h, has byte-for-byte identical
 * logic, so callers see no behavior change. */
template<si_has_popcnt POPCNT>
unsigned bitcount_asm(unsigned n)
{
if (POPCNT == POPCNT_YES)
return util_popcnt_inline_asm(n);
else
return util_bitcount(n);
}
/* NOTE(review): diff rendering without +/- markers — the first template line
 * (si_has_popcnt) is the REMOVED version, the second (util_popcnt) is its
 * replacement; only the enum type name changes. */
template<si_has_popcnt POPCNT>
template<util_popcnt POPCNT>
/* Pops the next set bit from *partial_velem_mask (u_bit_scan advances the
 * mask) and returns that element's index within pipe_vertex_state. */
static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state,
uint32_t *partial_velem_mask)
{
unsigned semantic_index = u_bit_scan(partial_velem_mask);
assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index));
/* A prefix mask of the full mask gives us the index in pipe_vertex_state. */
/* NOTE(review): old call REMOVED below; util_bitcount_fast<> replaces
 * bitcount_asm<> with identical semantics. */
return bitcount_asm<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
return util_bitcount_fast<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
struct pipe_vertex_state *state,
uint32_t partial_velem_mask)
{
struct si_vertex_state *vstate = (struct si_vertex_state *)state;
unsigned count = IS_DRAW_VERTEX_STATE ? bitcount_asm<POPCNT>(partial_velem_mask) :
unsigned count = IS_DRAW_VERTEX_STATE ? util_bitcount_fast<POPCNT>(partial_velem_mask) :
sctx->num_vertex_elements;
unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
PIPE_SHADER_VERTEX);
@@ -2031,7 +2014,7 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
} while (0)
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
static void si_draw(struct pipe_context *ctx,
const struct pipe_draw_info *info,
unsigned drawid_offset,
@@ -2501,7 +2484,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
si_has_popcnt POPCNT>
util_popcnt POPCNT>
static void si_draw_vertex_state(struct pipe_context *ctx,
struct pipe_vertex_state *vstate,
uint32_t partial_velem_mask,

View File

@@ -351,6 +351,28 @@ util_bitcount64(uint64_t n)
#ifdef __cplusplus
}
#endif
/* util_bitcount has large measurable overhead (~2%), so it's recommended to
* use the POPCNT instruction via inline assembly if the CPU supports it.
*/
/* Compile-time selector used as a C++ non-type template argument (see
 * util_bitcount_fast below): whether the x86 POPCNT instruction may be
 * emitted. Presumably callers gate POPCNT_YES on a runtime CPU-capability
 * check — TODO(review): confirm at instantiation sites. */
enum util_popcnt {
POPCNT_NO,   /* use the generic util_bitcount() fallback */
POPCNT_YES,  /* use util_popcnt_inline_asm() */
};
/* Convenient function to select popcnt through a C++ template argument.
 * This should be used as part of larger functions that are optimized
 * as a whole.
 *
 * \tparam POPCNT  POPCNT_YES to use the inline-asm POPCNT path,
 *                 POPCNT_NO for the portable util_bitcount() fallback.
 * \param n        bitmask to count
 * \return         number of set bits in n
 *
 * The branch on POPCNT is a compile-time constant, so each instantiation
 * contains exactly one of the two calls with no runtime check.
 */
template<util_popcnt POPCNT> inline unsigned
util_bitcount_fast(unsigned n)
{
if (POPCNT == POPCNT_YES)
return util_popcnt_inline_asm(n);
else
return util_bitcount(n);
}
#endif /* __cplusplus */
#endif /* BITSCAN_H */