From 81d35c8d48508e1d28724755af28a6c7572516e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 23 Oct 2021 23:23:15 -0400 Subject: [PATCH] util: add a util_bitcount variant that selects POPCNT through C++ template arg Moved from radeonsi. st/mesa will use it. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_state_draw.cpp | 29 ++++--------------- src/util/bitscan.h | 24 ++++++++++++++- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 62efba1cc70..ecaa6cec016 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1728,41 +1728,24 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex #endif -/* util_bitcount has large measurable overhead (~2% difference in viewperf), so we use - * the POPCNT x86 instruction via inline assembly if the CPU supports it. - */ -enum si_has_popcnt { - POPCNT_NO, - POPCNT_YES, -}; - -template -unsigned bitcount_asm(unsigned n) -{ - if (POPCNT == POPCNT_YES) - return util_popcnt_inline_asm(n); - else - return util_bitcount(n); -} - -template +template static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state, uint32_t *partial_velem_mask) { unsigned semantic_index = u_bit_scan(partial_velem_mask); assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index)); /* A prefix mask of the full mask gives us the index in pipe_vertex_state. 
*/ - return bitcount_asm(state->input.full_velem_mask & BITFIELD_MASK(semantic_index)); + return util_bitcount_fast(state->input.full_velem_mask & BITFIELD_MASK(semantic_index)); } template ALWAYS_INLINE + si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx, struct pipe_vertex_state *state, uint32_t partial_velem_mask) { struct si_vertex_state *vstate = (struct si_vertex_state *)state; - unsigned count = IS_DRAW_VERTEX_STATE ? bitcount_asm(partial_velem_mask) : + unsigned count = IS_DRAW_VERTEX_STATE ? util_bitcount_fast(partial_velem_mask) : sctx->num_vertex_elements; unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, PIPE_SHADER_VERTEX); @@ -2031,7 +2014,7 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i } while (0) template ALWAYS_INLINE + si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE static void si_draw(struct pipe_context *ctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -2501,7 +2484,7 @@ static void si_draw_vbo(struct pipe_context *ctx, } template + util_popcnt POPCNT> static void si_draw_vertex_state(struct pipe_context *ctx, struct pipe_vertex_state *vstate, uint32_t partial_velem_mask, diff --git a/src/util/bitscan.h b/src/util/bitscan.h index 105b7ba3122..82b1bb5a1dd 100644 --- a/src/util/bitscan.h +++ b/src/util/bitscan.h @@ -351,6 +351,28 @@ util_bitcount64(uint64_t n) #ifdef __cplusplus } -#endif + +/* util_bitcount has large measurable overhead (~2%), so it's recommended to + * use the POPCNT instruction via inline assembly if the CPU supports it. + */ +enum util_popcnt { + POPCNT_NO, + POPCNT_YES, +}; + +/* Convenient function to select popcnt through a C++ template argument. + * This should be used as part of larger functions that are optimized + * as a whole. 
+ */ +template<util_popcnt POPCNT> inline unsigned +util_bitcount_fast(unsigned n) +{ + if (POPCNT == POPCNT_YES) + return util_popcnt_inline_asm(n); + else + return util_bitcount(n); +} + +#endif /* __cplusplus */ #endif /* BITSCAN_H */