swr: fix _BitScanForward64 on unix

it must apply to 64 bits types, and use the ctzll intrinsic instead of ctz

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Krzysztof Raszkowski <krzysztof.raszkowski@intel.com>
Reviewed-by: Jan Zielinski <jan.zielinski@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6705>
This commit is contained in:
Michel Zou 2020-09-15 21:08:06 +02:00 committed by Marge Bot
parent 82c49a66c0
commit 12b8ad8f21
15 changed files with 37 additions and 40 deletions

View File

@ -70,7 +70,7 @@ UINT pdep_u32(UINT a, UINT mask)
// copied from http://wm.ite.pl/articles/pdep-soft-emu.html
// using bsf instead of funky loop
DWORD maskIndex;
unsigned long maskIndex = 0;
while (_BitScanForward(&maskIndex, mask))
{
// 1. isolate lowest set bit of mask
@ -100,7 +100,7 @@ UINT pext_u32(UINT a, UINT mask)
return _pext_u32(a, mask);
#else
UINT result = 0;
DWORD maskIndex;
unsigned long maskIndex;
uint32_t currentBit = 0;
while (_BitScanForward(&maskIndex, mask))
{

View File

@ -224,33 +224,30 @@ static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
#endif
#endif
inline unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask)
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
*Index = __builtin_ctzll(Mask);
return (Mask != 0);
}
inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
{
*Index = __builtin_ctz(Mask);
return (Mask != 0);
}
inline unsigned char _BitScanForward(unsigned int* Index, unsigned int Mask)
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
*Index = __builtin_ctz(Mask);
*Index = 63 - __builtin_clzll(Mask);
return (Mask != 0);
}
inline unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask)
{
*Index = 63 - __builtin_clz(Mask);
return (Mask != 0);
}
inline unsigned char _BitScanReverse(unsigned int* Index, unsigned int Mask)
inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
{
*Index = 31 - __builtin_clz(Mask);
return (Mask != 0);
}
#define _BitScanForward64 _BitScanForward
#define _BitScanReverse64 _BitScanReverse
inline void* AlignedMalloc(size_t size, size_t alignment)
{
void* ret;

View File

@ -469,7 +469,7 @@ static SIMDINLINE Float SIMDCALL
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
DWORD index;
unsigned long index;
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{

View File

@ -635,7 +635,7 @@ static SIMDINLINE Float SIMDCALL
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
DWORD index;
unsigned long index = 0;
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{

View File

@ -987,7 +987,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
streamMasks |= pState->state.soState.streamMasks[i];
}
DWORD maxAttrib;
unsigned long maxAttrib;
if (_BitScanReverse64(&maxAttrib, streamMasks))
{
pState->state.feNumAttributes =
@ -1027,7 +1027,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
// Disable hottile for surfaces with no writes
if (psState.pfnPixelShader != nullptr)
{
DWORD rt;
unsigned long rt;
uint32_t rtMask = pState->state.psState.renderTargetMask;
while (_BitScanForward(&rt, rtMask))
{

View File

@ -609,7 +609,7 @@ inline void SetupRenderBuffers(uint8_t* pColorBuffer[SWR_NUM_RENDERT
uint32_t colorHotTileMask,
RenderOutputBuffers& renderBuffers)
{
DWORD index;
unsigned long index;
while (_BitScanForward(&index, colorHotTileMask))
{
assert(index < SWR_NUM_RENDERTARGETS);
@ -937,7 +937,7 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
simdvector blendSrc;
simdvector blendOut;
DWORD rt;
unsigned long rt;
while (_BitScanForward(&rt, renderTargetMask))
{
renderTargetMask &= ~(1 << rt);
@ -1250,7 +1250,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
if (useAlternateOffset)
{
DWORD rt;
unsigned long rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{

View File

@ -302,7 +302,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
if (useAlternateOffset)
{
DWORD rt;
unsigned long rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{

View File

@ -285,7 +285,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
if (useAlternateOffset)
{
DWORD rt;
unsigned long rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{

View File

@ -179,7 +179,7 @@ INLINE void ProcessAttributes(
uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
if (mask)
{
DWORD comp;
unsigned long comp;
while (_BitScanForward(&comp, mask))
{
mask &= ~(1 << comp);
@ -245,7 +245,7 @@ void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
float* pRecipW,
float* pUserClipBuffer)
{
DWORD clipDist;
unsigned long clipDist;
uint32_t clipDistMask = state.clipDistanceMask;
while (_BitScanForward(&clipDist, clipDistMask))
{
@ -1122,7 +1122,7 @@ endBinTriangles:
TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
// scan remaining valid triangles and bin each separately
while (_BitScanForward((DWORD*)&triIndex, triMask))
while (_BitScanForward((unsigned long*)&triIndex, triMask))
{
uint32_t linkageCount = state.backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4;
@ -1363,7 +1363,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
// scan remaining valid triangles and bin each separately
while (_BitScanForward((DWORD*)&primIndex, primMask))
while (_BitScanForward((unsigned long*)&primIndex, primMask))
{
uint32_t linkageCount = backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4;
@ -1519,7 +1519,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
// scan remaining valid prims and bin each separately
const SWR_BACKEND_STATE& backendState = state.backendState;
uint32_t primIndex;
while (_BitScanForward((DWORD*)&primIndex, primMask))
while (_BitScanForward((unsigned long*)&primIndex, primMask))
{
uint32_t linkageCount = backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4;
@ -1818,8 +1818,8 @@ void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC,
TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
// scan remaining valid prims and bin each separately
uint32_t primIndex;
while (_BitScanForward((DWORD*)&primIndex, primMask))
unsigned long primIndex;
while (_BitScanForward(&primIndex, primMask))
{
uint32_t linkageCount = state.backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4;

View File

@ -409,7 +409,7 @@ public:
pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
DWORD index;
unsigned long index;
while (_BitScanForward(&index, cullMask))
{
cullMask &= ~(1 << index);
@ -881,7 +881,7 @@ private:
const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
const float* pSrc = reinterpret_cast<const float*>(&vSrc);
uint32_t mask = SIMD_T::movemask_ps(vMask);
DWORD lane;
unsigned long lane;
while (_BitScanForward(&lane, mask))
{
mask &= ~(1 << lane);

View File

@ -534,7 +534,7 @@ static void StreamOut(
for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
{
DWORD slot = 0;
unsigned long slot = 0;
uint64_t soMask = soState.streamMasks[streamIndex];
// Write all entries into primitive data buffer for SOS.

View File

@ -382,7 +382,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// overwrite texcoord for point sprites
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
DWORD texCoordAttrib = 0;
unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask))
{
@ -424,7 +424,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
if (isPointSpriteTexCoordEnabled)
{
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
DWORD texCoordAttrib = 0;
unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask))
{

View File

@ -1511,7 +1511,7 @@ void GetRenderHotTiles(DRAW_CONTEXT* pDC,
template <typename RT>
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
{
DWORD rt = 0;
unsigned long rt = 0;
while (_BitScanForward(&rt, colorHotTileMask))
{
colorHotTileMask &= ~(1 << rt);
@ -1527,7 +1527,7 @@ INLINE void StepRasterTileY(uint32_t colorHotTileMask,
RenderOutputBuffers& buffers,
RenderOutputBuffers& startBufferRow)
{
DWORD rt = 0;
unsigned long rt = 0;
while (_BitScanForward(&rt, colorHotTileMask))
{
colorHotTileMask &= ~(1 << rt);

View File

@ -39,7 +39,7 @@ extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256
SIMD256::store_ps(src, vSrc);
SIMD256::store_si((SIMD256::Integer*)indices, vIndices);
DWORD index;
unsigned long index;
while (_BitScanForward(&index, mask))
{
mask &= ~(1 << index);

View File

@ -91,7 +91,7 @@ struct StreamOutJit : public BuilderGfxMem
Value* PackMask(uint32_t bitmask)
{
std::vector<Constant*> indices(4, C(0));
DWORD index;
unsigned long index;
uint32_t elem = 0;
while (_BitScanForward(&index, bitmask))
{