swr: fix _BitScanForward64 on unix

it must apply to 64 bits types, and use the ctzll intrinsic instead of ctz

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Krzysztof Raszkowski <krzysztof.raszkowski@intel.com>
Reviewed-by: Jan Zielinski <jan.zielinski@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6705>
This commit is contained in:
Michel Zou 2020-09-15 21:08:06 +02:00 committed by Marge Bot
parent 82c49a66c0
commit 12b8ad8f21
15 changed files with 37 additions and 40 deletions

View File

@ -70,7 +70,7 @@ UINT pdep_u32(UINT a, UINT mask)
// copied from http://wm.ite.pl/articles/pdep-soft-emu.html // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
// using bsf instead of funky loop // using bsf instead of funky loop
DWORD maskIndex; unsigned long maskIndex = 0;
while (_BitScanForward(&maskIndex, mask)) while (_BitScanForward(&maskIndex, mask))
{ {
// 1. isolate lowest set bit of mask // 1. isolate lowest set bit of mask
@ -100,7 +100,7 @@ UINT pext_u32(UINT a, UINT mask)
return _pext_u32(a, mask); return _pext_u32(a, mask);
#else #else
UINT result = 0; UINT result = 0;
DWORD maskIndex; unsigned long maskIndex;
uint32_t currentBit = 0; uint32_t currentBit = 0;
while (_BitScanForward(&maskIndex, mask)) while (_BitScanForward(&maskIndex, mask))
{ {

View File

@ -224,33 +224,30 @@ static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
#endif #endif
#endif #endif
inline unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask) inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
*Index = __builtin_ctzll(Mask);
return (Mask != 0);
}
inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
{ {
*Index = __builtin_ctz(Mask); *Index = __builtin_ctz(Mask);
return (Mask != 0); return (Mask != 0);
} }
inline unsigned char _BitScanForward(unsigned int* Index, unsigned int Mask) inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{ {
*Index = __builtin_ctz(Mask); *Index = 63 - __builtin_clzll(Mask);
return (Mask != 0); return (Mask != 0);
} }
inline unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask) inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
{
*Index = 63 - __builtin_clz(Mask);
return (Mask != 0);
}
inline unsigned char _BitScanReverse(unsigned int* Index, unsigned int Mask)
{ {
*Index = 31 - __builtin_clz(Mask); *Index = 31 - __builtin_clz(Mask);
return (Mask != 0); return (Mask != 0);
} }
#define _BitScanForward64 _BitScanForward
#define _BitScanReverse64 _BitScanReverse
inline void* AlignedMalloc(size_t size, size_t alignment) inline void* AlignedMalloc(size_t size, size_t alignment)
{ {
void* ret; void* ret;

View File

@ -469,7 +469,7 @@ static SIMDINLINE Float SIMDCALL
uint32_t* pOffsets = (uint32_t*)&idx; uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old; Float vResult = old;
float* pResult = (float*)&vResult; float* pResult = (float*)&vResult;
DWORD index; unsigned long index;
uint32_t umask = movemask_ps(mask); uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask)) while (_BitScanForward(&index, umask))
{ {

View File

@ -635,7 +635,7 @@ static SIMDINLINE Float SIMDCALL
uint32_t* pOffsets = (uint32_t*)&idx; uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old; Float vResult = old;
float* pResult = (float*)&vResult; float* pResult = (float*)&vResult;
DWORD index; unsigned long index = 0;
uint32_t umask = movemask_ps(mask); uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask)) while (_BitScanForward(&index, umask))
{ {

View File

@ -987,7 +987,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
streamMasks |= pState->state.soState.streamMasks[i]; streamMasks |= pState->state.soState.streamMasks[i];
} }
DWORD maxAttrib; unsigned long maxAttrib;
if (_BitScanReverse64(&maxAttrib, streamMasks)) if (_BitScanReverse64(&maxAttrib, streamMasks))
{ {
pState->state.feNumAttributes = pState->state.feNumAttributes =
@ -1027,7 +1027,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
// Disable hottile for surfaces with no writes // Disable hottile for surfaces with no writes
if (psState.pfnPixelShader != nullptr) if (psState.pfnPixelShader != nullptr)
{ {
DWORD rt; unsigned long rt;
uint32_t rtMask = pState->state.psState.renderTargetMask; uint32_t rtMask = pState->state.psState.renderTargetMask;
while (_BitScanForward(&rt, rtMask)) while (_BitScanForward(&rt, rtMask))
{ {

View File

@ -609,7 +609,7 @@ inline void SetupRenderBuffers(uint8_t* pColorBuffer[SWR_NUM_RENDERT
uint32_t colorHotTileMask, uint32_t colorHotTileMask,
RenderOutputBuffers& renderBuffers) RenderOutputBuffers& renderBuffers)
{ {
DWORD index; unsigned long index;
while (_BitScanForward(&index, colorHotTileMask)) while (_BitScanForward(&index, colorHotTileMask))
{ {
assert(index < SWR_NUM_RENDERTARGETS); assert(index < SWR_NUM_RENDERTARGETS);
@ -937,7 +937,7 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
simdvector blendSrc; simdvector blendSrc;
simdvector blendOut; simdvector blendOut;
DWORD rt; unsigned long rt;
while (_BitScanForward(&rt, renderTargetMask)) while (_BitScanForward(&rt, renderTargetMask))
{ {
renderTargetMask &= ~(1 << rt); renderTargetMask &= ~(1 << rt);
@ -1250,7 +1250,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
if (useAlternateOffset) if (useAlternateOffset)
{ {
DWORD rt; unsigned long rt;
uint32_t rtMask = state.colorHottileEnable; uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask)) while (_BitScanForward(&rt, rtMask))
{ {

View File

@ -302,7 +302,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
if (useAlternateOffset) if (useAlternateOffset)
{ {
DWORD rt; unsigned long rt;
uint32_t rtMask = state.colorHottileEnable; uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask)) while (_BitScanForward(&rt, rtMask))
{ {

View File

@ -285,7 +285,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
if (useAlternateOffset) if (useAlternateOffset)
{ {
DWORD rt; unsigned long rt;
uint32_t rtMask = state.colorHottileEnable; uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask)) while (_BitScanForward(&rt, rtMask))
{ {

View File

@ -179,7 +179,7 @@ INLINE void ProcessAttributes(
uint32_t mask = backendState.swizzleMap[i].componentOverrideMask; uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
if (mask) if (mask)
{ {
DWORD comp; unsigned long comp;
while (_BitScanForward(&comp, mask)) while (_BitScanForward(&comp, mask))
{ {
mask &= ~(1 << comp); mask &= ~(1 << comp);
@ -245,7 +245,7 @@ void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
float* pRecipW, float* pRecipW,
float* pUserClipBuffer) float* pUserClipBuffer)
{ {
DWORD clipDist; unsigned long clipDist;
uint32_t clipDistMask = state.clipDistanceMask; uint32_t clipDistMask = state.clipDistanceMask;
while (_BitScanForward(&clipDist, clipDistMask)) while (_BitScanForward(&clipDist, clipDistMask))
{ {
@ -1122,7 +1122,7 @@ endBinTriangles:
TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2); TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
// scan remaining valid triangles and bin each separately // scan remaining valid triangles and bin each separately
while (_BitScanForward((DWORD*)&triIndex, triMask)) while (_BitScanForward((unsigned long*)&triIndex, triMask))
{ {
uint32_t linkageCount = state.backendState.numAttributes; uint32_t linkageCount = state.backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4; uint32_t numScalarAttribs = linkageCount * 4;
@ -1363,7 +1363,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
// scan remaining valid triangles and bin each separately // scan remaining valid triangles and bin each separately
while (_BitScanForward((DWORD*)&primIndex, primMask)) while (_BitScanForward((unsigned long*)&primIndex, primMask))
{ {
uint32_t linkageCount = backendState.numAttributes; uint32_t linkageCount = backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4; uint32_t numScalarAttribs = linkageCount * 4;
@ -1519,7 +1519,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
// scan remaining valid prims and bin each separately // scan remaining valid prims and bin each separately
const SWR_BACKEND_STATE& backendState = state.backendState; const SWR_BACKEND_STATE& backendState = state.backendState;
uint32_t primIndex; uint32_t primIndex;
while (_BitScanForward((DWORD*)&primIndex, primMask)) while (_BitScanForward((unsigned long*)&primIndex, primMask))
{ {
uint32_t linkageCount = backendState.numAttributes; uint32_t linkageCount = backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4; uint32_t numScalarAttribs = linkageCount * 4;
@ -1818,8 +1818,8 @@ void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC,
TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps()); TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
// scan remaining valid prims and bin each separately // scan remaining valid prims and bin each separately
uint32_t primIndex; unsigned long primIndex;
while (_BitScanForward((DWORD*)&primIndex, primMask)) while (_BitScanForward(&primIndex, primMask))
{ {
uint32_t linkageCount = state.backendState.numAttributes; uint32_t linkageCount = state.backendState.numAttributes;
uint32_t numScalarAttribs = linkageCount * 4; uint32_t numScalarAttribs = linkageCount * 4;

View File

@ -409,7 +409,7 @@ public:
pa.Assemble(vertexClipCullOffset, vClipCullDistLo); pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi); pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
DWORD index; unsigned long index;
while (_BitScanForward(&index, cullMask)) while (_BitScanForward(&index, cullMask))
{ {
cullMask &= ~(1 << index); cullMask &= ~(1 << index);
@ -881,7 +881,7 @@ private:
const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets); const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
const float* pSrc = reinterpret_cast<const float*>(&vSrc); const float* pSrc = reinterpret_cast<const float*>(&vSrc);
uint32_t mask = SIMD_T::movemask_ps(vMask); uint32_t mask = SIMD_T::movemask_ps(vMask);
DWORD lane; unsigned long lane;
while (_BitScanForward(&lane, mask)) while (_BitScanForward(&lane, mask))
{ {
mask &= ~(1 << lane); mask &= ~(1 << lane);

View File

@ -534,7 +534,7 @@ static void StreamOut(
for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
{ {
DWORD slot = 0; unsigned long slot = 0;
uint64_t soMask = soState.streamMasks[streamIndex]; uint64_t soMask = soState.streamMasks[streamIndex];
// Write all entries into primitive data buffer for SOS. // Write all entries into primitive data buffer for SOS.

View File

@ -382,7 +382,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// overwrite texcoord for point sprites // overwrite texcoord for point sprites
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
DWORD texCoordAttrib = 0; unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask)) while (_BitScanForward(&texCoordAttrib, texCoordMask))
{ {
@ -424,7 +424,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
if (isPointSpriteTexCoordEnabled) if (isPointSpriteTexCoordEnabled)
{ {
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
DWORD texCoordAttrib = 0; unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask)) while (_BitScanForward(&texCoordAttrib, texCoordMask))
{ {

View File

@ -1511,7 +1511,7 @@ void GetRenderHotTiles(DRAW_CONTEXT* pDC,
template <typename RT> template <typename RT>
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers) INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
{ {
DWORD rt = 0; unsigned long rt = 0;
while (_BitScanForward(&rt, colorHotTileMask)) while (_BitScanForward(&rt, colorHotTileMask))
{ {
colorHotTileMask &= ~(1 << rt); colorHotTileMask &= ~(1 << rt);
@ -1527,7 +1527,7 @@ INLINE void StepRasterTileY(uint32_t colorHotTileMask,
RenderOutputBuffers& buffers, RenderOutputBuffers& buffers,
RenderOutputBuffers& startBufferRow) RenderOutputBuffers& startBufferRow)
{ {
DWORD rt = 0; unsigned long rt = 0;
while (_BitScanForward(&rt, colorHotTileMask)) while (_BitScanForward(&rt, colorHotTileMask))
{ {
colorHotTileMask &= ~(1 << rt); colorHotTileMask &= ~(1 << rt);

View File

@ -39,7 +39,7 @@ extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256
SIMD256::store_ps(src, vSrc); SIMD256::store_ps(src, vSrc);
SIMD256::store_si((SIMD256::Integer*)indices, vIndices); SIMD256::store_si((SIMD256::Integer*)indices, vIndices);
DWORD index; unsigned long index;
while (_BitScanForward(&index, mask)) while (_BitScanForward(&index, mask))
{ {
mask &= ~(1 << index); mask &= ~(1 << index);

View File

@ -91,7 +91,7 @@ struct StreamOutJit : public BuilderGfxMem
Value* PackMask(uint32_t bitmask) Value* PackMask(uint32_t bitmask)
{ {
std::vector<Constant*> indices(4, C(0)); std::vector<Constant*> indices(4, C(0));
DWORD index; unsigned long index;
uint32_t elem = 0; uint32_t elem = 0;
while (_BitScanForward(&index, bitmask)) while (_BitScanForward(&index, bitmask))
{ {