swr: fix _BitScanForward64 on unix
it must apply to 64 bits types, and use the ctzll intrinsic instead of ctz Reviewed-by: Jose Fonseca <jfonseca@vmware.com> Reviewed-by: Krzysztof Raszkowski <krzysztof.raszkowski@intel.com> Reviewed-by: Jan Zielinski <jan.zielinski@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6705>
This commit is contained in:
parent
82c49a66c0
commit
12b8ad8f21
|
@ -70,7 +70,7 @@ UINT pdep_u32(UINT a, UINT mask)
|
|||
|
||||
// copied from http://wm.ite.pl/articles/pdep-soft-emu.html
|
||||
// using bsf instead of funky loop
|
||||
DWORD maskIndex;
|
||||
unsigned long maskIndex = 0;
|
||||
while (_BitScanForward(&maskIndex, mask))
|
||||
{
|
||||
// 1. isolate lowest set bit of mask
|
||||
|
@ -100,7 +100,7 @@ UINT pext_u32(UINT a, UINT mask)
|
|||
return _pext_u32(a, mask);
|
||||
#else
|
||||
UINT result = 0;
|
||||
DWORD maskIndex;
|
||||
unsigned long maskIndex;
|
||||
uint32_t currentBit = 0;
|
||||
while (_BitScanForward(&maskIndex, mask))
|
||||
{
|
||||
|
|
|
@ -224,33 +224,30 @@ static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
|
|||
#endif
|
||||
#endif
|
||||
|
||||
inline unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask)
|
||||
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
|
||||
{
|
||||
*Index = __builtin_ctzll(Mask);
|
||||
return (Mask != 0);
|
||||
}
|
||||
|
||||
inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
|
||||
{
|
||||
*Index = __builtin_ctz(Mask);
|
||||
return (Mask != 0);
|
||||
}
|
||||
|
||||
inline unsigned char _BitScanForward(unsigned int* Index, unsigned int Mask)
|
||||
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
|
||||
{
|
||||
*Index = __builtin_ctz(Mask);
|
||||
*Index = 63 - __builtin_clzll(Mask);
|
||||
return (Mask != 0);
|
||||
}
|
||||
|
||||
inline unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask)
|
||||
{
|
||||
*Index = 63 - __builtin_clz(Mask);
|
||||
return (Mask != 0);
|
||||
}
|
||||
|
||||
inline unsigned char _BitScanReverse(unsigned int* Index, unsigned int Mask)
|
||||
inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
|
||||
{
|
||||
*Index = 31 - __builtin_clz(Mask);
|
||||
return (Mask != 0);
|
||||
}
|
||||
|
||||
#define _BitScanForward64 _BitScanForward
|
||||
#define _BitScanReverse64 _BitScanReverse
|
||||
|
||||
inline void* AlignedMalloc(size_t size, size_t alignment)
|
||||
{
|
||||
void* ret;
|
||||
|
|
|
@ -469,7 +469,7 @@ static SIMDINLINE Float SIMDCALL
|
|||
uint32_t* pOffsets = (uint32_t*)&idx;
|
||||
Float vResult = old;
|
||||
float* pResult = (float*)&vResult;
|
||||
DWORD index;
|
||||
unsigned long index;
|
||||
uint32_t umask = movemask_ps(mask);
|
||||
while (_BitScanForward(&index, umask))
|
||||
{
|
||||
|
|
|
@ -635,7 +635,7 @@ static SIMDINLINE Float SIMDCALL
|
|||
uint32_t* pOffsets = (uint32_t*)&idx;
|
||||
Float vResult = old;
|
||||
float* pResult = (float*)&vResult;
|
||||
DWORD index;
|
||||
unsigned long index = 0;
|
||||
uint32_t umask = movemask_ps(mask);
|
||||
while (_BitScanForward(&index, umask))
|
||||
{
|
||||
|
|
|
@ -987,7 +987,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
|
|||
streamMasks |= pState->state.soState.streamMasks[i];
|
||||
}
|
||||
|
||||
DWORD maxAttrib;
|
||||
unsigned long maxAttrib;
|
||||
if (_BitScanReverse64(&maxAttrib, streamMasks))
|
||||
{
|
||||
pState->state.feNumAttributes =
|
||||
|
@ -1027,7 +1027,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
|
|||
// Disable hottile for surfaces with no writes
|
||||
if (psState.pfnPixelShader != nullptr)
|
||||
{
|
||||
DWORD rt;
|
||||
unsigned long rt;
|
||||
uint32_t rtMask = pState->state.psState.renderTargetMask;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
|
|
|
@ -609,7 +609,7 @@ inline void SetupRenderBuffers(uint8_t* pColorBuffer[SWR_NUM_RENDERT
|
|||
uint32_t colorHotTileMask,
|
||||
RenderOutputBuffers& renderBuffers)
|
||||
{
|
||||
DWORD index;
|
||||
unsigned long index;
|
||||
while (_BitScanForward(&index, colorHotTileMask))
|
||||
{
|
||||
assert(index < SWR_NUM_RENDERTARGETS);
|
||||
|
@ -937,7 +937,7 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
|
|||
simdvector blendSrc;
|
||||
simdvector blendOut;
|
||||
|
||||
DWORD rt;
|
||||
unsigned long rt;
|
||||
while (_BitScanForward(&rt, renderTargetMask))
|
||||
{
|
||||
renderTargetMask &= ~(1 << rt);
|
||||
|
@ -1250,7 +1250,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
|
|||
|
||||
if (useAlternateOffset)
|
||||
{
|
||||
DWORD rt;
|
||||
unsigned long rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
|
|
|
@ -302,7 +302,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
|
|||
|
||||
if (useAlternateOffset)
|
||||
{
|
||||
DWORD rt;
|
||||
unsigned long rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
|
|
|
@ -285,7 +285,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
|
|||
|
||||
if (useAlternateOffset)
|
||||
{
|
||||
DWORD rt;
|
||||
unsigned long rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
|
|
|
@ -179,7 +179,7 @@ INLINE void ProcessAttributes(
|
|||
uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
|
||||
if (mask)
|
||||
{
|
||||
DWORD comp;
|
||||
unsigned long comp;
|
||||
while (_BitScanForward(&comp, mask))
|
||||
{
|
||||
mask &= ~(1 << comp);
|
||||
|
@ -245,7 +245,7 @@ void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
|
|||
float* pRecipW,
|
||||
float* pUserClipBuffer)
|
||||
{
|
||||
DWORD clipDist;
|
||||
unsigned long clipDist;
|
||||
uint32_t clipDistMask = state.clipDistanceMask;
|
||||
while (_BitScanForward(&clipDist, clipDistMask))
|
||||
{
|
||||
|
@ -1122,7 +1122,7 @@ endBinTriangles:
|
|||
TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
|
||||
|
||||
// scan remaining valid triangles and bin each separately
|
||||
while (_BitScanForward((DWORD*)&triIndex, triMask))
|
||||
while (_BitScanForward((unsigned long*)&triIndex, triMask))
|
||||
{
|
||||
uint32_t linkageCount = state.backendState.numAttributes;
|
||||
uint32_t numScalarAttribs = linkageCount * 4;
|
||||
|
@ -1363,7 +1363,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
|
|||
const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
|
||||
|
||||
// scan remaining valid triangles and bin each separately
|
||||
while (_BitScanForward((DWORD*)&primIndex, primMask))
|
||||
while (_BitScanForward((unsigned long*)&primIndex, primMask))
|
||||
{
|
||||
uint32_t linkageCount = backendState.numAttributes;
|
||||
uint32_t numScalarAttribs = linkageCount * 4;
|
||||
|
@ -1519,7 +1519,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT* pDC,
|
|||
// scan remaining valid prims and bin each separately
|
||||
const SWR_BACKEND_STATE& backendState = state.backendState;
|
||||
uint32_t primIndex;
|
||||
while (_BitScanForward((DWORD*)&primIndex, primMask))
|
||||
while (_BitScanForward((unsigned long*)&primIndex, primMask))
|
||||
{
|
||||
uint32_t linkageCount = backendState.numAttributes;
|
||||
uint32_t numScalarAttribs = linkageCount * 4;
|
||||
|
@ -1818,8 +1818,8 @@ void BinPostSetupLinesImpl(DRAW_CONTEXT* pDC,
|
|||
TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
|
||||
|
||||
// scan remaining valid prims and bin each separately
|
||||
uint32_t primIndex;
|
||||
while (_BitScanForward((DWORD*)&primIndex, primMask))
|
||||
unsigned long primIndex;
|
||||
while (_BitScanForward(&primIndex, primMask))
|
||||
{
|
||||
uint32_t linkageCount = state.backendState.numAttributes;
|
||||
uint32_t numScalarAttribs = linkageCount * 4;
|
||||
|
|
|
@ -409,7 +409,7 @@ public:
|
|||
pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
|
||||
pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
|
||||
|
||||
DWORD index;
|
||||
unsigned long index;
|
||||
while (_BitScanForward(&index, cullMask))
|
||||
{
|
||||
cullMask &= ~(1 << index);
|
||||
|
@ -881,7 +881,7 @@ private:
|
|||
const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
|
||||
const float* pSrc = reinterpret_cast<const float*>(&vSrc);
|
||||
uint32_t mask = SIMD_T::movemask_ps(vMask);
|
||||
DWORD lane;
|
||||
unsigned long lane;
|
||||
while (_BitScanForward(&lane, mask))
|
||||
{
|
||||
mask &= ~(1 << lane);
|
||||
|
|
|
@ -534,7 +534,7 @@ static void StreamOut(
|
|||
|
||||
for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
|
||||
{
|
||||
DWORD slot = 0;
|
||||
unsigned long slot = 0;
|
||||
uint64_t soMask = soState.streamMasks[streamIndex];
|
||||
|
||||
// Write all entries into primitive data buffer for SOS.
|
||||
|
|
|
@ -382,7 +382,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
|
|||
|
||||
// overwrite texcoord for point sprites
|
||||
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
|
||||
DWORD texCoordAttrib = 0;
|
||||
unsigned long texCoordAttrib = 0;
|
||||
|
||||
while (_BitScanForward(&texCoordAttrib, texCoordMask))
|
||||
{
|
||||
|
@ -424,7 +424,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
|
|||
if (isPointSpriteTexCoordEnabled)
|
||||
{
|
||||
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
|
||||
DWORD texCoordAttrib = 0;
|
||||
unsigned long texCoordAttrib = 0;
|
||||
|
||||
while (_BitScanForward(&texCoordAttrib, texCoordMask))
|
||||
{
|
||||
|
|
|
@ -1511,7 +1511,7 @@ void GetRenderHotTiles(DRAW_CONTEXT* pDC,
|
|||
template <typename RT>
|
||||
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
|
||||
{
|
||||
DWORD rt = 0;
|
||||
unsigned long rt = 0;
|
||||
while (_BitScanForward(&rt, colorHotTileMask))
|
||||
{
|
||||
colorHotTileMask &= ~(1 << rt);
|
||||
|
@ -1527,7 +1527,7 @@ INLINE void StepRasterTileY(uint32_t colorHotTileMask,
|
|||
RenderOutputBuffers& buffers,
|
||||
RenderOutputBuffers& startBufferRow)
|
||||
{
|
||||
DWORD rt = 0;
|
||||
unsigned long rt = 0;
|
||||
while (_BitScanForward(&rt, colorHotTileMask))
|
||||
{
|
||||
colorHotTileMask &= ~(1 << rt);
|
||||
|
|
|
@ -39,7 +39,7 @@ extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256
|
|||
SIMD256::store_ps(src, vSrc);
|
||||
SIMD256::store_si((SIMD256::Integer*)indices, vIndices);
|
||||
|
||||
DWORD index;
|
||||
unsigned long index;
|
||||
while (_BitScanForward(&index, mask))
|
||||
{
|
||||
mask &= ~(1 << index);
|
||||
|
|
|
@ -91,7 +91,7 @@ struct StreamOutJit : public BuilderGfxMem
|
|||
Value* PackMask(uint32_t bitmask)
|
||||
{
|
||||
std::vector<Constant*> indices(4, C(0));
|
||||
DWORD index;
|
||||
unsigned long index;
|
||||
uint32_t elem = 0;
|
||||
while (_BitScanForward(&index, bitmask))
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue