swr/rast: WIP - Widen fetch shader to SIMD16
Widen vertex gather/storage to SIMD16 for all component types. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
6d5275498a
commit
36e276b6b0
|
@ -70,6 +70,9 @@ struct FetchJit : public Builder
|
|||
#else
|
||||
void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
|
||||
#endif
|
||||
#if USE_SIMD16_BUILDER
|
||||
void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
|
||||
#endif
|
||||
|
||||
typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
|
||||
uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
|
||||
|
@ -78,6 +81,9 @@ struct FetchJit : public Builder
|
|||
#else
|
||||
void Shuffle16bpcGather(Shuffle16bpcArgs &args);
|
||||
#endif
|
||||
#if USE_SIMD16_BUILDER
|
||||
void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
|
||||
#endif
|
||||
|
||||
void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
|
||||
#if USE_SIMD16_BUILDER
|
||||
|
@ -726,7 +732,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB
|
|||
// only works if pixel size is <= 32bits
|
||||
SWR_ASSERT(info.bpp <= 32);
|
||||
|
||||
Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
|
||||
Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
|
||||
|
||||
for (uint32_t comp = 0; comp < 4; ++comp)
|
||||
{
|
||||
|
@ -825,6 +831,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
Value* vVertexElements[4];
|
||||
#if USE_SIMD16_GATHERS
|
||||
Value* vVertexElements2[4];
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *pVtxSrc2[4];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
|
||||
|
@ -961,6 +970,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
#if USE_SIMD16_GATHERS
|
||||
// override cur indices with 0 if pitch is 0
|
||||
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
|
||||
vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
|
||||
vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
|
||||
|
||||
// are vertices partially OOB?
|
||||
|
@ -983,7 +993,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
|
||||
// only fetch lanes that pass both tests
|
||||
vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
|
||||
vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2);
|
||||
vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1074,15 +1084,32 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
{
|
||||
if (isComponentEnabled(compMask, c))
|
||||
{
|
||||
vVertexElements[currentVertexElement] = pResults[c];
|
||||
#if USE_SIMD16_BUILDER
|
||||
// pack adjacent pairs of SIMD8s into SIMD16s
|
||||
pVtxSrc2[currentVertexElement] = VUNDEF2_F();
|
||||
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0);
|
||||
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
|
||||
|
||||
#else
|
||||
vVertexElements[currentVertexElement] = pResults[c];
|
||||
vVertexElements2[currentVertexElement] = pResults2[c];
|
||||
currentVertexElement++;
|
||||
|
||||
#endif
|
||||
currentVertexElement += 1;
|
||||
|
||||
if (currentVertexElement > 3)
|
||||
{
|
||||
#if USE_SIMD16_BUILDER
|
||||
// store SIMD16s
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
|
||||
|
||||
#else
|
||||
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
|
||||
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
|
||||
|
||||
#endif
|
||||
outputElt += 1;
|
||||
|
||||
// reset to the next vVertexElement to output
|
||||
|
@ -1113,9 +1140,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
else if(info.type[0] == SWR_TYPE_FLOAT)
|
||||
{
|
||||
///@todo: support 64 bit vb accesses
|
||||
Value* gatherSrc = VIMMED1(0.0f);
|
||||
Value *gatherSrc = VIMMED1(0.0f);
|
||||
#if USE_SIMD16_GATHERS
|
||||
Value* gatherSrc2 = VIMMED1(0.0f);
|
||||
Value *gatherSrc2 = VIMMED1(0.0f);
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherSrc16 = VIMMED2_1(0.0f);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
|
||||
|
@ -1127,8 +1157,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
case 16:
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
Value* vGatherResult[2];
|
||||
Value* vGatherResult2[2];
|
||||
Value *vGatherResult[2];
|
||||
Value *vGatherResult2[2];
|
||||
|
||||
// if we have at least one component out of x or y to fetch
|
||||
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
|
||||
|
@ -1140,6 +1170,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
|
||||
//
|
||||
}
|
||||
else
|
||||
{
|
||||
vGatherResult[0] = VUNDEF_I();
|
||||
vGatherResult2[0] = VUNDEF_I();
|
||||
}
|
||||
|
||||
// if we have at least one component out of z or w to fetch
|
||||
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
|
||||
|
@ -1154,11 +1189,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
|
||||
//
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
vGatherResult[1] = VUNDEF_I();
|
||||
vGatherResult2[1] = VUNDEF_I();
|
||||
}
|
||||
|
||||
// if we have at least one component to shuffle into place
|
||||
if (compMask)
|
||||
{
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherResult[2];
|
||||
|
||||
gatherResult[0] = VUNDEF2_I();
|
||||
gatherResult[1] = VUNDEF2_I();
|
||||
|
||||
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
|
||||
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
|
||||
|
||||
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
|
||||
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
|
||||
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
|
||||
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
|
||||
|
||||
// Shuffle gathered components into place in simdvertex struct
|
||||
Shuffle16bpcGather2(args); // outputs to vVertexElements ref
|
||||
#else
|
||||
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
|
||||
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
|
||||
Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
|
||||
|
@ -1167,6 +1226,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// Shuffle gathered components into place in simdvertex struct
|
||||
Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
|
||||
Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
Value* vGatherResult[2];
|
||||
|
@ -1209,12 +1269,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
break;
|
||||
case 32:
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *pVtxSrc2[4];
|
||||
|
||||
#endif
|
||||
#endif
|
||||
for (uint32_t i = 0; i < 4; i += 1)
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
|
@ -1231,10 +1285,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
|
||||
Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *src = VUNDEF2_F();
|
||||
src = INSERT2_F(src, gatherSrc, 0);
|
||||
src = INSERT2_F(src, gatherSrc2, 1);
|
||||
|
||||
Value *indices = VUNDEF2_I();
|
||||
indices = INSERT2_I(indices, vShiftedOffsets, 0);
|
||||
indices = INSERT2_I(indices, vShiftedOffsets2, 1);
|
||||
|
@ -1243,12 +1293,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
mask = INSERT2_I(mask, vGatherMask, 0);
|
||||
mask = INSERT2_I(mask, vGatherMask2, 1);
|
||||
|
||||
pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2);
|
||||
#if 1
|
||||
|
||||
vVertexElements[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0);
|
||||
vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1);
|
||||
#endif
|
||||
pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
|
||||
#else
|
||||
vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
|
||||
vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
|
||||
|
@ -1384,24 +1429,45 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
|
||||
Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
|
||||
|
||||
vVertexElements[currentVertexElement] = pGather;
|
||||
#if USE_SIMD16_BUILDER
|
||||
// pack adjacent pairs of SIMD8s into SIMD16s
|
||||
pVtxSrc2[currentVertexElement] = VUNDEF2_F();
|
||||
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
|
||||
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
|
||||
|
||||
#else
|
||||
vVertexElements[currentVertexElement] = pGather;
|
||||
vVertexElements2[currentVertexElement] = pGather2;
|
||||
|
||||
#endif
|
||||
currentVertexElement += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
#if USE_SIMD16_BUILDER
|
||||
pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
|
||||
|
||||
#else
|
||||
vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
|
||||
vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
|
||||
|
||||
#endif
|
||||
currentVertexElement += 1;
|
||||
}
|
||||
|
||||
if (currentVertexElement > 3)
|
||||
{
|
||||
#if USE_SIMD16_BUILDER
|
||||
// store SIMD16s
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
|
||||
|
||||
#else
|
||||
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
|
||||
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
|
||||
|
||||
#endif
|
||||
outputElt += 1;
|
||||
|
||||
// reset to the next vVertexElement to output
|
||||
|
@ -1522,10 +1588,25 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
#if USE_SIMD16_GATHERS
|
||||
Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
|
||||
Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
|
||||
|
||||
// e.g. result of an 8x32bit integer gather for 8bit components
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherResult = VUNDEF2_I();
|
||||
|
||||
gatherResult = INSERT2_I(gatherResult, vGatherResult, 0);
|
||||
gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
|
||||
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
|
||||
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
|
||||
|
||||
// Shuffle gathered components into place in simdvertex struct
|
||||
Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref
|
||||
#else
|
||||
Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
|
||||
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
|
||||
Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
|
||||
|
@ -1534,6 +1615,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// Shuffle gathered components into place in simdvertex struct
|
||||
Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
|
||||
Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
|
||||
#endif
|
||||
#else
|
||||
Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
|
||||
// e.g. result of an 8x32bit integer gather for 8bit components
|
||||
|
@ -1569,6 +1651,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
|
||||
//
|
||||
}
|
||||
else
|
||||
{
|
||||
vGatherResult[0] = VUNDEF_I();
|
||||
vGatherResult2[0] = VUNDEF_I();
|
||||
}
|
||||
|
||||
// if we have at least one component out of z or w to fetch
|
||||
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
|
||||
|
@ -1583,10 +1670,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
|
||||
//
|
||||
}
|
||||
else
|
||||
{
|
||||
vGatherResult[1] = VUNDEF_I();
|
||||
vGatherResult2[1] = VUNDEF_I();
|
||||
}
|
||||
|
||||
// if we have at least one component to shuffle into place
|
||||
if (compMask)
|
||||
{
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherResult[2];
|
||||
|
||||
gatherResult[0] = VUNDEF2_I();
|
||||
gatherResult[1] = VUNDEF2_I();
|
||||
|
||||
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
|
||||
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
|
||||
|
||||
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
|
||||
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
|
||||
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
|
||||
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
|
||||
|
||||
// Shuffle gathered components into place in simdvertex struct
|
||||
Shuffle16bpcGather2(args); // outputs to vVertexElements ref
|
||||
#else
|
||||
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
|
||||
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
|
||||
Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
|
||||
|
@ -1595,6 +1707,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// Shuffle gathered components into place in simdvertex struct
|
||||
Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
|
||||
Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
Value* vGatherResult[2];
|
||||
|
@ -1665,8 +1778,18 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
|
||||
}
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
// pack adjacent pairs of SIMD8s into SIMD16s
|
||||
pVtxSrc2[currentVertexElement] = VUNDEF2_F();
|
||||
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
|
||||
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
|
||||
|
||||
#else
|
||||
vVertexElements[currentVertexElement] = pGather;
|
||||
vVertexElements2[currentVertexElement] = pGather2;
|
||||
|
||||
#endif
|
||||
|
||||
// e.g. result of a single 8x32bit integer gather for 32bit components
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
|
||||
|
@ -1698,9 +1821,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
{
|
||||
#if USE_SIMD16_SHADERS
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
|
||||
|
||||
#else
|
||||
vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
|
||||
vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
|
||||
|
||||
#endif
|
||||
currentVertexElement += 1;
|
||||
#else
|
||||
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
|
||||
|
@ -1713,9 +1841,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
if (currentVertexElement > 3)
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
// store SIMD16s
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
|
||||
|
||||
#else
|
||||
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
|
||||
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
|
||||
|
||||
#endif
|
||||
outputElt += 1;
|
||||
#else
|
||||
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
|
||||
|
@ -1740,9 +1876,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
if (currentVertexElement > 0)
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
// store SIMD16s
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
|
||||
|
||||
#else
|
||||
StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
|
||||
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
|
||||
|
||||
#endif
|
||||
outputElt += 1;
|
||||
#else
|
||||
StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
|
||||
|
@ -2092,6 +2236,251 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
|
|||
}
|
||||
}
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 variant of Shuffle8bpcGatherd. Takes a SIMD16 of gathered
/// 8bpc verts, sign or zero extends each enabled component, denormalizes
/// if requested, and accumulates the results in vVertexElements, storing
/// them through StoreVertexElements2 each time four elements are pending.
/// @param args: (tuple of args, listed below)
///   @param vGatherResult - SIMD16 gather result (packed 8-bit components)
///   @param pVtxOut - output pointer handed to StoreVertexElements2
///   @param extendType - sign extend or zero extend
///   @param conversionType - conversion to apply after extension
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current output element index
///   @param compMask - component enable mask
///   @param compCtrl - component control values
///   @param vVertexElements[4] - SIMD16 vertex components to output
///   @param swizzle[4] - component swizzle location
void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
{
    // Unpack tuple args
    Value*& vGatherResult = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t &currentVertexElement = std::get<4>(args);
    uint32_t &outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);
    const uint32_t(&swizzle)[4] = std::get<9>(args);

    // cast types (all SIMD8 wide: the SIMD16 input is processed as two halves)
    Type *vGatherTy = mSimdInt32Ty;
    Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
    {
        Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // (mVWidth * 2) x 8bit ints, i.e. one 128bit lane - presumably mVWidth == 8
        Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask, including any swizzling
        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
        Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
                                      char(y), char(y + 4), char(y + 8), char(y + 12),
                                      char(z), char(z + 4), char(z + 8), char(z + 12),
                                      char(w), char(w + 4), char(w + 8), char(w + 12),
                                      char(x), char(x + 4), char(x + 8), char(x + 12),
                                      char(y), char(y + 4), char(y + 8), char(y + 12),
                                      char(z), char(z + 4), char(z + 8), char(z + 12),
                                      char(w), char(w + 4), char(w + 8), char(w + 12) });

        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..

        Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
        Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);

        Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
        Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        // only built when x or y is enabled; left null otherwise
        Value *vi128XY_lo = nullptr;
        Value *vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);

            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value *vi128ZW_lo = nullptr;
        Value *vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
        }

        // init denormalize variables if needed
        // fpCast gets a defined value up front: the CONVERT_USCALED mismatch path below
        // never assigns it, yet still reaches the CAST() in the component loop.
        Instruction::CastOps fpCast = Instruction::CastOps::SIToFP;
        Value *conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 127.0));
            break;
        case CONVERT_SSCALED:
            fpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    // sign extend
                    Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
                    Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
                    }

                    // pack the two SIMD8 halves back into one SIMD16 element
                    vVertexElements[currentVertexElement] = VUNDEF2_F();
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
    {
        // init denormalize variables if needed
        // fpCast gets a defined value up front: the CONVERT_SSCALED mismatch path below
        // never assigns it, yet still reaches the CAST() in the component loop.
        Instruction::CastOps fpCast = Instruction::CastOps::UIToFP;
        Value *conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 255.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // pshufb masks for each component
                    Value *vConstMask;
                    switch (swizzle[i])
                    {
                    case 0:
                        // x shuffle mask
                        vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                        break;
                    default:
                        // NOTE(review): a swizzle outside 0..3 leaves the mask null and it is
                        // still passed to PSHUFB below - confirm callers never produce one.
                        vConstMask = nullptr;
                        break;
                    }

                    // SIMD16 PSHUFB is split into two SIMD8 halves, as in the sign extend path
                    Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
                    Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);

                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
                    }

                    // pack the two SIMD8 halves back into one SIMD16 element
                    vVertexElements[currentVertexElement] = VUNDEF2_F();
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
|
||||
/// denormalizes if needed, converts to F32 if needed, and positions in
|
||||
|
@ -2318,6 +2707,272 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
|
|||
}
|
||||
}
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
|
||||
{
|
||||
// Unpack tuple args
|
||||
Value* (&vGatherResult)[2] = std::get<0>(args);
|
||||
Value* pVtxOut = std::get<1>(args);
|
||||
const Instruction::CastOps extendType = std::get<2>(args);
|
||||
const ConversionType conversionType = std::get<3>(args);
|
||||
uint32_t &currentVertexElement = std::get<4>(args);
|
||||
uint32_t &outputElt = std::get<5>(args);
|
||||
const ComponentEnable compMask = std::get<6>(args);
|
||||
const ComponentControl(&compCtrl)[4] = std::get<7>(args);
|
||||
Value* (&vVertexElements)[4] = std::get<8>(args);
|
||||
|
||||
// cast types
|
||||
Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
|
||||
Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
|
||||
|
||||
// have to do extra work for sign extending
|
||||
if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
|
||||
{
|
||||
// is this PP float?
|
||||
bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
|
||||
|
||||
Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
|
||||
Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
|
||||
|
||||
// shuffle mask
|
||||
Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
|
||||
Value *vi128XY = nullptr;
|
||||
Value *vi128XY_lo = nullptr;
|
||||
Value *vi128XY_hi = nullptr;
|
||||
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
|
||||
{
|
||||
// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
|
||||
|
||||
Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
|
||||
Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);
|
||||
|
||||
Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
|
||||
Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
|
||||
|
||||
// after pshufb: group components together in each 128bit lane
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
|
||||
|
||||
vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
|
||||
vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
|
||||
|
||||
// after PERMD: move and pack xy components into each 128bit lane
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
|
||||
#if 0
|
||||
|
||||
vi128XY = VUNDEF2_I();
|
||||
vi128XY = INSERT2_I(vi128XY, vi128XY_lo, 0);
|
||||
vi128XY = INSERT2_I(vi128XY, vi128XY_hi, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
// do the same for zw components
|
||||
Value *vi128ZW = nullptr;
|
||||
Value *vi128ZW_lo = nullptr;
|
||||
Value *vi128ZW_hi = nullptr;
|
||||
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
|
||||
{
|
||||
Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
|
||||
Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);
|
||||
|
||||
Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
|
||||
Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
|
||||
|
||||
vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
|
||||
vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
|
||||
#if 0
|
||||
|
||||
vi128ZW = VUNDEF2_I();
|
||||
vi128ZW = INSERT2_I(vi128ZW, vi128ZW_lo, 0);
|
||||
vi128ZW = INSERT2_I(vi128ZW, vi128ZW_hi, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
// init denormalize variables if needed
|
||||
Instruction::CastOps IntToFpCast;
|
||||
Value *conversionFactor;
|
||||
|
||||
switch (conversionType)
|
||||
{
|
||||
case CONVERT_NORMALIZED:
|
||||
IntToFpCast = Instruction::CastOps::SIToFP;
|
||||
conversionFactor = VIMMED1((float)(1.0 / 32767.0));
|
||||
break;
|
||||
case CONVERT_SSCALED:
|
||||
IntToFpCast = Instruction::CastOps::SIToFP;
|
||||
conversionFactor = VIMMED1((float)(1.0));
|
||||
break;
|
||||
case CONVERT_USCALED:
|
||||
SWR_INVALID("Type should not be sign extended!");
|
||||
conversionFactor = nullptr;
|
||||
break;
|
||||
default:
|
||||
SWR_ASSERT(conversionType == CONVERT_NONE);
|
||||
conversionFactor = nullptr;
|
||||
break;
|
||||
}
|
||||
|
||||
// sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
|
||||
for (uint32_t i = 0; i < 4; i++)
|
||||
{
|
||||
if (isComponentEnabled(compMask, i))
|
||||
{
|
||||
if (compCtrl[i] == ComponentControl::StoreSrc)
|
||||
{
|
||||
// if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
|
||||
uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
|
||||
// if x or y, use vi128XY permute result, else use vi128ZW
|
||||
Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
|
||||
Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
|
||||
|
||||
if (bFP)
|
||||
{
|
||||
// extract 128 bit lanes and convert each half-float component to float (CVTPH2PS)
|
||||
Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
|
||||
Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
|
||||
|
||||
vVertexElements[currentVertexElement] = VUNDEF2_F();
|
||||
vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
|
||||
vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// extract 128 bit lanes to sign extend each component
|
||||
Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
|
||||
Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
|
||||
|
||||
// denormalize if needed
|
||||
if (conversionType != CONVERT_NONE)
|
||||
{
|
||||
temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
|
||||
temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
|
||||
}
|
||||
|
||||
vVertexElements[currentVertexElement] = VUNDEF2_F();
|
||||
vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
|
||||
vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
|
||||
}
|
||||
|
||||
currentVertexElement += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
|
||||
}
|
||||
|
||||
if (currentVertexElement > 3)
|
||||
{
|
||||
StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
|
||||
// reset to the next vVertexElement to output
|
||||
currentVertexElement = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// else zero extend
|
||||
else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
|
||||
{
|
||||
// pshufb masks for each component
|
||||
Value *vConstMask[2];
|
||||
|
||||
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
|
||||
{
|
||||
// x/z shuffle mask
|
||||
vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
|
||||
0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
|
||||
}
|
||||
|
||||
if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
|
||||
{
|
||||
// y/w shuffle mask
|
||||
vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
|
||||
2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
|
||||
}
|
||||
|
||||
// init denormalize variables if needed
|
||||
Instruction::CastOps fpCast;
|
||||
Value* conversionFactor;
|
||||
|
||||
switch (conversionType)
|
||||
{
|
||||
case CONVERT_NORMALIZED:
|
||||
fpCast = Instruction::CastOps::UIToFP;
|
||||
conversionFactor = VIMMED1((float)(1.0 / 65535.0));
|
||||
break;
|
||||
case CONVERT_USCALED:
|
||||
fpCast = Instruction::CastOps::UIToFP;
|
||||
conversionFactor = VIMMED1((float)(1.0f));
|
||||
break;
|
||||
case CONVERT_SSCALED:
|
||||
SWR_INVALID("Type should not be zero extended!");
|
||||
conversionFactor = nullptr;
|
||||
break;
|
||||
default:
|
||||
SWR_ASSERT(conversionType == CONVERT_NONE);
|
||||
conversionFactor = nullptr;
|
||||
break;
|
||||
}
|
||||
|
||||
// shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
|
||||
for (uint32_t i = 0; i < 4; i++)
|
||||
{
|
||||
if (isComponentEnabled(compMask, i))
|
||||
{
|
||||
if (compCtrl[i] == ComponentControl::StoreSrc)
|
||||
{
|
||||
// select correct constMask for x/z or y/w pshufb
|
||||
uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
|
||||
// if x or y, use the first gather result (vGatherResult[0]), else use the second (vGatherResult[1])
|
||||
uint32_t selectedGather = (i < 2) ? 0 : 1;
|
||||
|
||||
// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
|
||||
|
||||
Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
|
||||
Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);
|
||||
|
||||
Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
|
||||
Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
|
||||
|
||||
// after pshufb mask for x channel; z uses the same shuffle from the second gather
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
|
||||
|
||||
// denormalize if needed
|
||||
if (conversionType != CONVERT_NONE)
|
||||
{
|
||||
temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
|
||||
temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
|
||||
}
|
||||
|
||||
vVertexElements[currentVertexElement] = VUNDEF2_F();
|
||||
vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
|
||||
vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
|
||||
|
||||
currentVertexElement += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
|
||||
}
|
||||
|
||||
if (currentVertexElement > 3)
|
||||
{
|
||||
StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
|
||||
// reset to the next vVertexElement to output
|
||||
currentVertexElement = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
SWR_INVALID("Unsupported conversion type");
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Output a simdvertex worth of elements to the current outputElt
|
||||
/// @param pVtxOut - base address of VIN output struct
|
||||
|
@ -2438,7 +3093,14 @@ Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
|
|||
case Store1Int: return VIMMED2_1(1);
|
||||
case StoreVertexId:
|
||||
{
|
||||
Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty);
|
||||
Value* pId = VUNDEF2_F();
|
||||
|
||||
Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
|
||||
Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
|
||||
|
||||
pId = INSERT2_F(pId, pId_lo, 0);
|
||||
pId = INSERT2_F(pId, pId_hi, 1);
|
||||
|
||||
return VBROADCAST2(pId);
|
||||
}
|
||||
case StoreInstanceId:
|
||||
|
|
Loading…
Reference in New Issue