swr/rast: WIP - Widen fetch shader to SIMD16

Widen vertex gather/storage to SIMD16 for all component types.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Tim Rowley 2017-12-03 18:49:29 -06:00
parent 6d5275498a
commit 36e276b6b0
1 changed files with 689 additions and 27 deletions

View File

@ -70,6 +70,9 @@ struct FetchJit : public Builder
#else
void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
#endif
#if USE_SIMD16_BUILDER
void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
#endif
typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
@ -78,6 +81,9 @@ struct FetchJit : public Builder
#else
void Shuffle16bpcGather(Shuffle16bpcArgs &args);
#endif
#if USE_SIMD16_BUILDER
void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
#endif
void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
#if USE_SIMD16_BUILDER
@ -726,7 +732,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB
// only works if pixel size is <= 32bits
SWR_ASSERT(info.bpp <= 32);
Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
for (uint32_t comp = 0; comp < 4; ++comp)
{
@ -825,6 +831,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value* vVertexElements[4];
#if USE_SIMD16_GATHERS
Value* vVertexElements2[4];
#if USE_SIMD16_BUILDER
Value *pVtxSrc2[4];
#endif
#endif
Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
@ -961,6 +970,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
#if USE_SIMD16_GATHERS
// override cur indices with 0 if pitch is 0
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
// are vertices partially OOB?
@ -983,7 +993,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// only fetch lanes that pass both tests
vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2);
vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
}
else
{
@ -1074,15 +1084,32 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
{
if (isComponentEnabled(compMask, c))
{
vVertexElements[currentVertexElement] = pResults[c];
#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
pVtxSrc2[currentVertexElement] = VUNDEF2_F();
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0);
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
#else
vVertexElements[currentVertexElement] = pResults[c];
vVertexElements2[currentVertexElement] = pResults2[c];
currentVertexElement++;
#endif
currentVertexElement += 1;
if (currentVertexElement > 3)
{
#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
#else
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
#endif
outputElt += 1;
// reset to the next vVertexElement to output
@ -1113,9 +1140,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
else if(info.type[0] == SWR_TYPE_FLOAT)
{
///@todo: support 64 bit vb accesses
Value* gatherSrc = VIMMED1(0.0f);
Value *gatherSrc = VIMMED1(0.0f);
#if USE_SIMD16_GATHERS
Value* gatherSrc2 = VIMMED1(0.0f);
Value *gatherSrc2 = VIMMED1(0.0f);
#if USE_SIMD16_BUILDER
Value *gatherSrc16 = VIMMED2_1(0.0f);
#endif
#endif
SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
@ -1127,8 +1157,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
case 16:
{
#if USE_SIMD16_GATHERS
Value* vGatherResult[2];
Value* vGatherResult2[2];
Value *vGatherResult[2];
Value *vGatherResult2[2];
// if we have at least one component out of x or y to fetch
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
@ -1140,6 +1170,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
}
else
{
vGatherResult[0] = VUNDEF_I();
vGatherResult2[0] = VUNDEF_I();
}
// if we have at least one component out of z or w to fetch
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
@ -1154,11 +1189,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
else
{
vGatherResult[1] = VUNDEF_I();
vGatherResult2[1] = VUNDEF_I();
}
// if we have at least one component to shuffle into place
if (compMask)
{
#if USE_SIMD16_BUILDER
Value *gatherResult[2];
gatherResult[0] = VUNDEF2_I();
gatherResult[1] = VUNDEF2_I();
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
// Shuffle gathered components into place in simdvertex struct
Shuffle16bpcGather2(args); // outputs to vVertexElements ref
#else
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
@ -1167,6 +1226,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// Shuffle gathered components into place in simdvertex struct
Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
#endif
}
#else
Value* vGatherResult[2];
@ -1209,12 +1269,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
break;
case 32:
{
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
Value *pVtxSrc2[4];
#endif
#endif
for (uint32_t i = 0; i < 4; i += 1)
{
#if USE_SIMD16_GATHERS
@ -1231,10 +1285,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
#if USE_SIMD16_BUILDER
Value *src = VUNDEF2_F();
src = INSERT2_F(src, gatherSrc, 0);
src = INSERT2_F(src, gatherSrc2, 1);
Value *indices = VUNDEF2_I();
indices = INSERT2_I(indices, vShiftedOffsets, 0);
indices = INSERT2_I(indices, vShiftedOffsets2, 1);
@ -1243,12 +1293,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
mask = INSERT2_I(mask, vGatherMask, 0);
mask = INSERT2_I(mask, vGatherMask2, 1);
pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2);
#if 1
vVertexElements[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0);
vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1);
#endif
pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
#else
vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
@ -1384,24 +1429,45 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
vVertexElements[currentVertexElement] = pGather;
#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
pVtxSrc2[currentVertexElement] = VUNDEF2_F();
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
#else
vVertexElements[currentVertexElement] = pGather;
vVertexElements2[currentVertexElement] = pGather2;
#endif
currentVertexElement += 1;
}
else
{
#if USE_SIMD16_BUILDER
pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
#else
vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
#endif
currentVertexElement += 1;
}
if (currentVertexElement > 3)
{
#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
#else
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
#endif
outputElt += 1;
// reset to the next vVertexElement to output
@ -1522,10 +1588,25 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
#if USE_SIMD16_GATHERS
Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
#if USE_SIMD16_BUILDER
Value *gatherResult = VUNDEF2_I();
gatherResult = INSERT2_I(gatherResult, vGatherResult, 0);
gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
// Shuffle gathered components into place in simdvertex struct
Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref
#else
Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
@ -1534,6 +1615,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// Shuffle gathered components into place in simdvertex struct
Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
#endif
#else
Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
// e.g. result of an 8x32bit integer gather for 8bit components
@ -1569,6 +1651,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
}
else
{
vGatherResult[0] = VUNDEF_I();
vGatherResult2[0] = VUNDEF_I();
}
// if we have at least one component out of z or w to fetch
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
@ -1583,10 +1670,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
else
{
vGatherResult[1] = VUNDEF_I();
vGatherResult2[1] = VUNDEF_I();
}
// if we have at least one component to shuffle into place
if (compMask)
{
#if USE_SIMD16_BUILDER
Value *gatherResult[2];
gatherResult[0] = VUNDEF2_I();
gatherResult[1] = VUNDEF2_I();
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
// Shuffle gathered components into place in simdvertex struct
Shuffle16bpcGather2(args); // outputs to vVertexElements ref
#else
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
@ -1595,6 +1707,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
// Shuffle gathered components into place in simdvertex struct
Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
#endif
}
#else
Value* vGatherResult[2];
@ -1665,8 +1778,18 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
}
#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
pVtxSrc2[currentVertexElement] = VUNDEF2_F();
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
#else
vVertexElements[currentVertexElement] = pGather;
vVertexElements2[currentVertexElement] = pGather2;
#endif
// e.g. result of a single 8x32bit integer gather for 32bit components
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
@ -1698,9 +1821,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
{
#if USE_SIMD16_SHADERS
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
#else
vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
#endif
currentVertexElement += 1;
#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
@ -1713,9 +1841,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
if (currentVertexElement > 3)
{
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
#else
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
#endif
outputElt += 1;
#else
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
@ -1740,9 +1876,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
if (currentVertexElement > 0)
{
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
// store SIMD16s
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
#else
StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
#endif
outputElt += 1;
#else
StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
@ -2092,6 +2236,251 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
}
}
#if USE_SIMD16_BUILDER
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 counterpart of Shuffle8bpcGatherd. Takes one SIMD16 of
/// gathered 8bpc verts (four packed 8-bit components per 32-bit lane),
/// zero or sign extends each enabled component, denormalizes if needed,
/// and packs the results into SIMD16 vertex elements. Whenever four
/// elements have been accumulated they are written out through
/// StoreVertexElements2 and the element counter is reset.
/// @param args: (tuple of args, listed below)
///   @param vGatherResult - SIMD16 gather result (xyzw packed per dword)
///   @param pVtxOut - output pointer handed to StoreVertexElements2
///   @param extendType - sign extend or zero extend
///   @param conversionType - normalized / scaled / none conversion
///   @param currentVertexElement - reference to the current vVertexElement slot
///   @param outputElt - reference to the current output element index
///   @param compMask - component enable mask
///   @param compCtrl - component control per component
///   @param vVertexElements[4] - SIMD16 vertex components accumulated for output
///   @param swizzle[4] - source component selected for each output component
void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
{
    // Unpack tuple args
    Value*& vGatherResult = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t &currentVertexElement = std::get<4>(args);
    uint32_t &outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);
    const uint32_t(&swizzle)[4] = std::get<9>(args);

    // cast types
    Type *vGatherTy = mSimdInt32Ty;
    Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
    {
        Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // (mVWidth * 2) x 8bit ints, i.e. one 128bit lane for SIMD8
        Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask, including any swizzling
        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
        Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
                                      char(y), char(y + 4), char(y + 8), char(y + 12),
                                      char(z), char(z + 4), char(z + 8), char(z + 12),
                                      char(w), char(w + 4), char(w + 8), char(w + 12),
                                      char(x), char(x + 4), char(x + 8), char(x + 12),
                                      char(y), char(y + 4), char(y + 8), char(y + 12),
                                      char(z), char(z + 4), char(z + 8), char(z + 12),
                                      char(w), char(w + 4), char(w + 8), char(w + 12) });

        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..

        Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
        Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);

        Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
        Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value *vi128XY_lo = nullptr;
        Value *vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);

            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value *vi128ZW_lo = nullptr;
        Value *vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
        }

        // init denormalize variables if needed
        // (initialized up front so the invalid / CONVERT_NONE paths never leave them indeterminate)
        Instruction::CastOps fpCast = Instruction::CastOps::SIToFP;
        Value *conversionFactor = nullptr;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 127.0));
            break;
        case CONVERT_SSCALED:
            fpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    // sign extend
                    Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
                    Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
                    }

                    // pack the two SIMD8 halves into one SIMD16 element
                    vVertexElements[currentVertexElement] = VUNDEF2_F();
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
    {
        // init denormalize variables if needed
        // (initialized up front so the invalid / CONVERT_NONE paths never leave them indeterminate)
        Instruction::CastOps fpCast = Instruction::CastOps::UIToFP;
        Value *conversionFactor = nullptr;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 255.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // pshufb masks for each component
                    Value *vConstMask;
                    switch (swizzle[i])
                    {
                    case 0:
                        // x shuffle mask
                        vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                        break;
                    default:
                        // NOTE(review): swizzle is presumably always 0..3 here; a null mask is passed on unchanged otherwise
                        vConstMask = nullptr;
                        break;
                    }

                    // SIMD16 PSHUFB isn't available on every target (see the sign extend path), so work on SIMD8 halves
                    Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
                    Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);

                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
                    }

                    // pack the two SIMD8 halves into one SIMD16 element
                    vVertexElements[currentVertexElement] = VUNDEF2_F();
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
@ -2318,6 +2707,272 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
}
}
#if USE_SIMD16_BUILDER
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 counterpart of Shuffle16bpcGather. Takes two SIMD16
/// gather results holding 16bpc verts (xy pairs in [0], zw pairs in [1]),
/// zero or sign extends (or converts from half float when extendType is
/// FPExt), denormalizes if needed, and packs the results into SIMD16
/// vertex elements. Whenever four elements have been accumulated they are
/// written out through StoreVertexElements2 and the element counter is reset.
/// @param args: (tuple of args, listed below)
///   @param vGatherResult[2] - array of SIMD16 gathered components (xy, zw)
///   @param pVtxOut - output pointer handed to StoreVertexElements2
///   @param extendType - sign extend, zero extend, or FPExt for half floats
///   @param conversionType - normalized / scaled / none conversion
///   @param currentVertexElement - reference to the current vVertexElement slot
///   @param outputElt - reference to the current output element index
///   @param compMask - component enable mask
///   @param compCtrl - component control per component
///   @param vVertexElements[4] - SIMD16 vertex components accumulated for output
void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
{
    // Unpack tuple args
    Value* (&vGatherResult)[2] = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t &currentVertexElement = std::get<4>(args);
    uint32_t &outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);

    // cast types
    Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this PP float?
        const bool bFP = (extendType == Instruction::CastOps::FPExt);

        Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });

        Value *vi128XY_lo = nullptr;
        Value *vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..

            Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
            Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);

            Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
            Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);

            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value *vi128ZW_lo = nullptr;
        Value *vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
            Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);

            Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
            Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
        }

        // init denormalize variables if needed
        // (initialized up front so the invalid / CONVERT_NONE paths never leave them indeterminate)
        Instruction::CastOps IntToFpCast = Instruction::CastOps::SIToFP;
        Value *conversionFactor = nullptr;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    Value *temp_lo = nullptr;
                    Value *temp_hi = nullptr;

                    if (bFP)
                    {
                        // extract 128 bit lanes and convert each half float component to float
                        temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
                            temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
                        }
                    }

                    // pack the two SIMD8 halves into one SIMD16 element
                    vVertexElements[currentVertexElement] = VUNDEF2_F();
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component; an entry stays null when neither of its components is enabled
        Value *vConstMask[2] = { nullptr, nullptr };

        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1 });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
        }

        // init denormalize variables if needed
        // (initialized up front so the invalid / CONVERT_NONE paths never leave them indeterminate)
        Instruction::CastOps fpCast = Instruction::CastOps::UIToFP;
        Value* conversionFactor = nullptr;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first (xy) gather result, else use the second (zw)
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..

                    Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
                    Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);

                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
                    }

                    // pack the two SIMD8 halves into one SIMD16 element
                    vVertexElements[currentVertexElement] = VUNDEF2_F();
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
@ -2438,7 +3093,14 @@ Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
case Store1Int: return VIMMED2_1(1);
case StoreVertexId:
{
Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty);
Value* pId = VUNDEF2_F();
Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
pId = INSERT2_F(pId, pId_lo, 0);
pId = INSERT2_F(pId, pId_hi, 1);
return VBROADCAST2(pId);
}
case StoreInstanceId: