swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components
Also widen the 16-bit a 8-bit integer vertex component gathers to SIMD16. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
fa3105cdb5
commit
01a57c11cb
|
@ -46,6 +46,7 @@ intrinsics = [
|
|||
['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
|
||||
['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
|
||||
['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
|
||||
|
|
|
@ -723,6 +723,42 @@ namespace SwrJit
|
|||
return vGather;
|
||||
}
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
|
||||
{
|
||||
Value *vGather = VUNDEF2_F();
|
||||
|
||||
// use avx512 gather instruction if available
|
||||
if (JM()->mArch.AVX512F())
|
||||
{
|
||||
// force mask to <N-bit Integer>, required by vgather2
|
||||
Value *mask = BITCAST(vMask, mInt16Ty);
|
||||
|
||||
vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
|
||||
}
|
||||
else
|
||||
{
|
||||
Value *src0 = EXTRACT2_F(vSrc, 0);
|
||||
Value *src1 = EXTRACT2_F(vSrc, 1);
|
||||
|
||||
Value *indices0 = EXTRACT2_I(vIndices, 0);
|
||||
Value *indices1 = EXTRACT2_I(vIndices, 1);
|
||||
|
||||
Value *vmask16 = VMASK2(vMask);
|
||||
|
||||
Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better..
|
||||
Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
|
||||
|
||||
Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
|
||||
Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
|
||||
|
||||
vGather = JOIN2(gather0, gather1);
|
||||
}
|
||||
|
||||
return vGather;
|
||||
}
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Generate a masked gather operation in LLVM IR. If not
|
||||
/// supported on the underlying platform, emulate it with loads
|
||||
|
|
|
@ -135,6 +135,9 @@ void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
|
|||
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
|
||||
|
||||
Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
|
||||
#endif
|
||||
void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
|
||||
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
|
||||
|
||||
|
|
|
@ -1349,14 +1349,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
if (compMask)
|
||||
{
|
||||
#if USE_SIMD16_BUILDER
|
||||
#if USE_SIMD16_BUILDER
|
||||
#else
|
||||
Value *gatherResult[2];
|
||||
|
||||
gatherResult[0] = JOIN2(vGatherResult[0], vGatherResult2[0]);
|
||||
gatherResult[1] = JOIN2(vGatherResult[1], vGatherResult2[1]);
|
||||
|
||||
#endif
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
|
||||
|
@ -1701,6 +1693,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
Value* gatherSrc = VIMMED1(0);
|
||||
#if USE_SIMD16_GATHERS
|
||||
Value* gatherSrc2 = VIMMED1(0);
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherSrc16 = VIMMED2_1(0);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Gather components from memory to store in a simdvertex structure
|
||||
|
@ -1712,6 +1707,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
if (compMask)
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
|
||||
|
||||
// e.g. result of an 8x32bit integer gather for 8bit components
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
|
||||
|
||||
#else
|
||||
Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
|
||||
Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
|
||||
|
||||
|
@ -1719,9 +1722,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
|
||||
|
||||
#endif
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherResult = JOIN2(vGatherResult, vGatherResult2);
|
||||
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
|
||||
|
@ -1761,6 +1763,43 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
case 16:
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value* gatherResult[2];
|
||||
|
||||
// if we have at least one component out of x or y to fetch
|
||||
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
|
||||
{
|
||||
gatherResult[0] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
|
||||
|
||||
// e.g. result of first 8x32bit integer gather for 16bit components
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
|
||||
//
|
||||
}
|
||||
else
|
||||
{
|
||||
gatherResult[0] = VUNDEF2_I();
|
||||
}
|
||||
|
||||
// if we have at least one component out of z or w to fetch
|
||||
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
|
||||
{
|
||||
// offset base to the next components(zw) in the vertex to gather
|
||||
pStreamBase = GEP(pStreamBase, C((char)4));
|
||||
|
||||
gatherResult[1] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
|
||||
|
||||
// e.g. result of second 8x32bit integer gather for 16bit components
|
||||
// 256i - 0 1 2 3 4 5 6 7
|
||||
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
|
||||
//
|
||||
}
|
||||
else
|
||||
{
|
||||
gatherResult[1] = VUNDEF2_I();
|
||||
}
|
||||
|
||||
#else
|
||||
Value* vGatherResult[2];
|
||||
Value* vGatherResult2[2];
|
||||
|
||||
|
@ -1799,15 +1838,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
vGatherResult2[1] = VUNDEF_I();
|
||||
}
|
||||
|
||||
#endif
|
||||
// if we have at least one component to shuffle into place
|
||||
if (compMask)
|
||||
{
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *gatherResult[2];
|
||||
|
||||
gatherResult[0] = JOIN2(vGatherResult[0], vGatherResult2[0]);
|
||||
gatherResult[1] = JOIN2(vGatherResult[1], vGatherResult2[1]);
|
||||
|
||||
Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
|
||||
|
||||
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
|
||||
|
@ -1876,6 +1911,23 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
if (compCtrl[i] == StoreSrc)
|
||||
{
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
|
||||
|
||||
if (conversionType == CONVERT_USCALED)
|
||||
{
|
||||
pGather = UI_TO_FP(pGather, mSimd2FP32Ty);
|
||||
}
|
||||
else if (conversionType == CONVERT_SSCALED)
|
||||
{
|
||||
pGather = SI_TO_FP(pGather, mSimd2FP32Ty);
|
||||
}
|
||||
else if (conversionType == CONVERT_SFIXED)
|
||||
{
|
||||
pGather = FMUL(SI_TO_FP(pGather, mSimd2FP32Ty), VBROADCAST2(C(1 / 65536.0f)));
|
||||
}
|
||||
|
||||
#else
|
||||
Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
|
||||
Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
|
||||
|
||||
|
@ -1895,9 +1947,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
|
||||
}
|
||||
|
||||
#endif
|
||||
#if USE_SIMD16_BUILDER
|
||||
// pack adjacent pairs of SIMD8s into SIMD16s
|
||||
pVtxSrc2[currentVertexElement] = JOIN2(pGather, pGather2);
|
||||
pVtxSrc2[currentVertexElement] = pGather;
|
||||
|
||||
#else
|
||||
vVertexElements[currentVertexElement] = pGather;
|
||||
|
|
Loading…
Reference in New Issue