swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
fbc27ff027
commit
df54678ba0
|
@ -44,9 +44,10 @@ inst_aliases = {
|
|||
intrinsics = [
|
||||
['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
|
||||
['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
|
||||
['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
|
||||
['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
|
||||
['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
|
||||
['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
|
||||
|
|
|
@ -639,7 +639,7 @@ namespace SwrJit
|
|||
}
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
|
||||
Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
|
||||
{
|
||||
Value *vGather = VUNDEF2_F();
|
||||
|
||||
|
@ -649,7 +649,7 @@ namespace SwrJit
|
|||
// force mask to <N-bit Integer>, required by vgather2
|
||||
Value *mask = BITCAST(vMask, mInt16Ty);
|
||||
|
||||
vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
|
||||
vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -659,8 +659,10 @@ namespace SwrJit
|
|||
Value *indices0 = EXTRACT2_I(vIndices, 0);
|
||||
Value *indices1 = EXTRACT2_I(vIndices, 1);
|
||||
|
||||
Value *mask0 = EXTRACT2_I(vMask, 0);
|
||||
Value *mask1 = EXTRACT2_I(vMask, 1);
|
||||
Value *vmask16 = VMASK2(vMask);
|
||||
|
||||
Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better..
|
||||
Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
|
||||
|
||||
Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
|
||||
Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
|
||||
|
@ -771,6 +773,37 @@ namespace SwrJit
|
|||
return vGather;
|
||||
}
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Thin wrapper that forwards both operands to the VPSRLI
///        intrinsic builder and returns its result unchanged.
/// @param a - value to shift (passed straight through to VPSRLI)
/// @param imm - shift amount (passed straight through to VPSRLI)
/// NOTE(review): the intrinsics table maps VPSRLI to x86_avx2_psrli_d, so
/// this is presumably a per-32-bit-lane logical right shift on the 8-wide
/// path - confirm. No architecture check is done here.
Value *Builder::PSRLI(Value *a, Value *imm)
|
||||
{
|
||||
return VPSRLI(a, imm);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief 16-wide counterpart of PSRLI. Emits the VPSRLI_16 intrinsic when
///        the JIT target reports AVX512F; otherwise splits the input into
///        two halves, shifts each with PSRLI, and reassembles the result.
/// @param a - value to shift; on the fallback path it is split with
///            EXTRACT2_I into halves 0 and 1
/// @param imm - shift amount, passed unchanged to VPSRLI_16 / PSRLI
/// @return the shifted value, built either directly or via INSERT2_I
/// NOTE(review): the intrinsics table maps VPSRLI_16 to
/// x86_avx512_psrli_d_512, so lanes are presumably 32-bit - confirm.
Value *Builder::PSRLI_16(Value *a, Value *imm)
|
||||
{
|
||||
// start from an undefined value; both paths below overwrite it fully
Value *result = VUNDEF2_I();
|
||||

||||
// use avx512 shift right instruction if available
|
||||
if (JM()->mArch.AVX512F())
|
||||
{
|
||||
result = VPSRLI_16(a, imm);
|
||||
}
|
||||
else
|
||||
{
|
||||
// fallback: pull out the two halves of the input
Value *a0 = EXTRACT2_I(a, 0);
|
||||
Value *a1 = EXTRACT2_I(a, 1);
|
||||

||||
// shift each half independently with the narrower helper
Value *result0 = PSRLI(a0, imm);
|
||||
Value *result1 = PSRLI(a1, imm);
|
||||

||||
// write the shifted halves back into slots 0 and 1 of the result
result = INSERT2_I(result, result0, 0);
|
||||
result = INSERT2_I(result, result1, 1);
|
||||
}
|
||||

||||
return result;
|
||||
}
|
||||
|
||||
#endif
|
||||
#if USE_SIMD16_BUILDER
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief
|
||||
|
|
|
@ -130,7 +130,7 @@ void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
|
|||
|
||||
Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *GATHERPS2(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
|
||||
Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
|
||||
#endif
|
||||
void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
|
||||
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
|
||||
|
@ -141,6 +141,11 @@ void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
|
|||
|
||||
Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *PSRLI(Value *a, Value *imm);
|
||||
Value *PSRLI_16(Value *a, Value *imm);
|
||||
|
||||
#endif
|
||||
void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
|
||||
|
||||
void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
|
||||
|
|
|
@ -839,7 +839,15 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
|
||||
Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
|
||||
Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
|
||||
Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value* vBaseVertex16 = VBROADCAST2(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
|
||||
#else
|
||||
Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
|
||||
#endif
|
||||
#else
|
||||
Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
|
||||
#endif
|
||||
curInstance->setName("curInstance");
|
||||
|
||||
for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
|
||||
|
@ -859,10 +867,18 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
|
||||
|
||||
// VGATHER* takes an *i8 src pointer
|
||||
Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
|
||||
Value *pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
|
||||
|
||||
Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *vStride16 = VBROADCAST2(stride);
|
||||
#else
|
||||
Value *vStride = VBROADCAST(stride);
|
||||
#endif
|
||||
#else
|
||||
Value *vStride = VBROADCAST(stride);
|
||||
#endif
|
||||
|
||||
// max vertex index that is fully in bounds
|
||||
Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
|
||||
|
@ -885,9 +901,20 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
Value *vCurIndices;
|
||||
#if USE_SIMD16_GATHERS
|
||||
Value *vCurIndices2;
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *vCurIndices16;
|
||||
#endif
|
||||
#endif
|
||||
Value *startOffset;
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *vInstanceStride16 = VIMMED2_1(0);
|
||||
#else
|
||||
Value *vInstanceStride = VIMMED1(0);
|
||||
#endif
|
||||
#else
|
||||
Value *vInstanceStride = VIMMED1(0);
|
||||
#endif
|
||||
|
||||
if (ied.InstanceEnable)
|
||||
{
|
||||
|
@ -903,10 +930,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// if step rate is 0, every instance gets instance 0
|
||||
calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
|
||||
|
||||
vCurIndices = VBROADCAST(calcInstance);
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
vCurIndices16 = VBROADCAST2(calcInstance);
|
||||
#else
|
||||
vCurIndices = VBROADCAST(calcInstance);
|
||||
vCurIndices2 = VBROADCAST(calcInstance);
|
||||
#endif
|
||||
#else
|
||||
vCurIndices = VBROADCAST(calcInstance);
|
||||
#endif
|
||||
|
||||
startOffset = startInstance;
|
||||
}
|
||||
|
@ -914,13 +947,32 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
{
|
||||
// grab the instance advancement state, determines stride in bytes from one instance to the next
|
||||
Value* stepRate = C(ied.InstanceAdvancementState);
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
vInstanceStride16 = VBROADCAST2(MUL(curInstance, stepRate));
|
||||
#else
|
||||
vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
|
||||
#endif
|
||||
#else
|
||||
vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
|
||||
#endif
|
||||
|
||||
// offset indices by baseVertex
|
||||
vCurIndices = ADD(vIndices, vBaseVertex);
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *vIndices16 = VUNDEF2_I();
|
||||
|
||||
vIndices16 = INSERT2_I(vIndices16, vIndices, 0);
|
||||
vIndices16 = INSERT2_I(vIndices16, vIndices2, 1);
|
||||
|
||||
vCurIndices16 = ADD(vIndices16, vBaseVertex16);
|
||||
#else
|
||||
vCurIndices = ADD(vIndices, vBaseVertex);
|
||||
vCurIndices2 = ADD(vIndices2, vBaseVertex);
|
||||
#endif
|
||||
#else
|
||||
vCurIndices = ADD(vIndices, vBaseVertex);
|
||||
#endif
|
||||
|
||||
startOffset = startVertex;
|
||||
SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
|
||||
|
@ -928,10 +980,21 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
else
|
||||
{
|
||||
// offset indices by baseVertex
|
||||
vCurIndices = ADD(vIndices, vBaseVertex);
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *vIndices16 = VUNDEF2_I();
|
||||
|
||||
vIndices16 = INSERT2_I(vIndices16, vIndices, 0);
|
||||
vIndices16 = INSERT2_I(vIndices16, vIndices2, 1);
|
||||
|
||||
vCurIndices16 = ADD(vIndices16, vBaseVertex16);
|
||||
#else
|
||||
vCurIndices = ADD(vIndices, vBaseVertex);
|
||||
vCurIndices2 = ADD(vIndices2, vBaseVertex);
|
||||
#endif
|
||||
#else
|
||||
vCurIndices = ADD(vIndices, vBaseVertex);
|
||||
#endif
|
||||
|
||||
startOffset = startVertex;
|
||||
}
|
||||
|
@ -960,14 +1023,76 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// Load the in bounds size of a partially valid vertex
|
||||
Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
|
||||
partialInboundsSize = LOAD(partialInboundsSize);
|
||||
Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
|
||||
Value* vBpp = VBROADCAST(C(info.Bpp));
|
||||
Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *vPartialVertexSize = VBROADCAST2(partialInboundsSize);
|
||||
Value *vBpp = VBROADCAST2(C(info.Bpp));
|
||||
Value *vAlignmentOffsets = VBROADCAST2(C(ied.AlignedByteOffset));
|
||||
#else
|
||||
Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
|
||||
Value *vBpp = VBROADCAST(C(info.Bpp));
|
||||
Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
|
||||
#endif
|
||||
#else
|
||||
Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
|
||||
Value *vBpp = VBROADCAST(C(info.Bpp));
|
||||
Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
|
||||
#endif
|
||||
|
||||
// is the element <= the partially valid size
|
||||
Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
|
||||
Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
|
||||
|
||||
#if USE_SIMD16_GATHERS
|
||||
#if USE_SIMD16_BUILDER
|
||||
// override cur indices with 0 if pitch is 0
|
||||
Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED2_1(0));
|
||||
vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED2_1(0), vCurIndices16);
|
||||
|
||||
// are vertices partially OOB?
|
||||
Value *vMaxVertex16 = VBROADCAST2(maxVertex);
|
||||
Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16);
|
||||
|
||||
// are vertices fully in bounds?
|
||||
Value *vMaxGatherMask16 = ICMP_ULT(vCurIndices16, vMaxVertex16);
|
||||
|
||||
Value *vGatherMask16;
|
||||
|
||||
if (fetchState.bPartialVertexBuffer)
|
||||
{
|
||||
// are vertices below minVertex limit?
|
||||
Value *vMinVertex16 = VBROADCAST2(minVertex);
|
||||
Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16);
|
||||
|
||||
// only fetch lanes that pass both tests
|
||||
vGatherMask16 = AND(vMaxGatherMask16, vMinGatherMask16);
|
||||
}
|
||||
else
|
||||
{
|
||||
vGatherMask16 = vMaxGatherMask16;
|
||||
}
|
||||
|
||||
// blend in any partially OOB indices that have valid elements
|
||||
vGatherMask16 = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask16);
|
||||
|
||||
// calculate the actual offsets into the VB
|
||||
Value *vOffsets16 = MUL(vCurIndices16, vStride16);
|
||||
vOffsets16 = ADD(vOffsets16, vAlignmentOffsets);
|
||||
|
||||
// if instance stride enable is:
|
||||
// true - add product of the instanceID and advancement state to the offset into the VB
|
||||
// false - value of vInstanceStride has been initialized to zero
|
||||
vOffsets16 = ADD(vOffsets16, vInstanceStride16);
|
||||
|
||||
// TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16..
|
||||
Value *vmask16 = VMASK2(vGatherMask16);
|
||||
|
||||
Value *vGatherMask = MASK(EXTRACT2_I(vmask16, 0));
|
||||
Value *vGatherMask2 = MASK(EXTRACT2_I(vmask16, 1));
|
||||
|
||||
Value *vOffsets = EXTRACT2_I(vOffsets16, 0);
|
||||
Value *vOffsets2 = EXTRACT2_I(vOffsets16, 1);
|
||||
|
||||
#else
|
||||
// override cur indices with 0 if pitch is 0
|
||||
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
|
||||
vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
|
||||
|
@ -1018,6 +1143,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
vOffsets = ADD(vOffsets, vInstanceStride);
|
||||
vOffsets2 = ADD(vOffsets2, vInstanceStride);
|
||||
|
||||
#endif
|
||||
#else
|
||||
// override cur indices with 0 if pitch is 0
|
||||
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
|
||||
|
@ -1276,17 +1402,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
|
||||
// But, we know that elements must be aligned for FETCH. :)
|
||||
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
|
||||
Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
|
||||
Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
|
||||
#if USE_SIMD16_BUILDER
|
||||
Value *indices = VUNDEF2_I();
|
||||
indices = INSERT2_I(indices, vShiftedOffsets, 0);
|
||||
indices = INSERT2_I(indices, vShiftedOffsets2, 1);
|
||||
Value *shiftedOffsets = VPSRLI_16(vOffsets16, C(1));
|
||||
pVtxSrc2[currentVertexElement] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets, vGatherMask16, 2);
|
||||
|
||||
Value *mask = VSHUFFLE(vGatherMask, vGatherMask2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
|
||||
|
||||
pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
|
||||
#else
|
||||
Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
|
||||
Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
|
||||
|
||||
vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
|
||||
vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
|
||||
|
||||
|
@ -1388,29 +1511,29 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
|
|||
// if we need to gather the component
|
||||
if (compCtrl[i] == StoreSrc)
|
||||
{
|
||||
Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
|
||||
Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
|
||||
Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
|
||||
Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
|
||||
Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
|
||||
Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
|
||||
|
||||
Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
|
||||
Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
|
||||
Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
|
||||
Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
|
||||
Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
|
||||
Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
|
||||
|
||||
Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
|
||||
|
||||
Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
|
||||
Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
|
||||
Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
|
||||
Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
|
||||
Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
|
||||
Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
|
||||
|
||||
pGatherLo = VCVTPD2PS(pGatherLo);
|
||||
pGatherLo = VCVTPD2PS(pGatherLo);
|
||||
pGatherLo2 = VCVTPD2PS(pGatherLo2);
|
||||
pGatherHi = VCVTPD2PS(pGatherHi);
|
||||
pGatherHi = VCVTPD2PS(pGatherHi);
|
||||
pGatherHi2 = VCVTPD2PS(pGatherHi2);
|
||||
|
||||
Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
|
||||
Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
|
||||
Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
|
||||
|
||||
#if USE_SIMD16_BUILDER
|
||||
|
|
Loading…
Reference in New Issue