swr/rast: Remove deprecated 4x2 backend code
- Use 8x2 tiling by default - Remove associated macros - Use SIMDLIB emulation for SIMD16 on SIMD8 hardware - Remove code rot in Load/StoreTile Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
e8bf4efceb
commit
0bf1df2bb6
|
@ -37,29 +37,11 @@
|
|||
|
||||
#include <algorithm>
|
||||
|
||||
template <SWR_FORMAT format>
|
||||
void ClearRasterTile(uint8_t* pTileBuffer, simdvector& value)
|
||||
{
|
||||
auto lambda = [&](int32_t comp) {
|
||||
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
|
||||
|
||||
pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
|
||||
};
|
||||
|
||||
const uint32_t numIter =
|
||||
(KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
|
||||
|
||||
for (uint32_t i = 0; i < numIter; ++i)
|
||||
{
|
||||
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
|
||||
}
|
||||
}
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template <SWR_FORMAT format>
|
||||
void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
|
||||
{
|
||||
auto lambda = [&](int32_t comp) {
|
||||
auto lambda = [&](int32_t comp)
|
||||
{
|
||||
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
|
||||
|
||||
pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
|
||||
|
@ -74,7 +56,6 @@ void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
|
|||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
template <SWR_FORMAT format>
|
||||
INLINE void ClearMacroTile(DRAW_CONTEXT* pDC,
|
||||
HANDLE hWorkerPrivateData,
|
||||
|
@ -86,37 +67,22 @@ INLINE void ClearMacroTile(DRAW_CONTEXT* pDC,
|
|||
{
|
||||
// convert clear color to hottile format
|
||||
// clear color is in RGBA float/uint32
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
simd16vector vClear;
|
||||
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
|
||||
{
|
||||
simd16scalar vComp;
|
||||
vComp = _simd16_load1_ps((const float*)&clear[comp]);
|
||||
simd16scalar vComp = _simd16_load1_ps((const float*)&clear[comp]);
|
||||
|
||||
if (FormatTraits<format>::isNormalized(comp))
|
||||
{
|
||||
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
|
||||
vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
|
||||
}
|
||||
vComp = FormatTraits<format>::pack(comp, vComp);
|
||||
vComp = FormatTraits<format>::pack(comp, vComp);
|
||||
|
||||
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
|
||||
}
|
||||
|
||||
#else
|
||||
simdvector vClear;
|
||||
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
|
||||
{
|
||||
simdscalar vComp;
|
||||
vComp = _simd_load1_ps((const float*)&clear[comp]);
|
||||
if (FormatTraits<format>::isNormalized(comp))
|
||||
{
|
||||
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
|
||||
vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
|
||||
}
|
||||
vComp = FormatTraits<format>::pack(comp, vComp);
|
||||
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
|
||||
}
|
||||
|
||||
#endif
|
||||
uint32_t tileX, tileY;
|
||||
MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
|
||||
|
||||
|
|
|
@ -894,87 +894,6 @@ static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
|
|||
psContext.vJ.sample);
|
||||
}
|
||||
|
||||
// Merge Output to 4x2 SIMD Tile Format
|
||||
INLINE void OutputMerger4x2(DRAW_CONTEXT* pDC,
|
||||
SWR_PS_CONTEXT& psContext,
|
||||
uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS],
|
||||
uint32_t sample,
|
||||
const SWR_BLEND_STATE* pBlendState,
|
||||
const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS],
|
||||
simdscalar& coverageMask,
|
||||
simdscalar const& depthPassMask,
|
||||
uint32_t renderTargetMask,
|
||||
uint32_t workerId)
|
||||
{
|
||||
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
|
||||
const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
|
||||
simdvector blendOut;
|
||||
|
||||
DWORD rt = 0;
|
||||
while (_BitScanForward(&rt, renderTargetMask))
|
||||
{
|
||||
renderTargetMask &= ~(1 << rt);
|
||||
uint8_t* pColorSample = pColorBase[rt] + rasterTileColorOffset;
|
||||
|
||||
const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
|
||||
|
||||
SWR_BLEND_CONTEXT blendContext = {0};
|
||||
{
|
||||
// pfnBlendFunc may not update all channels. Initialize with PS output.
|
||||
/// TODO: move this into the blend JIT.
|
||||
blendOut = psContext.shaded[rt];
|
||||
|
||||
blendContext.pBlendState = pBlendState;
|
||||
blendContext.src = &psContext.shaded[rt];
|
||||
blendContext.src1 = &psContext.shaded[1];
|
||||
blendContext.src0alpha = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
|
||||
blendContext.sampleNum = sample;
|
||||
blendContext.pDst = (simdvector*)&pColorSample;
|
||||
blendContext.result = &blendOut;
|
||||
blendContext.oMask = &psContext.oMask;
|
||||
blendContext.pMask = reinterpret_cast<simdscalari*>(&coverageMask);
|
||||
|
||||
// Blend outputs and update coverage mask for alpha test
|
||||
if (pfnBlendFunc[rt] != nullptr)
|
||||
{
|
||||
pfnBlendFunc[rt](&blendContext);
|
||||
}
|
||||
}
|
||||
|
||||
// Track alpha events
|
||||
AR_EVENT(
|
||||
AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
|
||||
|
||||
// final write mask
|
||||
simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
|
||||
|
||||
///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
|
||||
static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
|
||||
"Unsupported hot tile format");
|
||||
|
||||
const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
|
||||
|
||||
// store with color mask
|
||||
if (!pRTBlend->writeDisableRed)
|
||||
{
|
||||
_simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
|
||||
}
|
||||
if (!pRTBlend->writeDisableGreen)
|
||||
{
|
||||
_simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
|
||||
}
|
||||
if (!pRTBlend->writeDisableBlue)
|
||||
{
|
||||
_simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
|
||||
}
|
||||
if (!pRTBlend->writeDisableAlpha)
|
||||
{
|
||||
_simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
// Merge Output to 8x2 SIMD16 Tile Format
|
||||
INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
|
||||
SWR_PS_CONTEXT& psContext,
|
||||
|
@ -1076,8 +995,6 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
|
|||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
void BackendPixelRate(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
|
@ -1137,9 +1054,9 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
|
|||
|
||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
||||
#endif
|
||||
|
||||
|
||||
simdscalar activeLanes;
|
||||
if (!(work.anyCoveredSamples & MASK))
|
||||
{
|
||||
|
@ -1264,7 +1181,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
|
|||
}
|
||||
|
||||
// broadcast the results of the PS to all passing pixels
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
OutputMerger8x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
|
@ -1276,18 +1193,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
|
|||
state.psState.renderTargetMask,
|
||||
useAlternateOffset,
|
||||
workerId);
|
||||
#else // USE_8x2_TILE_BACKEND
|
||||
OutputMerger4x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
sample,
|
||||
&state.blendState,
|
||||
state.pfnBlendFunc,
|
||||
coverageMask,
|
||||
depthMask,
|
||||
state.psState.renderTargetMask,
|
||||
workerId);
|
||||
#endif // USE_8x2_TILE_BACKEND
|
||||
|
||||
|
||||
if (!state.psState.forceEarlyZ && !T::bForcedSampleCount)
|
||||
{
|
||||
|
@ -1320,7 +1226,6 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
|
|||
}
|
||||
work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
if (useAlternateOffset)
|
||||
{
|
||||
DWORD rt;
|
||||
|
@ -1332,16 +1237,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
|
|||
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
}
|
||||
#else
|
||||
DWORD rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
rtMask &= ~(1 << rt);
|
||||
psContext.pColorBuffer[rt] +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
#endif
|
||||
|
||||
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
pStencilBuffer +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
|
|
|
@ -81,9 +81,9 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
|
|||
|
||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
||||
#endif
|
||||
|
||||
|
||||
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
||||
{
|
||||
const uint64_t* pCoverageMask =
|
||||
|
@ -252,7 +252,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
|
|||
|
||||
// output merger
|
||||
RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
OutputMerger8x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
|
@ -264,18 +264,6 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
|
|||
state.psState.renderTargetMask,
|
||||
useAlternateOffset,
|
||||
workerId);
|
||||
#else
|
||||
OutputMerger4x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
sample,
|
||||
&state.blendState,
|
||||
state.pfnBlendFunc,
|
||||
vCoverageMask,
|
||||
depthPassMask,
|
||||
state.psState.renderTargetMask,
|
||||
workerId);
|
||||
#endif
|
||||
|
||||
// do final depth write after all pixel kills
|
||||
if (!state.psState.forceEarlyZ)
|
||||
|
@ -305,7 +293,6 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
|
|||
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
}
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
if (useAlternateOffset)
|
||||
{
|
||||
DWORD rt;
|
||||
|
@ -317,16 +304,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
|
|||
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
}
|
||||
#else
|
||||
DWORD rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
rtMask &= ~(1 << rt);
|
||||
psContext.pColorBuffer[rt] +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
#endif
|
||||
|
||||
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
pStencilBuffer +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
|
|
|
@ -82,9 +82,9 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
|
|||
|
||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
||||
#endif
|
||||
|
||||
|
||||
simdmask coverageMask = work.coverageMask[0] & MASK;
|
||||
|
||||
if (coverageMask)
|
||||
|
@ -237,7 +237,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
|
|||
|
||||
// output merger
|
||||
RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
OutputMerger8x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
|
@ -249,19 +249,6 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
|
|||
state.psState.renderTargetMask,
|
||||
useAlternateOffset,
|
||||
workerId);
|
||||
#else
|
||||
OutputMerger4x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
0,
|
||||
&state.blendState,
|
||||
state.pfnBlendFunc,
|
||||
vCoverageMask,
|
||||
depthPassMask,
|
||||
state.psState.renderTargetMask,
|
||||
workerId,
|
||||
workerId);
|
||||
#endif
|
||||
|
||||
// do final depth write after all pixel kills
|
||||
if (!state.psState.forceEarlyZ)
|
||||
|
@ -288,7 +275,6 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
|
|||
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
}
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
if (useAlternateOffset)
|
||||
{
|
||||
DWORD rt;
|
||||
|
@ -300,16 +286,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
|
|||
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
}
|
||||
#else
|
||||
DWORD rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
rtMask &= ~(1 << rt);
|
||||
psContext.pColorBuffer[rt] +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
#endif
|
||||
|
||||
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
pStencilBuffer +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
|
|
|
@ -39,7 +39,6 @@
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define ENABLE_AVX512_SIMD16 1
|
||||
#define USE_8x2_TILE_BACKEND 1
|
||||
#define USE_SIMD16_FRONTEND 1
|
||||
#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND
|
||||
#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS
|
||||
|
|
|
@ -244,7 +244,6 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext,
|
|||
return &hotTile;
|
||||
}
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
void HotTileMgr::ClearColorHotTile(
|
||||
const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
|
||||
{
|
||||
|
@ -330,91 +329,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
|
|||
}
|
||||
}
|
||||
|
||||
#else
|
||||
void HotTileMgr::ClearColorHotTile(
|
||||
const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
|
||||
{
|
||||
// Load clear color into SIMD register...
|
||||
float* pClearData = (float*)(pHotTile->clearData);
|
||||
simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
|
||||
simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
|
||||
simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
|
||||
simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
|
||||
|
||||
float* pfBuf = (float*)pHotTile->pBuffer;
|
||||
uint32_t numSamples = pHotTile->numSamples;
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
|
||||
{
|
||||
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
|
||||
{
|
||||
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
|
||||
si +=
|
||||
SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) // SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
|
||||
{
|
||||
_simd_store_ps(pfBuf, valR);
|
||||
pfBuf += KNOB_SIMD_WIDTH;
|
||||
_simd_store_ps(pfBuf, valG);
|
||||
pfBuf += KNOB_SIMD_WIDTH;
|
||||
_simd_store_ps(pfBuf, valB);
|
||||
pfBuf += KNOB_SIMD_WIDTH;
|
||||
_simd_store_ps(pfBuf, valA);
|
||||
pfBuf += KNOB_SIMD_WIDTH;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void HotTileMgr::ClearDepthHotTile(
|
||||
const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
|
||||
{
|
||||
// Load clear color into SIMD register...
|
||||
float* pClearData = (float*)(pHotTile->clearData);
|
||||
simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
|
||||
|
||||
float* pfBuf = (float*)pHotTile->pBuffer;
|
||||
uint32_t numSamples = pHotTile->numSamples;
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
|
||||
{
|
||||
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
|
||||
{
|
||||
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
|
||||
si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
|
||||
{
|
||||
_simd_store_ps(pfBuf, valZ);
|
||||
pfBuf += KNOB_SIMD_WIDTH;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
|
||||
{
|
||||
// convert from F32 to U8.
|
||||
uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
|
||||
// broadcast 32x into __m256i...
|
||||
simdscalari valS = _simd_set1_epi8(clearVal);
|
||||
|
||||
simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
|
||||
uint32_t numSamples = pHotTile->numSamples;
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
|
||||
{
|
||||
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
|
||||
{
|
||||
// We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
|
||||
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
|
||||
si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
|
||||
{
|
||||
_simd_store_si(pBuf, valS);
|
||||
pBuf += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief InitializeHotTiles
|
||||
/// for draw calls, we initialize the active hot tiles and perform deferred
|
||||
|
|
|
@ -67,7 +67,6 @@ struct LoadRasterTile
|
|||
uint32_t x, uint32_t y,
|
||||
uint8_t* pDst)
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
typedef SimdTile_16<DstFormat, SrcFormat> SimdT;
|
||||
|
||||
SimdT* pDstSimdTiles = (SimdT*)pDst;
|
||||
|
@ -81,21 +80,6 @@ struct LoadRasterTile
|
|||
uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
|
||||
|
||||
pSimdTile->SetSwizzledColor(simdOffset, srcColor);
|
||||
#else
|
||||
typedef SimdTile<DstFormat, SrcFormat> SimdT;
|
||||
|
||||
SimdT* pDstSimdTiles = (SimdT*)pDst;
|
||||
|
||||
// Compute which simd tile we're accessing within 8x8 tile.
|
||||
// i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
|
||||
uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
|
||||
|
||||
SimdT* pSimdTile = &pDstSimdTiles[simdIndex];
|
||||
|
||||
uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
|
||||
|
||||
pSimdTile->SetSwizzledColor(simdOffset, srcColor);
|
||||
#endif
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -104,7 +104,6 @@ struct StorePixels<8, 2>
|
|||
}
|
||||
};
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template <>
|
||||
struct StorePixels<8, 4>
|
||||
{
|
||||
|
@ -130,7 +129,6 @@ struct StorePixels<8, 4>
|
|||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// StorePixels (32-bit pixel specialization)
|
||||
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
|
||||
|
@ -159,7 +157,6 @@ struct StorePixels<16, 2>
|
|||
}
|
||||
};
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template <>
|
||||
struct StorePixels<16, 4>
|
||||
{
|
||||
|
@ -185,7 +182,6 @@ struct StorePixels<16, 4>
|
|||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// StorePixels (32-bit pixel specialization)
|
||||
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
|
||||
|
@ -213,7 +209,6 @@ struct StorePixels<32, 2>
|
|||
}
|
||||
};
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template <>
|
||||
struct StorePixels<32, 4>
|
||||
{
|
||||
|
@ -237,7 +232,6 @@ struct StorePixels<32, 4>
|
|||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// StorePixels (32-bit pixel specialization)
|
||||
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
|
||||
|
@ -264,7 +258,6 @@ struct StorePixels<64, 4>
|
|||
}
|
||||
};
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template <>
|
||||
struct StorePixels<64, 8>
|
||||
{
|
||||
|
@ -287,7 +280,6 @@ struct StorePixels<64, 8>
|
|||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// StorePixels (32-bit pixel specialization)
|
||||
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
|
||||
|
@ -318,7 +310,6 @@ struct StorePixels<128, 8>
|
|||
}
|
||||
};
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template <>
|
||||
struct StorePixels<128, 16>
|
||||
{
|
||||
|
@ -339,7 +330,6 @@ struct StorePixels<128, 16>
|
|||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -354,7 +344,6 @@ struct ConvertPixelsSOAtoAOS
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
|
||||
|
||||
OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
|
||||
|
@ -368,21 +357,6 @@ struct ConvertPixelsSOAtoAOS
|
|||
// Convert from SOA --> AOS
|
||||
FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
|
||||
|
||||
#else
|
||||
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
|
||||
|
||||
OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
|
||||
OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
|
||||
|
||||
// Convert from SrcFormat --> DstFormat
|
||||
simdvector src;
|
||||
LoadSOA<SrcFormat>(pSrc, src);
|
||||
StoreSOA<DstFormat>(src, soaTile);
|
||||
|
||||
// Convert from SOA --> AOS
|
||||
FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
|
||||
|
||||
#endif
|
||||
// Store data into destination
|
||||
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
|
||||
}
|
||||
|
@ -403,7 +377,6 @@ struct ConvertPixelsSOAtoAOS<Format, Format>
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
|
||||
|
||||
OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
|
||||
|
@ -411,15 +384,6 @@ struct ConvertPixelsSOAtoAOS<Format, Format>
|
|||
// Convert from SOA --> AOS
|
||||
FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
|
||||
|
||||
#else
|
||||
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
|
||||
|
||||
OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
|
||||
|
||||
// Convert from SOA --> AOS
|
||||
FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
|
||||
|
||||
#endif
|
||||
// Store data into destination
|
||||
StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
|
||||
}
|
||||
|
@ -439,7 +403,6 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
|
||||
static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
|
||||
|
||||
|
@ -483,47 +446,6 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
|
|||
*pAosTile++ = *pPacked++;
|
||||
}
|
||||
|
||||
#else
|
||||
static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
|
||||
static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
|
||||
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
|
||||
|
||||
OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
|
||||
|
||||
// Load hot-tile
|
||||
simdvector src, dst;
|
||||
LoadSOA<SrcFormat>(pSrc, src);
|
||||
|
||||
// deswizzle
|
||||
dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
|
||||
dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
|
||||
dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
|
||||
|
||||
// clamp
|
||||
dst.x = Clamp<DstFormat>(dst.x, 0);
|
||||
dst.y = Clamp<DstFormat>(dst.y, 1);
|
||||
dst.z = Clamp<DstFormat>(dst.z, 2);
|
||||
|
||||
// normalize
|
||||
dst.x = Normalize<DstFormat>(dst.x, 0);
|
||||
dst.y = Normalize<DstFormat>(dst.y, 1);
|
||||
dst.z = Normalize<DstFormat>(dst.z, 2);
|
||||
|
||||
// pack
|
||||
simdscalari packed = _simd_castps_si(dst.x);
|
||||
packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
|
||||
packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
|
||||
FormatTraits<DstFormat>::GetConstBPC(1)));
|
||||
|
||||
// pack low 16 bits of each 32 bit lane to low 128 bits of dst
|
||||
uint32_t *pPacked = (uint32_t*)&packed;
|
||||
uint16_t *pAosTile = (uint16_t*)&aosTile[0];
|
||||
for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
|
||||
{
|
||||
*pAosTile++ = *pPacked++;
|
||||
}
|
||||
|
||||
#endif
|
||||
// Store data into destination
|
||||
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
|
||||
}
|
||||
|
@ -546,7 +468,6 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
|
||||
|
||||
// clamp
|
||||
|
@ -579,46 +500,9 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
|
|||
|
||||
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
|
||||
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
|
||||
#else
|
||||
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
|
||||
|
||||
OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
|
||||
OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
|
||||
|
||||
// Convert from SrcFormat --> DstFormat
|
||||
simdvector src;
|
||||
LoadSOA<SrcFormat>(pSrc, src);
|
||||
StoreSOA<DstFormat>(src, soaTile);
|
||||
|
||||
// Convert from SOA --> AOS
|
||||
FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
|
||||
|
||||
// Store data into destination but don't overwrite the X8 bits
|
||||
// Each 4-pixel row is 16-bytes
|
||||
simd4scalari *pZRow01 = (simd4scalari*)aosTile;
|
||||
simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
|
||||
simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
|
||||
|
||||
simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
|
||||
simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
|
||||
|
||||
simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
|
||||
simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
|
||||
|
||||
simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
|
||||
|
||||
vDst0 = SIMD128::andnot_si(vMask, vDst0);
|
||||
vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
|
||||
vDst1 = SIMD128::andnot_si(vMask, vDst1);
|
||||
vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
|
||||
|
||||
SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
|
||||
SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template<SWR_FORMAT DstFormat>
|
||||
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
|
||||
{
|
||||
|
@ -689,7 +573,6 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
|
|||
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
|
||||
}
|
||||
|
||||
#endif
|
||||
template<SWR_FORMAT DstFormat>
|
||||
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
|
||||
{
|
||||
|
@ -790,7 +673,6 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
|
|||
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
|
||||
}
|
||||
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
template<SWR_FORMAT DstFormat>
|
||||
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
|
||||
{
|
||||
|
@ -854,7 +736,6 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8
|
|||
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
|
||||
}
|
||||
|
||||
#endif
|
||||
template<SWR_FORMAT DstFormat>
|
||||
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
|
||||
{
|
||||
|
@ -947,11 +828,7 @@ struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -961,11 +838,7 @@ struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -975,11 +848,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -989,11 +858,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1003,11 +868,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1017,11 +878,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1031,11 +888,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1045,11 +898,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
|
|||
template <size_t NumDests>
|
||||
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
|
||||
#else
|
||||
FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1069,7 +918,6 @@ struct StoreRasterTile
|
|||
uint32_t x, uint32_t y,
|
||||
float outputColor[4])
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
|
||||
|
||||
SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
|
||||
|
@ -1083,21 +931,6 @@ struct StoreRasterTile
|
|||
uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
|
||||
|
||||
pSimdTile->GetSwizzledColor(simdOffset, outputColor);
|
||||
#else
|
||||
typedef SimdTile<SrcFormat, DstFormat> SimdT;
|
||||
|
||||
SimdT* pSrcSimdTiles = (SimdT*)pSrc;
|
||||
|
||||
// Compute which simd tile we're accessing within 8x8 tile.
|
||||
// i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
|
||||
uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
|
||||
|
||||
SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
|
||||
|
||||
uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
|
||||
|
||||
pSimdTile->GetSwizzledColor(simdOffset, outputColor);
|
||||
#endif
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
@ -1230,7 +1063,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
|
|||
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
|
@ -1262,27 +1094,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
|
|||
ppDsts[2] += dy;
|
||||
ppDsts[3] += dy;
|
||||
}
|
||||
#else
|
||||
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
|
||||
{
|
||||
uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
|
||||
|
||||
for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
|
||||
{
|
||||
// Format conversion and convert from SOA to AOS, and store the rows.
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
|
||||
|
||||
ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
|
||||
}
|
||||
|
||||
ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
|
||||
ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1317,7 +1128,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat
|
|||
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
|
@ -1349,27 +1159,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat
|
|||
ppDsts[2] += dy;
|
||||
ppDsts[3] += dy;
|
||||
}
|
||||
#else
|
||||
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
|
||||
{
|
||||
uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
|
||||
|
||||
for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
|
||||
{
|
||||
// Format conversion and convert from SOA to AOS, and store the rows.
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
|
||||
|
||||
ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
|
||||
}
|
||||
|
||||
ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
|
||||
ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1404,7 +1193,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat
|
|||
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
|
@ -1436,27 +1224,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat
|
|||
ppDsts[2] += dy;
|
||||
ppDsts[3] += dy;
|
||||
}
|
||||
#else
|
||||
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
|
||||
{
|
||||
uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
|
||||
|
||||
for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
|
||||
{
|
||||
// Format conversion and convert from SOA to AOS, and store the rows.
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
|
||||
|
||||
ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
|
||||
}
|
||||
|
||||
ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
|
||||
ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1470,10 +1237,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat
|
|||
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
|
||||
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
|
||||
static const size_t MAX_DST_COLUMN_BYTES = 16;
|
||||
#if !USE_8x2_TILE_BACKEND
|
||||
static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
|
||||
static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Stores an 8x8 raster tile to the destination surface.
|
||||
|
@ -1496,7 +1259,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat
|
|||
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
|
||||
|
@ -1530,43 +1292,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat
|
|||
ppDsts[i] += dy;
|
||||
}
|
||||
}
|
||||
#else
|
||||
uint8_t* ppDsts[] =
|
||||
{
|
||||
pDst, // row 0, col 0
|
||||
pDst + pDstSurface->pitch, // row 1, col 0
|
||||
pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
|
||||
pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
|
||||
};
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
|
||||
{
|
||||
uint8_t* ppStartRows[] =
|
||||
{
|
||||
ppDsts[0],
|
||||
ppDsts[1],
|
||||
ppDsts[2],
|
||||
ppDsts[3],
|
||||
};
|
||||
|
||||
for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
|
||||
{
|
||||
// Format conversion and convert from SOA to AOS, and store the rows.
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
|
||||
ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
|
||||
pSrc += SRC_COLUMN_BYTES;
|
||||
}
|
||||
|
||||
ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
|
||||
ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
|
||||
ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
|
||||
ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1580,10 +1305,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma
|
|||
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
|
||||
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
|
||||
static const size_t MAX_DST_COLUMN_BYTES = 16;
|
||||
#if !USE_8x2_TILE_BACKEND
|
||||
static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
|
||||
static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Stores an 8x8 raster tile to the destination surface.
|
||||
|
@ -1606,7 +1327,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma
|
|||
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
|
||||
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
|
||||
const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
|
||||
|
@ -1648,51 +1368,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma
|
|||
ppDsts[i] += dy;
|
||||
}
|
||||
}
|
||||
#else
|
||||
struct DstPtrs
|
||||
{
|
||||
uint8_t* ppDsts[8];
|
||||
} ptrs;
|
||||
|
||||
// Need 8 pointers, 4 columns of 2 rows each
|
||||
for (uint32_t y = 0; y < 2; ++y)
|
||||
{
|
||||
for (uint32_t x = 0; x < 4; ++x)
|
||||
{
|
||||
ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
|
||||
{
|
||||
DstPtrs startPtrs = ptrs;
|
||||
|
||||
for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
|
||||
{
|
||||
// Format conversion and convert from SOA to AOS, and store the rows.
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
|
||||
|
||||
ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
|
||||
pSrc += SRC_COLUMN_BYTES;
|
||||
}
|
||||
|
||||
ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
|
||||
ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
|
||||
ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
|
||||
ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
|
||||
ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
|
||||
ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
|
||||
ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
|
||||
ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1728,7 +1403,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst
|
|||
|
||||
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
|
||||
// We can compute the offsets to each column within the raster tile once and increment from these.
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
@ -1758,32 +1432,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst
|
|||
ppDsts[2] += dy;
|
||||
ppDsts[3] += dy;
|
||||
}
|
||||
#else
|
||||
// There will be 8 4x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
||||
// Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
|
||||
uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
|
||||
|
||||
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
uint32_t rowOffset = row * DestRowWidthBytes;
|
||||
|
||||
uint8_t* pRow = pCol0 + rowOffset;
|
||||
uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
|
||||
ppDsts[0] += DestRowWidthBytes / 4;
|
||||
ppDsts[1] += DestRowWidthBytes / 4;
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1819,7 +1467,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds
|
|||
|
||||
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
|
||||
// We can compute the offsets to each column within the raster tile once and increment from these.
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
@ -1849,32 +1496,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds
|
|||
ppDsts[2] += dy;
|
||||
ppDsts[3] += dy;
|
||||
}
|
||||
#else
|
||||
// There will be 8 4x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
||||
// Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
|
||||
uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
|
||||
|
||||
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
uint32_t rowOffset = row * DestRowWidthBytes;
|
||||
|
||||
uint8_t* pRow = pCol0 + rowOffset;
|
||||
uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
|
||||
ppDsts[0] += DestRowWidthBytes / 2;
|
||||
ppDsts[1] += DestRowWidthBytes / 2;
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1911,7 +1532,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, Ds
|
|||
|
||||
// TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
|
||||
// We can compute the offsets to each column within the raster tile once and increment from these.
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
||||
|
@ -1945,28 +1565,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, Ds
|
|||
ppDsts[2] += dy;
|
||||
ppDsts[3] += dy;
|
||||
}
|
||||
#else
|
||||
uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
|
||||
{
|
||||
uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
|
||||
|
||||
uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
|
||||
// Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
|
||||
pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
|
||||
}
|
||||
|
||||
pRow0 += (DestRowWidthBytes * 2);
|
||||
pRow1 += (DestRowWidthBytes * 2);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2003,7 +1601,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, Ds
|
|||
|
||||
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
|
||||
// We can compute the offsets to each column within the raster tile once and increment from these.
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
@ -2034,32 +1631,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, Ds
|
|||
ppDsts[2] += dy;
|
||||
ppDsts[3] += dy;
|
||||
}
|
||||
#else
|
||||
// There will be 8 4x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
||||
// Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
|
||||
uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
|
||||
|
||||
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
uint32_t rowOffset = row * DestRowWidthBytes;
|
||||
|
||||
uint8_t* pRow = pCol0 + rowOffset;
|
||||
uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
|
||||
ppDsts[0] += DestColumnBytes;
|
||||
ppDsts[1] += DestColumnBytes;
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2096,7 +1667,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds
|
|||
|
||||
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
|
||||
// We can compute the offsets to each column within the raster tile once and increment from these.
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
@ -2131,40 +1701,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds
|
|||
ppDsts[i] += dy;
|
||||
}
|
||||
}
|
||||
#else
|
||||
// There will be 8 4x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
uint8_t* pCol1 = pCol0 + DestColumnBytes;
|
||||
|
||||
// There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
|
||||
// Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
|
||||
uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
|
||||
|
||||
// The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
uint32_t rowOffset = row * DestRowWidthBytes;
|
||||
uint8_t* ppDsts[] =
|
||||
{
|
||||
pCol0 + rowOffset,
|
||||
pCol0 + rowOffset + DestRowWidthBytes,
|
||||
pCol1 + rowOffset,
|
||||
pCol1 + rowOffset + DestRowWidthBytes,
|
||||
};
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
|
||||
ppDsts[0] += DestColumnBytes * 2;
|
||||
ppDsts[1] += DestColumnBytes * 2;
|
||||
ppDsts[2] += DestColumnBytes * 2;
|
||||
ppDsts[3] += DestColumnBytes * 2;
|
||||
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
|
||||
pSrc += pSrcInc;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2175,22 +1711,8 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
|
|||
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
|
||||
{
|
||||
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
|
||||
|
||||
#else
|
||||
static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
|
||||
static const size_t TILE_Y_ROWS = 32;
|
||||
static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
|
||||
|
||||
static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
|
||||
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
|
||||
static const size_t MAX_DST_COLUMN_BYTES = 16;
|
||||
|
||||
static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
|
||||
static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Stores an 8x8 raster tile to the destination surface.
|
||||
/// @param pSrc - Pointer to raster tile.
|
||||
|
@ -2201,10 +1723,8 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
|
|||
SWR_SURFACE_STATE* pDstSurface,
|
||||
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
|
||||
{
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
static const uint32_t DestRowWidthBytes = 16; // 16B rows
|
||||
static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
|
||||
#endif
|
||||
|
||||
// Punt non-full tiles to generic store
|
||||
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
|
||||
|
@ -2217,7 +1737,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
|
|||
|
||||
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
|
||||
// We can compute the offsets to each column within the raster tile once and increment from these.
|
||||
#if USE_8x2_TILE_BACKEND
|
||||
// There will be 4 8x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
|
@ -2260,54 +1779,6 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
|
|||
ppDsts[i] += dy;
|
||||
}
|
||||
}
|
||||
#else
|
||||
// There will be 8 4x2 simd tiles in an 8x8 raster tile.
|
||||
uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
|
||||
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
|
||||
struct DstPtrs
|
||||
{
|
||||
uint8_t* ppDsts[8];
|
||||
} ptrs;
|
||||
|
||||
// Need 8 pointers, 4 columns of 2 rows each
|
||||
for (uint32_t y = 0; y < 2; ++y)
|
||||
{
|
||||
for (uint32_t x = 0; x < 4; ++x)
|
||||
{
|
||||
ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
|
||||
{
|
||||
DstPtrs startPtrs = ptrs;
|
||||
|
||||
for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
|
||||
{
|
||||
// Format conversion and convert from SOA to AOS, and store the rows.
|
||||
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
|
||||
|
||||
ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
|
||||
ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
|
||||
pSrc += SRC_COLUMN_BYTES;
|
||||
}
|
||||
|
||||
ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue