swr: [rasterizer core] Refactor/cleanup backends
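Move the setup code duplicated across the backends into shared helpers (SetupBarycentricCoeffs, SetupRenderBuffers, SetupPixelShaderContext, CalcCentroid) for code reuse and simplification.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>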
This commit is contained in:
parent
78a0a09e48
commit
06f93d0329
|
@ -451,134 +451,95 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
|
||||||
AR_BEGIN(BESingleSampleBackend, pDC->drawId);
|
AR_BEGIN(BESingleSampleBackend, pDC->drawId);
|
||||||
AR_BEGIN(BESetup, pDC->drawId);
|
AR_BEGIN(BESetup, pDC->drawId);
|
||||||
|
|
||||||
const API_STATE& state = GetApiState(pDC);
|
const API_STATE &state = GetApiState(pDC);
|
||||||
const SWR_RASTSTATE& rastState = state.rastState;
|
|
||||||
const SWR_PS_STATE *pPSState = &state.psState;
|
|
||||||
const SWR_BLEND_STATE *pBlendState = &state.blendState;
|
|
||||||
uint64_t coverageMask = work.coverageMask[0];
|
|
||||||
|
|
||||||
// broadcast scalars
|
|
||||||
BarycentricCoeffs coeffs;
|
BarycentricCoeffs coeffs;
|
||||||
coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
|
SetupBarycentricCoeffs(&coeffs, work);
|
||||||
coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
|
|
||||||
coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
|
|
||||||
|
|
||||||
coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
|
uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
|
||||||
coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
|
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
|
||||||
coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
|
|
||||||
|
|
||||||
coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
|
|
||||||
coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
|
|
||||||
coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
|
|
||||||
|
|
||||||
coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
|
|
||||||
|
|
||||||
coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
|
|
||||||
coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
|
|
||||||
coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
|
|
||||||
|
|
||||||
uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
|
|
||||||
uint32_t NumRT = state.psState.numRenderTargets;
|
|
||||||
for(uint32_t rt = 0; rt < NumRT; ++rt)
|
|
||||||
{
|
|
||||||
pColorBase[rt] = renderBuffers.pColor[rt];
|
|
||||||
}
|
|
||||||
uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
|
|
||||||
AR_END(BESetup, 1);
|
|
||||||
|
|
||||||
SWR_PS_CONTEXT psContext;
|
SWR_PS_CONTEXT psContext;
|
||||||
psContext.pAttribs = work.pAttribs;
|
SetupPixelShaderContext<T>(&psContext, work);
|
||||||
psContext.pPerspAttribs = work.pPerspAttribs;
|
|
||||||
psContext.frontFace = work.triFlags.frontFacing;
|
|
||||||
psContext.primID = work.triFlags.primID;
|
|
||||||
|
|
||||||
// save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
|
AR_END(BESetup, 1);
|
||||||
psContext.I = work.I;
|
|
||||||
psContext.J = work.J;
|
|
||||||
psContext.recipDet = work.recipDet;
|
|
||||||
psContext.pRecipW = work.pRecipW;
|
|
||||||
psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
|
|
||||||
psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
|
|
||||||
psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
|
|
||||||
|
|
||||||
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||||
|
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||||
|
|
||||||
|
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
|
||||||
|
|
||||||
|
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
||||||
{
|
{
|
||||||
// UL pixel corner
|
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||||
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
|
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||||
// pixel center
|
|
||||||
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
|
|
||||||
|
|
||||||
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
|
||||||
|
|
||||||
|
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||||
{
|
{
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
if(coverageMask & MASK)
|
simdmask coverageMask = work.coverageMask[0] & MASK;
|
||||||
|
|
||||||
|
if (coverageMask)
|
||||||
{
|
{
|
||||||
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
|
|
||||||
// pixel center
|
|
||||||
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
|
|
||||||
|
|
||||||
if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
|
||||||
{
|
|
||||||
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
|
|
||||||
&work.coverageMask[0];
|
|
||||||
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
|
|
||||||
}
|
|
||||||
|
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
|
||||||
CalcPixelBarycentrics(coeffs, psContext);
|
|
||||||
|
|
||||||
// for 1x case, centroid is pixel center
|
|
||||||
psContext.vX.centroid = psContext.vX.center;
|
|
||||||
psContext.vY.centroid = psContext.vY.center;
|
|
||||||
psContext.vI.centroid = psContext.vI.center;
|
|
||||||
psContext.vJ.centroid = psContext.vJ.center;
|
|
||||||
psContext.vOneOverW.centroid = psContext.vOneOverW.center;
|
|
||||||
|
|
||||||
// interpolate and quantize z
|
|
||||||
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
|
|
||||||
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
|
|
||||||
AR_END(BEBarycentric, 1);
|
|
||||||
|
|
||||||
simdmask clipCoverageMask = coverageMask & MASK;
|
|
||||||
// interpolate user clip distance if available
|
|
||||||
if (rastState.clipDistanceMask)
|
|
||||||
{
|
|
||||||
clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
|
|
||||||
psContext.vI.center, psContext.vJ.center);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
||||||
{
|
{
|
||||||
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
|
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
|
||||||
|
|
||||||
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBase));
|
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
|
||||||
|
|
||||||
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
||||||
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
||||||
|
|
||||||
clipCoverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
||||||
}
|
}
|
||||||
|
|
||||||
simdscalar vCoverageMask = vMask(clipCoverageMask);
|
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
||||||
|
{
|
||||||
|
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
|
||||||
|
|
||||||
|
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
|
||||||
|
}
|
||||||
|
|
||||||
|
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||||
|
|
||||||
|
CalcPixelBarycentrics(coeffs, psContext);
|
||||||
|
|
||||||
|
CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
|
||||||
|
|
||||||
|
// interpolate and quantize z
|
||||||
|
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
|
||||||
|
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
|
||||||
|
|
||||||
|
AR_END(BEBarycentric, 1);
|
||||||
|
|
||||||
|
// interpolate user clip distance if available
|
||||||
|
if (state.rastState.clipDistanceMask)
|
||||||
|
{
|
||||||
|
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
|
||||||
|
}
|
||||||
|
|
||||||
|
simdscalar vCoverageMask = vMask(coverageMask);
|
||||||
simdscalar depthPassMask = vCoverageMask;
|
simdscalar depthPassMask = vCoverageMask;
|
||||||
simdscalar stencilPassMask = vCoverageMask;
|
simdscalar stencilPassMask = vCoverageMask;
|
||||||
|
|
||||||
// Early-Z?
|
// Early-Z?
|
||||||
if(T::bCanEarlyZ)
|
if (T::bCanEarlyZ)
|
||||||
{
|
{
|
||||||
AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
|
AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
|
||||||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
|
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
|
||||||
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
|
psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
|
||||||
AR_END(BEEarlyDepthTest, 0);
|
AR_END(BEEarlyDepthTest, 0);
|
||||||
|
|
||||||
// early-exit if no pixels passed depth or earlyZ is forced on
|
// early-exit if no pixels passed depth or earlyZ is forced on
|
||||||
if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
|
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
|
||||||
{
|
{
|
||||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
||||||
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
|
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
|
||||||
|
|
||||||
if (!_simd_movemask_ps(depthPassMask))
|
if (!_simd_movemask_ps(depthPassMask))
|
||||||
{
|
{
|
||||||
|
@ -599,18 +560,18 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
|
||||||
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
|
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
|
||||||
|
|
||||||
// late-Z
|
// late-Z
|
||||||
if(!T::bCanEarlyZ)
|
if (!T::bCanEarlyZ)
|
||||||
{
|
{
|
||||||
AR_BEGIN(BELateDepthTest, pDC->drawId);
|
AR_BEGIN(BELateDepthTest, pDC->drawId);
|
||||||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
|
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
|
||||||
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
|
psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
|
||||||
AR_END(BELateDepthTest, 0);
|
AR_END(BELateDepthTest, 0);
|
||||||
|
|
||||||
if(!_simd_movemask_ps(depthPassMask))
|
if (!_simd_movemask_ps(depthPassMask))
|
||||||
{
|
{
|
||||||
// need to call depth/stencil write for stencil write
|
// need to call depth/stencil write for stencil write
|
||||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
||||||
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
|
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
|
||||||
goto Endtile;
|
goto Endtile;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -622,47 +583,56 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
|
||||||
// output merger
|
// output merger
|
||||||
AR_BEGIN(BEOutputMerger, pDC->drawId);
|
AR_BEGIN(BEOutputMerger, pDC->drawId);
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets, useAlternateOffset);
|
OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
|
||||||
#else
|
#else
|
||||||
OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
|
OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// do final depth write after all pixel kills
|
// do final depth write after all pixel kills
|
||||||
if (!pPSState->forceEarlyZ)
|
if (!state.psState.forceEarlyZ)
|
||||||
{
|
{
|
||||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
||||||
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
|
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
|
||||||
}
|
}
|
||||||
AR_END(BEOutputMerger, 0);
|
AR_END(BEOutputMerger, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
Endtile:
|
Endtile:
|
||||||
AR_BEGIN(BEEndTile, pDC->drawId);
|
AR_BEGIN(BEEndTile, pDC->drawId);
|
||||||
coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
|
||||||
|
work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||||
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
||||||
{
|
{
|
||||||
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||||
}
|
}
|
||||||
pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
|
||||||
pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
|
||||||
|
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
if (useAlternateOffset)
|
if (useAlternateOffset)
|
||||||
{
|
{
|
||||||
for (uint32_t rt = 0; rt < NumRT; ++rt)
|
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
|
||||||
{
|
{
|
||||||
pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for (uint32_t rt = 0; rt < NumRT; ++rt)
|
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
|
||||||
{
|
{
|
||||||
pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
|
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
|
|
||||||
AR_END(BEEndTile, 0);
|
AR_END(BEEndTile, 0);
|
||||||
|
|
||||||
|
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
|
||||||
|
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
|
||||||
|
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
|
||||||
}
|
}
|
||||||
|
|
||||||
AR_END(BESingleSampleBackend, 0);
|
AR_END(BESingleSampleBackend, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -674,132 +644,61 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
|
||||||
AR_BEGIN(BESampleRateBackend, pDC->drawId);
|
AR_BEGIN(BESampleRateBackend, pDC->drawId);
|
||||||
AR_BEGIN(BESetup, pDC->drawId);
|
AR_BEGIN(BESetup, pDC->drawId);
|
||||||
|
|
||||||
const API_STATE& state = GetApiState(pDC);
|
const API_STATE &state = GetApiState(pDC);
|
||||||
const SWR_RASTSTATE& rastState = state.rastState;
|
|
||||||
const SWR_PS_STATE *pPSState = &state.psState;
|
|
||||||
const SWR_BLEND_STATE *pBlendState = &state.blendState;
|
|
||||||
|
|
||||||
// broadcast scalars
|
|
||||||
BarycentricCoeffs coeffs;
|
BarycentricCoeffs coeffs;
|
||||||
coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
|
SetupBarycentricCoeffs(&coeffs, work);
|
||||||
coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
|
|
||||||
coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
|
|
||||||
|
|
||||||
coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
|
uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
|
||||||
coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
|
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
|
||||||
coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
|
|
||||||
|
|
||||||
coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
|
|
||||||
coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
|
|
||||||
coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
|
|
||||||
|
|
||||||
coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
|
|
||||||
|
|
||||||
coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
|
|
||||||
coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
|
|
||||||
coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
|
|
||||||
|
|
||||||
uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
|
|
||||||
uint32_t NumRT = state.psState.numRenderTargets;
|
|
||||||
for(uint32_t rt = 0; rt < NumRT; ++rt)
|
|
||||||
{
|
|
||||||
pColorBase[rt] = renderBuffers.pColor[rt];
|
|
||||||
}
|
|
||||||
uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
|
|
||||||
AR_END(BESetup, 0);
|
|
||||||
|
|
||||||
SWR_PS_CONTEXT psContext;
|
SWR_PS_CONTEXT psContext;
|
||||||
psContext.pAttribs = work.pAttribs;
|
SetupPixelShaderContext<T>(&psContext, work);
|
||||||
psContext.pPerspAttribs = work.pPerspAttribs;
|
|
||||||
psContext.pRecipW = work.pRecipW;
|
|
||||||
psContext.frontFace = work.triFlags.frontFacing;
|
|
||||||
psContext.primID = work.triFlags.primID;
|
|
||||||
|
|
||||||
// save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
|
AR_END(BESetup, 0);
|
||||||
psContext.I = work.I;
|
|
||||||
psContext.J = work.J;
|
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||||
psContext.recipDet = work.recipDet;
|
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||||
psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
|
|
||||||
psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
|
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
|
||||||
psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
|
|
||||||
|
|
||||||
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
||||||
{
|
{
|
||||||
// UL pixel corner
|
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||||
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
|
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||||
// pixel center
|
|
||||||
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
|
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
|
||||||
|
|
||||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||||
{
|
{
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
|
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
||||||
// pixel center
|
{
|
||||||
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
|
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
|
||||||
|
|
||||||
|
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
|
||||||
|
}
|
||||||
|
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||||
|
|
||||||
CalcPixelBarycentrics(coeffs, psContext);
|
CalcPixelBarycentrics(coeffs, psContext);
|
||||||
|
|
||||||
|
CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
|
||||||
|
|
||||||
AR_END(BEBarycentric, 0);
|
AR_END(BEBarycentric, 0);
|
||||||
|
|
||||||
if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
|
||||||
{
|
|
||||||
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
|
|
||||||
&work.coverageMask[0];
|
|
||||||
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(T::bCentroidPos)
|
|
||||||
{
|
|
||||||
///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
|
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
|
||||||
if(T::bIsStandardPattern)
|
|
||||||
{
|
|
||||||
CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
|
|
||||||
psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
|
|
||||||
}
|
|
||||||
CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
|
|
||||||
AR_END(BEBarycentric, 0);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
psContext.vX.centroid = psContext.vX.sample;
|
|
||||||
psContext.vY.centroid = psContext.vY.sample;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
|
|
||||||
{
|
{
|
||||||
simdmask coverageMask = work.coverageMask[sample] & MASK;
|
simdmask coverageMask = work.coverageMask[sample] & MASK;
|
||||||
|
|
||||||
if (coverageMask)
|
if (coverageMask)
|
||||||
{
|
{
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
|
||||||
// calculate per sample positions
|
|
||||||
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
|
|
||||||
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
|
|
||||||
|
|
||||||
CalcSampleBarycentrics(coeffs, psContext);
|
|
||||||
|
|
||||||
// interpolate and quantize z
|
|
||||||
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
|
|
||||||
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
|
|
||||||
AR_END(BEBarycentric, 0);
|
|
||||||
|
|
||||||
// interpolate user clip distance if available
|
|
||||||
if (rastState.clipDistanceMask)
|
|
||||||
{
|
|
||||||
coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
|
|
||||||
psContext.vI.sample, psContext.vJ.sample);
|
|
||||||
}
|
|
||||||
|
|
||||||
// offset depth/stencil buffers current sample
|
// offset depth/stencil buffers current sample
|
||||||
uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
|
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
|
||||||
uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
|
uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
|
||||||
|
|
||||||
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
||||||
{
|
{
|
||||||
|
@ -813,6 +712,26 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
|
||||||
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||||
|
|
||||||
|
// calculate per sample positions
|
||||||
|
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
|
||||||
|
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
|
||||||
|
|
||||||
|
CalcSampleBarycentrics(coeffs, psContext);
|
||||||
|
|
||||||
|
// interpolate and quantize z
|
||||||
|
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
|
||||||
|
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
|
||||||
|
|
||||||
|
AR_END(BEBarycentric, 0);
|
||||||
|
|
||||||
|
// interpolate user clip distance if available
|
||||||
|
if (state.rastState.clipDistanceMask)
|
||||||
|
{
|
||||||
|
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
|
||||||
|
}
|
||||||
|
|
||||||
simdscalar vCoverageMask = vMask(coverageMask);
|
simdscalar vCoverageMask = vMask(coverageMask);
|
||||||
simdscalar depthPassMask = vCoverageMask;
|
simdscalar depthPassMask = vCoverageMask;
|
||||||
simdscalar stencilPassMask = vCoverageMask;
|
simdscalar stencilPassMask = vCoverageMask;
|
||||||
|
@ -826,7 +745,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
|
||||||
AR_END(BEEarlyDepthTest, 0);
|
AR_END(BEEarlyDepthTest, 0);
|
||||||
|
|
||||||
// early-exit if no samples passed depth or earlyZ is forced on.
|
// early-exit if no samples passed depth or earlyZ is forced on.
|
||||||
if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
|
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
|
||||||
{
|
{
|
||||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
||||||
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
|
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
|
||||||
|
@ -876,13 +795,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
|
||||||
// output merger
|
// output merger
|
||||||
AR_BEGIN(BEOutputMerger, pDC->drawId);
|
AR_BEGIN(BEOutputMerger, pDC->drawId);
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets, useAlternateOffset);
|
OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
|
||||||
#else
|
#else
|
||||||
OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
|
OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// do final depth write after all pixel kills
|
// do final depth write after all pixel kills
|
||||||
if (!pPSState->forceEarlyZ)
|
if (!state.psState.forceEarlyZ)
|
||||||
{
|
{
|
||||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
|
||||||
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
|
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
|
||||||
|
@ -894,31 +813,41 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
|
||||||
|
|
||||||
Endtile:
|
Endtile:
|
||||||
ATTR_UNUSED;
|
ATTR_UNUSED;
|
||||||
|
|
||||||
AR_BEGIN(BEEndTile, pDC->drawId);
|
AR_BEGIN(BEEndTile, pDC->drawId);
|
||||||
|
|
||||||
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
||||||
{
|
{
|
||||||
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||||
}
|
}
|
||||||
pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
|
||||||
pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
|
||||||
|
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
if (useAlternateOffset)
|
if (useAlternateOffset)
|
||||||
{
|
{
|
||||||
for (uint32_t rt = 0; rt < NumRT; ++rt)
|
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
|
||||||
{
|
{
|
||||||
pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for (uint32_t rt = 0; rt < NumRT; ++rt)
|
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
|
||||||
{
|
{
|
||||||
pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
|
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
|
|
||||||
AR_END(BEEndTile, 0);
|
AR_END(BEEndTile, 0);
|
||||||
|
|
||||||
|
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
|
||||||
|
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
|
||||||
|
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
|
||||||
}
|
}
|
||||||
|
|
||||||
AR_END(BESampleRateBackend, 0);
|
AR_END(BESampleRateBackend, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -930,62 +859,33 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
|
||||||
AR_BEGIN(BEPixelRateBackend, pDC->drawId);
|
AR_BEGIN(BEPixelRateBackend, pDC->drawId);
|
||||||
AR_BEGIN(BESetup, pDC->drawId);
|
AR_BEGIN(BESetup, pDC->drawId);
|
||||||
|
|
||||||
const API_STATE& state = GetApiState(pDC);
|
const API_STATE &state = GetApiState(pDC);
|
||||||
const SWR_RASTSTATE& rastState = state.rastState;
|
|
||||||
const SWR_PS_STATE *pPSState = &state.psState;
|
|
||||||
const SWR_BLEND_STATE *pBlendState = &state.blendState;
|
|
||||||
|
|
||||||
// broadcast scalars
|
|
||||||
BarycentricCoeffs coeffs;
|
BarycentricCoeffs coeffs;
|
||||||
coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
|
SetupBarycentricCoeffs(&coeffs, work);
|
||||||
coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
|
|
||||||
coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
|
|
||||||
|
|
||||||
coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
|
uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
|
||||||
coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
|
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
|
||||||
coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
|
|
||||||
|
|
||||||
coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
|
|
||||||
coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
|
|
||||||
coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
|
|
||||||
|
|
||||||
coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
|
|
||||||
|
|
||||||
coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
|
|
||||||
coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
|
|
||||||
coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
|
|
||||||
|
|
||||||
uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
|
|
||||||
uint32_t NumRT = state.psState.numRenderTargets;
|
|
||||||
for(uint32_t rt = 0; rt < NumRT; ++rt)
|
|
||||||
{
|
|
||||||
pColorBase[rt] = renderBuffers.pColor[rt];
|
|
||||||
}
|
|
||||||
uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
|
|
||||||
AR_END(BESetup, 0);
|
|
||||||
|
|
||||||
SWR_PS_CONTEXT psContext;
|
SWR_PS_CONTEXT psContext;
|
||||||
psContext.pAttribs = work.pAttribs;
|
SetupPixelShaderContext<T>(&psContext, work);
|
||||||
psContext.pPerspAttribs = work.pPerspAttribs;
|
|
||||||
psContext.frontFace = work.triFlags.frontFacing;
|
|
||||||
psContext.primID = work.triFlags.primID;
|
|
||||||
psContext.pRecipW = work.pRecipW;
|
|
||||||
// save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
|
|
||||||
psContext.I = work.I;
|
|
||||||
psContext.J = work.J;
|
|
||||||
psContext.recipDet = work.recipDet;
|
|
||||||
psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
|
|
||||||
psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
|
|
||||||
psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
|
|
||||||
|
|
||||||
psContext.sampleIndex = 0;
|
AR_END(BESetup, 0);
|
||||||
|
|
||||||
PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
|
PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
|
||||||
|
|
||||||
|
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||||
|
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||||
|
|
||||||
|
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
|
||||||
|
|
||||||
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
||||||
{
|
{
|
||||||
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
|
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||||
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
|
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||||
|
|
||||||
|
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
|
||||||
|
|
||||||
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||||
{
|
{
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
|
@ -996,48 +896,25 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
|
||||||
if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
|
if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
|
||||||
activeLanes = vMask(work.anyCoveredSamples & MASK);
|
activeLanes = vMask(work.anyCoveredSamples & MASK);
|
||||||
|
|
||||||
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
|
|
||||||
// set pixel center positions
|
|
||||||
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
|
|
||||||
|
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
|
||||||
CalcPixelBarycentrics(coeffs, psContext);
|
|
||||||
AR_END(BEBarycentric, 0);
|
|
||||||
|
|
||||||
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
||||||
{
|
{
|
||||||
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
|
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
|
||||||
&work.coverageMask[0];
|
|
||||||
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
|
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(T::bCentroidPos)
|
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||||
{
|
|
||||||
///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
|
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
|
||||||
if(T::bIsStandardPattern)
|
|
||||||
{
|
|
||||||
CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
|
|
||||||
psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
|
|
||||||
}
|
|
||||||
|
|
||||||
CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
|
CalcPixelBarycentrics(coeffs, psContext);
|
||||||
AR_END(BEBarycentric, 0);
|
|
||||||
}
|
CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
|
||||||
else
|
|
||||||
{
|
AR_END(BEBarycentric, 0);
|
||||||
psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
|
|
||||||
psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
|
|
||||||
}
|
|
||||||
|
|
||||||
if(T::bForcedSampleCount)
|
if(T::bForcedSampleCount)
|
||||||
{
|
{
|
||||||
// candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
|
// candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
|
||||||
const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
|
const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
|
||||||
activeLanes = _simd_and_ps(activeLanes, vSampleMask);
|
activeLanes = _simd_and_ps(activeLanes, vSampleMask);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1051,7 +928,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
|
||||||
// if we have no covered samples that passed depth at this point, go to next tile
|
// if we have no covered samples that passed depth at this point, go to next tile
|
||||||
if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
|
if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
|
||||||
|
|
||||||
if(pPSState->usesSourceDepth)
|
if(state.psState.usesSourceDepth)
|
||||||
{
|
{
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||||
// interpolate and quantize z
|
// interpolate and quantize z
|
||||||
|
@ -1086,7 +963,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
|
||||||
|
|
||||||
// output merger
|
// output merger
|
||||||
// loop over all samples, broadcasting the results of the PS to all passing pixels
|
// loop over all samples, broadcasting the results of the PS to all passing pixels
|
||||||
for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
|
for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
|
||||||
{
|
{
|
||||||
AR_BEGIN(BEOutputMerger, pDC->drawId);
|
AR_BEGIN(BEOutputMerger, pDC->drawId);
|
||||||
// center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
|
// center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
|
||||||
|
@ -1110,15 +987,15 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
|
||||||
|
|
||||||
// broadcast the results of the PS to all passing pixels
|
// broadcast the results of the PS to all passing pixels
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets, useAlternateOffset);
|
OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, useAlternateOffset);
|
||||||
#else
|
#else
|
||||||
OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
|
OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
|
if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
|
||||||
{
|
{
|
||||||
uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
|
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
|
||||||
uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
|
uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
|
||||||
|
|
||||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
|
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
|
||||||
pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
|
pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
|
||||||
|
@ -1127,6 +1004,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
|
||||||
}
|
}
|
||||||
Endtile:
|
Endtile:
|
||||||
AR_BEGIN(BEEndTile, pDC->drawId);
|
AR_BEGIN(BEEndTile, pDC->drawId);
|
||||||
|
|
||||||
for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
|
for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
|
||||||
{
|
{
|
||||||
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||||
|
@ -1137,26 +1015,34 @@ Endtile:
|
||||||
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||||
}
|
}
|
||||||
work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||||
pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
|
||||||
pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
|
||||||
|
|
||||||
#if USE_8x2_TILE_BACKEND
|
#if USE_8x2_TILE_BACKEND
|
||||||
if (useAlternateOffset)
|
if (useAlternateOffset)
|
||||||
{
|
{
|
||||||
for (uint32_t rt = 0; rt < NumRT; ++rt)
|
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
|
||||||
{
|
{
|
||||||
pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for(uint32_t rt = 0; rt < NumRT; ++rt)
|
for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
|
||||||
{
|
{
|
||||||
pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
}
|
}
|
||||||
|
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
|
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
AR_END(BEEndTile, 0);
|
AR_END(BEEndTile, 0);
|
||||||
|
|
||||||
|
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
|
||||||
|
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
|
||||||
|
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
|
||||||
}
|
}
|
||||||
|
|
||||||
AR_END(BEPixelRateBackend, 0);
|
AR_END(BEPixelRateBackend, 0);
|
||||||
}
|
}
|
||||||
// optimized backend flow with NULL PS
|
// optimized backend flow with NULL PS
|
||||||
|
@ -1170,50 +1056,60 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
|
||||||
typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
|
typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
|
||||||
AR_BEGIN(BESetup, pDC->drawId);
|
AR_BEGIN(BESetup, pDC->drawId);
|
||||||
|
|
||||||
const API_STATE& state = GetApiState(pDC);
|
const API_STATE &state = GetApiState(pDC);
|
||||||
const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
|
|
||||||
|
|
||||||
// broadcast scalars
|
|
||||||
BarycentricCoeffs coeffs;
|
BarycentricCoeffs coeffs;
|
||||||
coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
|
SetupBarycentricCoeffs(&coeffs, work);
|
||||||
coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
|
|
||||||
coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
|
|
||||||
|
|
||||||
coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
|
uint8_t *pDepthBuffer, *pStencilBuffer;
|
||||||
coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
|
SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
|
||||||
coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
|
|
||||||
|
|
||||||
coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
|
SWR_PS_CONTEXT psContext;
|
||||||
coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
|
// skip SetupPixelShaderContext(&psContext, ...); // not needed here
|
||||||
coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
|
|
||||||
|
|
||||||
coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
|
|
||||||
|
|
||||||
uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
|
|
||||||
|
|
||||||
AR_END(BESetup, 0);
|
AR_END(BESetup, 0);
|
||||||
|
|
||||||
SWR_PS_CONTEXT psContext;
|
simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||||
|
|
||||||
|
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
|
||||||
|
|
||||||
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
||||||
{
|
{
|
||||||
// UL pixel corner
|
simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||||
simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
|
|
||||||
|
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
|
||||||
|
|
||||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||||
{
|
{
|
||||||
// UL pixel corners
|
|
||||||
simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
|
|
||||||
|
|
||||||
// iterate over active samples
|
// iterate over active samples
|
||||||
unsigned long sample = 0;
|
unsigned long sample = 0;
|
||||||
uint32_t sampleMask = state.blendState.sampleMask;
|
uint32_t sampleMask = state.blendState.sampleMask;
|
||||||
while (_BitScanForward(&sample, sampleMask))
|
while (_BitScanForward(&sample, sampleMask))
|
||||||
{
|
{
|
||||||
sampleMask &= ~(1 << sample);
|
sampleMask &= ~(1 << sample);
|
||||||
|
|
||||||
simdmask coverageMask = work.coverageMask[sample] & MASK;
|
simdmask coverageMask = work.coverageMask[sample] & MASK;
|
||||||
|
|
||||||
if (coverageMask)
|
if (coverageMask)
|
||||||
{
|
{
|
||||||
|
// offset depth/stencil buffers current sample
|
||||||
|
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
|
||||||
|
uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
|
||||||
|
|
||||||
|
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
||||||
|
{
|
||||||
|
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
|
||||||
|
|
||||||
|
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
|
||||||
|
|
||||||
|
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
||||||
|
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
||||||
|
|
||||||
|
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
||||||
|
}
|
||||||
|
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||||
|
|
||||||
// calculate per sample positions
|
// calculate per sample positions
|
||||||
psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
|
psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
|
||||||
psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
|
psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
|
||||||
|
@ -1227,26 +1123,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
|
||||||
AR_END(BEBarycentric, 0);
|
AR_END(BEBarycentric, 0);
|
||||||
|
|
||||||
// interpolate user clip distance if available
|
// interpolate user clip distance if available
|
||||||
if (rastState.clipDistanceMask)
|
if (state.rastState.clipDistanceMask)
|
||||||
{
|
{
|
||||||
coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
|
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
|
||||||
psContext.vI.sample, psContext.vJ.sample);
|
|
||||||
}
|
|
||||||
|
|
||||||
// offset depth/stencil buffers current sample
|
|
||||||
uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
|
|
||||||
uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
|
|
||||||
|
|
||||||
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
|
||||||
{
|
|
||||||
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
|
|
||||||
|
|
||||||
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
|
|
||||||
|
|
||||||
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
|
||||||
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
|
||||||
|
|
||||||
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
simdscalar vCoverageMask = vMask(coverageMask);
|
simdscalar vCoverageMask = vMask(coverageMask);
|
||||||
|
@ -1268,10 +1147,16 @@ Endtile:
|
||||||
ATTR_UNUSED;
|
ATTR_UNUSED;
|
||||||
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||||
}
|
}
|
||||||
pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
|
||||||
pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
|
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||||
|
|
||||||
|
vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
|
||||||
}
|
}
|
||||||
|
|
||||||
AR_END(BENullBackend, 0);
|
AR_END(BENullBackend, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -438,13 +438,117 @@ INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// @brief Fill a BarycentricCoeffs block by broadcasting the scalar terms of a
///        triangle descriptor across SIMD registers.
/// @param coeffs - destination block; vIa..vIc, vJa..vJc, vZa..vZc, vRecipDet and
///                 vAOneOverW..vCOneOverW are all overwritten
/// @param work   - triangle descriptor supplying I, J, Z, OneOverW and recipDet
inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
{
    BarycentricCoeffs &dst = *coeffs;

    // base addresses of the three-element scalar groups in the descriptor
    const float *pI        = &work.I[0];
    const float *pJ        = &work.J[0];
    const float *pZ        = &work.Z[0];
    const float *pOneOverW = &work.OneOverW[0];

    // I terms
    dst.vIa = _simd_broadcast_ss(pI + 0);
    dst.vIb = _simd_broadcast_ss(pI + 1);
    dst.vIc = _simd_broadcast_ss(pI + 2);

    // J terms
    dst.vJa = _simd_broadcast_ss(pJ + 0);
    dst.vJb = _simd_broadcast_ss(pJ + 1);
    dst.vJc = _simd_broadcast_ss(pJ + 2);

    // Z terms
    dst.vZa = _simd_broadcast_ss(pZ + 0);
    dst.vZb = _simd_broadcast_ss(pZ + 1);
    dst.vZc = _simd_broadcast_ss(pZ + 2);

    // reciprocal determinant
    dst.vRecipDet = _simd_broadcast_ss(&work.recipDet);

    // 1/w terms, one per A/B/C slot
    dst.vAOneOverW = _simd_broadcast_ss(pOneOverW + 0);
    dst.vBOneOverW = _simd_broadcast_ss(pOneOverW + 1);
    dst.vCOneOverW = _simd_broadcast_ss(pOneOverW + 2);
}
|
||||||
|
|
||||||
|
/// @brief Copy the color/depth/stencil pointers held in renderBuffers into
///        caller-provided locals.
/// @param pColorBuffer     - receives the first colorBufferCount entries of
///                           renderBuffers.pColor; may be null, in which case no
///                           color pointers are copied
/// @param pDepthBuffer     - receives renderBuffers.pDepth; may be null to skip
/// @param pStencilBuffer   - receives renderBuffers.pStencil; may be null to skip
/// @param colorBufferCount - number of color pointers to copy; asserted to be
///                           no larger than SWR_NUM_RENDERTARGETS
/// @param renderBuffers    - source of the pointers; only read here
inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
{
    SWR_ASSERT(colorBufferCount <= SWR_NUM_RENDERTARGETS);

    // every output is optional so that callers needing only a subset
    // (e.g. depth/stencil without color) can pass null for the rest
    if (pColorBuffer)
    {
        for (uint32_t index = 0; index < colorBufferCount; index += 1)
        {
            pColorBuffer[index] = renderBuffers.pColor[index];
        }
    }

    if (pDepthBuffer)
    {
        *pDepthBuffer = renderBuffers.pDepth;
    }

    if (pStencilBuffer)
    {
        // the stray second ';' (an empty statement) has been removed here
        *pStencilBuffer = renderBuffers.pStencil;
    }
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC &work)
|
||||||
|
{
|
||||||
|
psContext->pAttribs = work.pAttribs;
|
||||||
|
psContext->pPerspAttribs = work.pPerspAttribs;
|
||||||
|
psContext->frontFace = work.triFlags.frontFacing;
|
||||||
|
psContext->primID = work.triFlags.primID;
|
||||||
|
|
||||||
|
// save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
|
||||||
|
psContext->I = work.I;
|
||||||
|
psContext->J = work.J;
|
||||||
|
|
||||||
|
psContext->recipDet = work.recipDet;
|
||||||
|
psContext->pRecipW = work.pRecipW;
|
||||||
|
psContext->pSamplePosX = reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
|
||||||
|
psContext->pSamplePosY = reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
|
||||||
|
psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
|
||||||
|
psContext->sampleIndex = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T, bool IsSingleSample>
|
||||||
|
void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
|
||||||
|
{
|
||||||
|
if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
|
||||||
|
{
|
||||||
|
// for 1x case, centroid is pixel center
|
||||||
|
psContext->vX.centroid = psContext->vX.center;
|
||||||
|
psContext->vY.centroid = psContext->vY.center;
|
||||||
|
psContext->vI.centroid = psContext->vI.center;
|
||||||
|
psContext->vJ.centroid = psContext->vJ.center;
|
||||||
|
psContext->vOneOverW.centroid = psContext->vOneOverW.center;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (T::bCentroidPos)
|
||||||
|
{
|
||||||
|
///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
|
||||||
|
if (T::bIsStandardPattern)
|
||||||
|
{
|
||||||
|
// add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
|
||||||
|
CalcCentroidPos<T>(*psContext, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
|
||||||
|
psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
|
||||||
|
}
|
||||||
|
|
||||||
|
CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
psContext->vX.centroid = psContext->vX.sample;
|
||||||
|
psContext->vY.centroid = psContext->vY.sample;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
struct PixelRateZTestLoop
|
struct PixelRateZTestLoop
|
||||||
{
|
{
|
||||||
PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
|
PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
|
||||||
uint8_t*& depthBase, uint8_t*& stencilBase, const uint8_t ClipDistanceMask) :
|
uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
|
||||||
pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
|
pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
|
||||||
clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
|
clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {};
|
||||||
|
|
||||||
INLINE
|
INLINE
|
||||||
uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
|
uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
|
||||||
|
@ -465,7 +569,24 @@ struct PixelRateZTestLoop
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// offset depth/stencil buffers current sample
|
||||||
|
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
|
||||||
|
uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
|
||||||
|
|
||||||
|
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
||||||
|
{
|
||||||
|
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
|
||||||
|
|
||||||
|
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
|
||||||
|
|
||||||
|
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
||||||
|
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
||||||
|
|
||||||
|
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
|
||||||
|
}
|
||||||
|
|
||||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||||
|
|
||||||
// calculate per sample positions
|
// calculate per sample positions
|
||||||
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
|
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
|
||||||
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
|
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
|
||||||
|
@ -483,33 +604,18 @@ struct PixelRateZTestLoop
|
||||||
vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
|
vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
|
||||||
vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
|
vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
|
||||||
}
|
}
|
||||||
|
|
||||||
AR_END(BEBarycentric, 0);
|
AR_END(BEBarycentric, 0);
|
||||||
|
|
||||||
///@todo: perspective correct vs non-perspective correct clipping?
|
///@todo: perspective correct vs non-perspective correct clipping?
|
||||||
// if clip distances are enabled, we need to interpolate for each sample
|
// if clip distances are enabled, we need to interpolate for each sample
|
||||||
if(clipDistanceMask)
|
if(clipDistanceMask)
|
||||||
{
|
{
|
||||||
uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer,
|
uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
|
||||||
psContext.vI.sample, psContext.vJ.sample);
|
|
||||||
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
|
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
|
||||||
}
|
}
|
||||||
|
|
||||||
// offset depth/stencil buffers current sample
|
|
||||||
uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
|
|
||||||
uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
|
|
||||||
|
|
||||||
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
|
||||||
{
|
|
||||||
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
|
|
||||||
|
|
||||||
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
|
|
||||||
|
|
||||||
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
|
||||||
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
|
||||||
|
|
||||||
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// ZTest for this sample
|
// ZTest for this sample
|
||||||
///@todo Need to uncomment out this bucket.
|
///@todo Need to uncomment out this bucket.
|
||||||
//AR_BEGIN(BEDepthBucket, pDC->drawId);
|
//AR_BEGIN(BEDepthBucket, pDC->drawId);
|
||||||
|
@ -557,8 +663,8 @@ private:
|
||||||
const API_STATE& state;
|
const API_STATE& state;
|
||||||
const SWR_PS_STATE& psState;
|
const SWR_PS_STATE& psState;
|
||||||
const uint8_t clipDistanceMask;
|
const uint8_t clipDistanceMask;
|
||||||
uint8_t*& pDepthBase;
|
uint8_t*& pDepthBuffer;
|
||||||
uint8_t*& pStencilBase;
|
uint8_t*& pStencilBuffer;
|
||||||
};
|
};
|
||||||
|
|
||||||
INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
|
INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
|
||||||
|
|
Loading…
Reference in New Issue