swr/rast: Split backend.cpp to improve compile time

Hardcode split to four files currently.  Decreases swr build
time on a quad-core by ~10%.

Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com>
This commit is contained in:
Tim Rowley 2017-06-26 13:00:27 -05:00
parent b89bd3694c
commit cae53b24d7
13 changed files with 2146 additions and 1832 deletions

View File

@ -34,6 +34,7 @@ COMMON_CXXFLAGS = \
$(LLVM_CXXFLAGS) \
$(SWR_CXX11_CXXFLAGS) \
-I$(builddir)/rasterizer/codegen \
-I$(builddir)/rasterizer/core \
-I$(builddir)/rasterizer/jitter \
-I$(builddir)/rasterizer/archrast \
-I$(srcdir)/rasterizer \
@ -62,7 +63,11 @@ BUILT_SOURCES = \
rasterizer/archrast/gen_ar_event.cpp \
rasterizer/archrast/gen_ar_eventhandler.hpp \
rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
rasterizer/core/gen_BackendPixelRate0.cpp
rasterizer/core/backends/gen_BackendPixelRate0.cpp \
rasterizer/core/backends/gen_BackendPixelRate1.cpp \
rasterizer/core/backends/gen_BackendPixelRate2.cpp \
rasterizer/core/backends/gen_BackendPixelRate3.cpp \
rasterizer/core/backends/gen_BackendPixelRate.hpp
MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
@ -140,20 +145,33 @@ rasterizer/archrast/gen_ar_eventhandlerfile.hpp: rasterizer/codegen/gen_archrast
--output rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
--gen_eventhandlerfile_h
rasterizer/core/backends/gen_BackendPixelRate0.cpp \
rasterizer/core/backends/gen_BackendPixelRate1.cpp \
rasterizer/core/backends/gen_BackendPixelRate2.cpp \
rasterizer/core/backends/gen_BackendPixelRate3.cpp \
rasterizer/core/backends/gen_BackendPixelRate.hpp: \
backend.intermediate
# 5 SWR_MULTISAMPLE_TYPE_COUNT
# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
# 3 SWR_INPUT_COVERAGE_COUNT
# 2 centroid
# 2 forcedSampleCount
# 2 canEarlyZ
rasterizer/core/gen_BackendPixelRate0.cpp: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp
# use intermediate rule to tell make that all files can be
# generated in one invocation of gen_backends.py (prevents
# parallel make race condition)
.INTERMEDIATE: backend.intermediate
backend.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp rasterizer/codegen/templates/gen_header_init.hpp
$(MKDIR_GEN)
$(PYTHON_GEN) \
$(srcdir)/rasterizer/codegen/gen_backends.py \
--outdir rasterizer/core \
--outdir rasterizer/core/backends \
--dim 5 2 3 2 2 2 \
--split 0 \
--cpp
--numfiles 4 \
--cpp \
--hpp
COMMON_LIBADD = \
$(top_builddir)/src/gallium/auxiliary/libgallium.la \
@ -227,5 +245,6 @@ EXTRA_DIST = \
rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp \
rasterizer/codegen/templates/gen_backend.cpp \
rasterizer/codegen/templates/gen_builder.hpp \
rasterizer/codegen/templates/gen_header_init.hpp \
rasterizer/codegen/templates/gen_knobs.cpp \
rasterizer/codegen/templates/gen_llvm.hpp

View File

@ -73,7 +73,11 @@ CORE_CXX_SOURCES := \
rasterizer/core/api.h \
rasterizer/core/arena.h \
rasterizer/core/backend.cpp \
rasterizer/core/backend_clear.cpp \
rasterizer/core/backend_sample.cpp \
rasterizer/core/backend_singlesample.cpp \
rasterizer/core/backend.h \
rasterizer/core/backend_impl.h \
rasterizer/core/binner.cpp \
rasterizer/core/binner.h \
rasterizer/core/blend.h \

View File

@ -140,12 +140,22 @@ Depends('rasterizer/jitter/gen_state_llvm.h',
# 2 centroid
# 2 forcedSampleCount
# 2 canEarlyZ
backendPixelRateFileCount = 4
backendPixelRateFilePat = "rasterizer/core/backends/gen_BackendPixelRate%s.cpp"
backendPixelRateFiles = map(lambda x: backendPixelRateFilePat % x,
range(0, backendPixelRateFileCount))
env.CodeGenerate(
target = 'rasterizer/core/gen_BackendPixelRate0.cpp',
target = 'rasterizer/core/backends/gen_BackendPixelRate.hpp',
script = swrroot + 'rasterizer/codegen/gen_backends.py',
source = '',
command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp'
)
command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --dim 5 2 3 2 2 2 --numfiles ' + str(backendPixelRateFileCount) + ' --cpp --hpp'
)
Depends(backendPixelRateFiles,
['rasterizer/core/backends/gen_BackendPixelRate.hpp',
'rasterizer/archrast/gen_ar_event.hpp',
'rasterizer/codegen/gen_knobs.h']
)
Depends('rasterizer/jitter/gen_state_llvm.h',
swrroot + 'rasterizer/codegen/templates/gen_backend.cpp')
@ -153,9 +163,10 @@ Depends('rasterizer/jitter/gen_state_llvm.h',
built_sources = [
'rasterizer/codegen/gen_knobs.cpp',
'rasterizer/archrast/gen_ar_event.cpp',
'rasterizer/core/gen_BackendPixelRate0.cpp',
]
built_sources += backendPixelRateFiles
source = built_sources
source += env.ParseSourceList(swrroot + 'Makefile.sources', [
'CXX_SOURCES',

View File

@ -35,7 +35,9 @@ def main(args=sys.argv[1:]):
parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512')
parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0')
parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
args = parser.parse_args(args);
@ -43,11 +45,14 @@ def main(args=sys.argv[1:]):
class backendStrs :
def __init__(self) :
self.outFileName = 'gen_BackendPixelRate%s.cpp'
self.outHeaderName = 'gen_BackendPixelRate.hpp'
self.functionTableName = 'gBackendPixelRateTable'
self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
self.template = 'gen_backend.cpp'
self.hpp_template = 'gen_header_init.hpp'
self.cmakeFileName = 'gen_backends.cmake'
self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
self.tableName = 'BackendPixelRate'
backend = backendStrs()
@ -77,6 +82,8 @@ def main(args=sys.argv[1:]):
numFiles = 1
else:
numFiles = (len(output_list) + args.split - 1) // args.split
if (args.numfiles != 0):
numFiles = args.numfiles
linesPerFile = (len(output_list) + numFiles - 1) // numFiles
chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
@ -94,6 +101,18 @@ def main(args=sys.argv[1:]):
fileNum=fileNum,
funcList=chunkedList[fileNum])
if args.hpp:
baseHppName = os.path.join(args.outdir, backend.outHeaderName)
templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)
MakoTemplateWriter.to_file(
templateHpp,
baseHppName,
cmdline=sys.argv,
numFiles=numFiles,
filename=backend.outHeaderName,
tableName=backend.tableName)
# generate gen_backend.cmake file
if args.cmake:
templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')

View File

@ -32,6 +32,7 @@
//============================================================================
#include "core/backend.h"
#include "core/backend_impl.h"
void InitBackendPixelRate${fileNum}()
{

View File

@ -0,0 +1,43 @@
//============================================================================
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file ${filename}
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
%for num in range(numFiles):
void Init${tableName}${num}();
%endfor
static INLINE void Init${tableName}()
{
%for num in range(numFiles):
Init${tableName}${num}();
%endfor
}

View File

@ -784,9 +784,6 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
// templated backend function tables
extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
void SetupPipeline(DRAW_CONTEXT *pDC)
{
@ -838,7 +835,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
break;
}
}
SWR_ASSERT(backendFuncs.pfnBackend);
PFN_PROCESS_PRIMS pfnBinner;
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;

View File

@ -30,15 +30,14 @@
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include "backends/gen_BackendPixelRate.hpp"
#include <algorithm>
typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
//////////////////////////////////////////////////////////////////////////
/// @brief Process compute work.
@ -103,238 +102,6 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
SWR_ASSERT(x == 0 && y == 0);
}
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
{
auto lambda = [&](int32_t comp)
{
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
};
const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
for (uint32_t i = 0; i < numIter; ++i)
{
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
}
}
#if USE_8x2_TILE_BACKEND
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
auto lambda = [&](int32_t comp)
{
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
};
const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
for (uint32_t i = 0; i < numIter; ++i)
{
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
}
}
#endif
template<SWR_FORMAT format>
INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
{
// convert clear color to hottile format
// clear color is in RGBA float/uint32
#if USE_8x2_TILE_BACKEND
simd16vector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simd16scalar vComp;
vComp = _simd16_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
#else
simdvector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simdscalar vComp;
vComp = _simd_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
#endif
uint32_t tileX, tileY;
MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
// Init to full macrotile
SWR_RECT clearTile =
{
KNOB_MACROTILE_X_DIM * int32_t(tileX),
KNOB_MACROTILE_Y_DIM * int32_t(tileY),
KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
};
// intersect with clear rect
clearTile &= rect;
// translate to local hottile origin
clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
// Make maximums inclusive (needed for convert to raster tiles)
clearTile.xmax -= 1;
clearTile.ymax -= 1;
// convert to raster tiles
clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
// compute steps between raster tile samples / raster tiles / macro tile rows
const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
// loop over all raster tiles in the current hot tile
for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
{
uint8_t* pRasterTile = pRasterTileRow;
for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
{
for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
{
ClearRasterTile<format>(pRasterTile, vClear);
pRasterTile += rasterTileSampleStep;
}
}
pRasterTileRow += macroTileRowStep;
}
pHotTile->state = HOTTILE_DIRTY;
}
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
{
SWR_CONTEXT *pContext = pDC->pContext;
if (KNOB_FAST_CLEAR)
{
CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
uint32_t numSamples = GetNumSamples(sampleCount);
SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
AR_BEGIN(BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
// All we want to do here is to mark the hot tile as being in a "needs clear" state.
pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
pHotTile->state = HOTTILE_CLEAR;
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
pHotTile->state = HOTTILE_CLEAR;
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = pClear->clearStencil;
pHotTile->state = HOTTILE_CLEAR;
}
AR_END(BEClear, 1);
}
else
{
// Legacy clear
CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
AR_BEGIN(BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
DWORD clearData[4];
clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
DWORD clearData[4];
clearData[0] = *(DWORD*)&pClear->clearDepth;
PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
DWORD clearData[4];
clearData[0] = pClear->clearStencil;
PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
}
AR_END(BEClear, 1);
}
}
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
SWR_RENDERTARGET_ATTACHMENT attachment)
{
@ -368,7 +135,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
// clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
if (pHotTile->state == HOTTILE_CLEAR)
{
PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
@ -429,457 +196,6 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3
}
}
#if KNOB_SIMD_WIDTH == 8
const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#else
#error Unsupported vector width
#endif
simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
{
simdscalar vClipMask = _simd_setzero_ps();
uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
for (uint32_t i = 0; i < numClipDistance; ++i)
{
// pull triangle clip distance values from clip buffer
simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
// interpolate
simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
// clip if interpolated clip distance is < 0 || NAN
simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
vClipMask = _simd_or_ps(vClipMask, vCull);
}
return _simd_movemask_ps(vClipMask);
}
template<typename T>
void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(BESingleSampleBackend, pDC->drawId);
AR_BEGIN(BESetup, pDC->drawId);
const API_STATE &state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
AR_END(BESetup, 1);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
simdmask coverageMask = work.coverageMask[0] & MASK;
if (coverageMask)
{
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
AR_BEGIN(BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
AR_END(BEBarycentric, 1);
// interpolate user clip distance if available
if (state.rastState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
}
simdscalar vCoverageMask = vMask(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BEEarlyDepthTest, 0);
// early-exit if no pixels passed depth or earlyZ is forced on
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
goto Endtile;
}
}
}
psContext.sampleIndex = 0;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
AR_BEGIN(BEPixelShader, pDC->drawId);
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
AR_END(BEPixelShader, 0);
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
// late-Z
if (!T::bCanEarlyZ)
{
AR_BEGIN(BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
goto Endtile;
}
} else {
// for early z, consolidate discards from shader
// into depthPassMask
depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
AR_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
#else
OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
#endif
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
}
AR_END(BEOutputMerger, 0);
}
Endtile:
AR_BEGIN(BEEndTile, pDC->drawId);
work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
AR_END(BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
AR_END(BESingleSampleBackend, 0);
}
template<typename T>
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(BESampleRateBackend, pDC->drawId);
AR_BEGIN(BESetup, pDC->drawId);
const API_STATE &state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
AR_END(BESetup, 0);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
AR_BEGIN(BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
AR_END(BEBarycentric, 0);
for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
{
simdmask coverageMask = work.coverageMask[sample] & MASK;
if (coverageMask)
{
// offset depth/stencil buffers current sample
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
AR_BEGIN(BEBarycentric, pDC->drawId);
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
AR_END(BEBarycentric, 0);
// interpolate user clip distance if available
if (state.rastState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
}
simdscalar vCoverageMask = vMask(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BEEarlyDepthTest, 0);
// early-exit if no samples passed depth or earlyZ is forced on.
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
}
psContext.sampleIndex = sample;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
AR_BEGIN(BEPixelShader, pDC->drawId);
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
AR_END(BEPixelShader, 0);
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
// late-Z
if (!T::bCanEarlyZ)
{
AR_BEGIN(BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
AR_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
#else
OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
#endif
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
}
AR_END(BEOutputMerger, 0);
}
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
Endtile:
ATTR_UNUSED;
AR_BEGIN(BEEndTile, pDC->drawId);
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
AR_END(BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
AR_END(BESampleRateBackend, 0);
}
// optimized backend flow with NULL PS
template<uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
@ -977,7 +293,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
UPDATE_STAT_BE(DepthPassCount, statCount);
}
Endtile:
Endtile:
ATTR_UNUSED;
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
@ -994,17 +310,7 @@ Endtile:
AR_END(BENullBackend, 0);
}
void InitClearTilesTable()
{
memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
}
PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
[2] // centroid
@ -1023,113 +329,10 @@ PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
[2] // canEarlyZ
= {};
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t... ArgsT>
struct BEChooser
{
// Last Arg Terminator
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
{
switch(tArg)
{
case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
default:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
{
switch(tArg)
{
case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
default:
SWR_ASSERT(0 && "Invalid sample pattern\n");
return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
{
switch(tArg)
{
case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
default:
SWR_ASSERT(0 && "Invalid sample count\n");
return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
{
if(tArg == true)
{
return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
}
return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
}
};
void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
{
for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
{
for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[inputCoverage][isCentroid][canEarlyZ] =
BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
(isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
}
}
}
}
void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
{
for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
{
for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for(uint32_t centroid = 0; centroid < 2; centroid++)
{
for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[sampleCount][inputCoverage][centroid][canEarlyZ] =
BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
(centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
}
}
}
}
}
void InitBackendPixelRate0();
void InitBackendFuncTables()
{
InitBackendPixelRate();
InitBackendSingleFuncTable(gBackendSingleSample);
InitBackendPixelRate0();
InitBackendSampleFuncTable(gBackendSampleRateTable);
gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,281 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
{
auto lambda = [&](int32_t comp)
{
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
};
const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
for (uint32_t i = 0; i < numIter; ++i)
{
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
}
}
#if USE_8x2_TILE_BACKEND
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
auto lambda = [&](int32_t comp)
{
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
};
const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
for (uint32_t i = 0; i < numIter; ++i)
{
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
}
}
#endif
template<SWR_FORMAT format>
INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
{
// convert clear color to hottile format
// clear color is in RGBA float/uint32
#if USE_8x2_TILE_BACKEND
simd16vector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simd16scalar vComp;
vComp = _simd16_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
#else
simdvector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simdscalar vComp;
vComp = _simd_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
#endif
uint32_t tileX, tileY;
MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
// Init to full macrotile
SWR_RECT clearTile =
{
KNOB_MACROTILE_X_DIM * int32_t(tileX),
KNOB_MACROTILE_Y_DIM * int32_t(tileY),
KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
};
// intersect with clear rect
clearTile &= rect;
// translate to local hottile origin
clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
// Make maximums inclusive (needed for convert to raster tiles)
clearTile.xmax -= 1;
clearTile.ymax -= 1;
// convert to raster tiles
clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
// compute steps between raster tile samples / raster tiles / macro tile rows
const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
// loop over all raster tiles in the current hot tile
for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
{
uint8_t* pRasterTile = pRasterTileRow;
for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
{
for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
{
ClearRasterTile<format>(pRasterTile, vClear);
pRasterTile += rasterTileSampleStep;
}
}
pRasterTileRow += macroTileRowStep;
}
pHotTile->state = HOTTILE_DIRTY;
}
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
{
SWR_CONTEXT *pContext = pDC->pContext;
if (KNOB_FAST_CLEAR)
{
CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
uint32_t numSamples = GetNumSamples(sampleCount);
SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
AR_BEGIN(BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
// All we want to do here is to mark the hot tile as being in a "needs clear" state.
pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
pHotTile->state = HOTTILE_CLEAR;
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
pHotTile->state = HOTTILE_CLEAR;
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = pClear->clearStencil;
pHotTile->state = HOTTILE_CLEAR;
}
AR_END(BEClear, 1);
}
else
{
// Legacy clear
CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
AR_BEGIN(BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
DWORD clearData[4];
clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
DWORD clearData[4];
clearData[0] = *(DWORD*)&pClear->clearDepth;
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
DWORD clearData[4];
clearData[0] = pClear->clearStencil;
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
}
AR_END(BEClear, 1);
}
}
void InitClearTilesTable()
{
memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,345 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template<typename T>
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(BESampleRateBackend, pDC->drawId);
AR_BEGIN(BESetup, pDC->drawId);
const API_STATE &state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
AR_END(BESetup, 0);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
AR_BEGIN(BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
AR_END(BEBarycentric, 0);
for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
{
simdmask coverageMask = work.coverageMask[sample] & MASK;
if (coverageMask)
{
// offset depth/stencil buffers current sample
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
AR_BEGIN(BEBarycentric, pDC->drawId);
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
AR_END(BEBarycentric, 0);
// interpolate user clip distance if available
if (state.rastState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
}
simdscalar vCoverageMask = vMask(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BEEarlyDepthTest, 0);
// early-exit if no samples passed depth or earlyZ is forced on.
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
}
psContext.sampleIndex = sample;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
AR_BEGIN(BEPixelShader, pDC->drawId);
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
AR_END(BEPixelShader, 0);
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
// late-Z
if (!T::bCanEarlyZ)
{
AR_BEGIN(BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
AR_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
#else
OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
#endif
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
}
AR_END(BEOutputMerger, 0);
}
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
Endtile:
ATTR_UNUSED;
AR_BEGIN(BEEndTile, pDC->drawId);
if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
AR_END(BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
AR_END(BESampleRateBackend, 0);
}
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t... ArgsT>
struct BEChooserSampleRate
{
// Last Arg Terminator
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
{
switch (tArg)
{
case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
case SWR_BACKEND_SINGLE_SAMPLE:
case SWR_BACKEND_MSAA_PIXEL_RATE:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
default:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
default:
SWR_ASSERT(0 && "Invalid sample pattern\n");
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
default:
SWR_ASSERT(0 && "Invalid sample count\n");
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
{
if (tArg == true)
{
return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
}
return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
}
};
void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
{
for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
{
for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for (uint32_t centroid = 0; centroid < 2; centroid++)
{
for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[sampleCount][inputCoverage][centroid][canEarlyZ] =
BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
(centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
}
}
}
}
}

View File

@ -0,0 +1,321 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template<typename T>
void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(BESingleSampleBackend, pDC->drawId);
AR_BEGIN(BESetup, pDC->drawId);
const API_STATE &state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
AR_END(BESetup, 1);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
simdmask coverageMask = work.coverageMask[0] & MASK;
if (coverageMask)
{
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
AR_BEGIN(BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
AR_END(BEBarycentric, 1);
// interpolate user clip distance if available
if (state.rastState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
}
simdscalar vCoverageMask = vMask(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BEEarlyDepthTest, 0);
// early-exit if no pixels passed depth or earlyZ is forced on
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
goto Endtile;
}
}
}
psContext.sampleIndex = 0;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
AR_BEGIN(BEPixelShader, pDC->drawId);
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
AR_END(BEPixelShader, 0);
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
// late-Z
if (!T::bCanEarlyZ)
{
AR_BEGIN(BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
AR_END(BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
goto Endtile;
}
} else {
// for early z, consolidate discards from shader
// into depthPassMask
depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
AR_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
#else
OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
#endif
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
}
AR_END(BEOutputMerger, 0);
}
Endtile:
AR_BEGIN(BEEndTile, pDC->drawId);
work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
{
pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
AR_END(BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
AR_END(BESingleSampleBackend, 0);
}
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t... ArgsT>
struct BEChooserSingleSample
{
// Last Arg Terminator
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
{
switch(tArg)
{
case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
case SWR_BACKEND_MSAA_PIXEL_RATE:
case SWR_BACKEND_MSAA_SAMPLE_RATE:
default:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
{
switch(tArg)
{
case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
default:
SWR_ASSERT(0 && "Invalid sample pattern\n");
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
{
switch(tArg)
{
case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
default:
SWR_ASSERT(0 && "Invalid sample count\n");
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
{
if(tArg == true)
{
return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
}
return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
}
};
void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
{
for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
{
for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[inputCoverage][isCentroid][canEarlyZ] =
BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
(isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
}
}
}
}