From 0cc7c46cf43c9e6a2bed3099f1a83d98ac52a7fc Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 26 Jun 2017 12:41:38 -0500 Subject: [PATCH] swr/rast: Split rasterizer.cpp to improve compile time Hardcode split to four files currently. Decreases swr build time on KNL by over 50%. Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/Makefile.am | 39 +- src/gallium/drivers/swr/Makefile.sources | 2 +- src/gallium/drivers/swr/SConscript | 24 +- .../swr/rasterizer/codegen/gen_backends.py | 12 + .../codegen/templates/gen_rasterizer.cpp | 42 + .../drivers/swr/rasterizer/core/api.cpp | 1 + .../swr/rasterizer/core/multisample.cpp | 48 - .../swr/rasterizer/core/rasterizer.cpp | 1810 +++-------------- .../drivers/swr/rasterizer/core/rasterizer.h | 31 +- .../swr/rasterizer/core/rasterizer_impl.h | 1376 +++++++++++++ 10 files changed, 1750 insertions(+), 1635 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp delete mode 100644 src/gallium/drivers/swr/rasterizer/core/multisample.cpp create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am index 578f15909b6..4b4bd3793b3 100644 --- a/src/gallium/drivers/swr/Makefile.am +++ b/src/gallium/drivers/swr/Makefile.am @@ -67,7 +67,12 @@ BUILT_SOURCES = \ rasterizer/core/backends/gen_BackendPixelRate1.cpp \ rasterizer/core/backends/gen_BackendPixelRate2.cpp \ rasterizer/core/backends/gen_BackendPixelRate3.cpp \ - rasterizer/core/backends/gen_BackendPixelRate.hpp + rasterizer/core/backends/gen_BackendPixelRate.hpp \ + rasterizer/core/backends/gen_rasterizer0.cpp \ + rasterizer/core/backends/gen_rasterizer1.cpp \ + rasterizer/core/backends/gen_rasterizer2.cpp \ + rasterizer/core/backends/gen_rasterizer3.cpp \ + rasterizer/core/backends/gen_rasterizer.hpp MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) @@ -173,6 +178,35 @@ backend.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/temp --cpp \ --hpp +rasterizer/core/backends/gen_rasterizer0.cpp \ +rasterizer/core/backends/gen_rasterizer1.cpp \ +rasterizer/core/backends/gen_rasterizer2.cpp \ +rasterizer/core/backends/gen_rasterizer3.cpp \ +rasterizer/core/backends/gen_rasterizer.hpp: \ +rasterizer.intermediate + +# 5 SWR_MULTISAMPLE_TYPE_COUNT +# 2 CenterPattern +# 2 Conservative +# 3 SWR_INPUT_COVERAGE_COUNT +# 5 STATE_VALID_TRI_EDGE_COUNT +# 2 RasterScissorEdges + +# use intermediate rule to tell make that all files can be +# generated in one invocation of gen_backends.py (prevents +# parallel make race condition) +.INTERMEDIATE: rasterizer.intermediate +rasterizer.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_rasterizer.cpp rasterizer/codegen/templates/gen_header_init.hpp + $(MKDIR_GEN) + $(PYTHON_GEN) \ + $(srcdir)/rasterizer/codegen/gen_backends.py \ + --outdir rasterizer/core/backends \ + --rast \ + --dim 5 2 2 3 5 2 \ + --numfiles 4 \ + --cpp \ + --hpp + COMMON_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/mesa/libmesagallium.la \ @@ -247,4 +281,5 @@ EXTRA_DIST = \ rasterizer/codegen/templates/gen_builder.hpp \ rasterizer/codegen/templates/gen_header_init.hpp \ rasterizer/codegen/templates/gen_knobs.cpp \ - rasterizer/codegen/templates/gen_llvm.hpp + rasterizer/codegen/templates/gen_llvm.hpp \ + rasterizer/codegen/templates/gen_rasterizer.cpp diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources index d9894c26015..12a5e7d137f 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -95,12 +95,12 @@ CORE_CXX_SOURCES := \ rasterizer/core/frontend.h \ rasterizer/core/knobs.h \ rasterizer/core/knobs_init.h \ - rasterizer/core/multisample.cpp \ rasterizer/core/multisample.h \ rasterizer/core/pa_avx.cpp \ rasterizer/core/pa.h \ rasterizer/core/rasterizer.cpp \ rasterizer/core/rasterizer.h \ + rasterizer/core/rasterizer_impl.h \ rasterizer/core/rdtsc_core.cpp \ rasterizer/core/rdtsc_core.h \ rasterizer/core/ringbuffer.h \ diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript index 0f3cd6c8aa3..512269afbef 100644 --- a/src/gallium/drivers/swr/SConscript +++ b/src/gallium/drivers/swr/SConscript @@ -156,6 +156,28 @@ Depends(backendPixelRateFiles, 'rasterizer/codegen/gen_knobs.h'] ) +# 5 SWR_MULTISAMPLE_TYPE_COUNT +# 2 CenterPattern +# 2 Conservative +# 3 SWR_INPUT_COVERAGE_COUNT +# 5 STATE_VALID_TRI_EDGE_COUNT +# 2 RasterScissorEdges +genRasterizerFileCount = 4 +genRasterizerFilePat = "rasterizer/core/backends/gen_rasterizer%s.cpp" +genRasterizerFiles = map(lambda x: genRasterizerFilePat % x, + range(0, genRasterizerFileCount)) +env.CodeGenerate( + target = 'rasterizer/core/backends/gen_rasterizer.hpp', + script = swrroot + 'rasterizer/codegen/gen_backends.py', + source = '', + command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --rast --dim 5 2 2 3 5 2 --numfiles ' + str(genRasterizerFileCount) + ' --cpp --hpp' + ) +Depends(genRasterizerFiles, + ['rasterizer/core/backends/gen_rasterizer.hpp', + 'rasterizer/archrast/gen_ar_event.hpp', + 'rasterizer/codegen/gen_knobs.h'] + ) + Depends('rasterizer/jitter/gen_state_llvm.h', swrroot + 'rasterizer/codegen/templates/gen_backend.cpp') @@ -165,7 +187,7 @@ built_sources = [ 'rasterizer/archrast/gen_ar_event.cpp', ] -built_sources += backendPixelRateFiles +built_sources += [backendPixelRateFiles, genRasterizerFiles] source = built_sources source += env.ParseSourceList(swrroot + 'Makefile.sources', [ diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py index 2fc91d1f915..414a04e38a8 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py @@ -39,6 +39,7 @@ def main(args=sys.argv[1:]): parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False) parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False) parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False) + parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False) args = parser.parse_args(args) @@ -55,6 +56,17 @@ def main(args=sys.argv[1:]): self.cmakeSrcVar = 'GEN_BACKEND_SOURCES' self.tableName = 'BackendPixelRate' + if args.rast: + self.outFileName = 'gen_rasterizer%s.cpp' + self.outHeaderName = 'gen_rasterizer.hpp' + self.functionTableName = 'gRasterizerFuncs' + self.funcInstanceHeader = ' = RasterizeTriangle::samplePosXi[1]; -constexpr uint32_t MultisampleTraits::samplePosYi[1]; -constexpr uint32_t MultisampleTraits::samplePosXi[2]; -constexpr uint32_t MultisampleTraits::samplePosYi[2]; -constexpr uint32_t MultisampleTraits::samplePosXi[4]; -constexpr uint32_t MultisampleTraits::samplePosYi[4]; -constexpr uint32_t 
MultisampleTraits::samplePosXi[8]; -constexpr uint32_t MultisampleTraits::samplePosYi[8]; -constexpr uint32_t MultisampleTraits::samplePosXi[16]; -constexpr uint32_t MultisampleTraits::samplePosYi[16]; - -constexpr float MultisampleTraits::samplePosX[1]; -constexpr float MultisampleTraits::samplePosY[1]; -constexpr float MultisampleTraits::samplePosX[2]; -constexpr float MultisampleTraits::samplePosY[2]; -constexpr float MultisampleTraits::samplePosX[4]; -constexpr float MultisampleTraits::samplePosY[4]; -constexpr float MultisampleTraits::samplePosX[8]; -constexpr float MultisampleTraits::samplePosY[8]; -constexpr float MultisampleTraits::samplePosX[16]; -constexpr float MultisampleTraits::samplePosY[16]; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 4df146e9f99..a3ff557bd8c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -30,974 +30,35 @@ #include #include "rasterizer.h" +#include "backends/gen_rasterizer.hpp" #include "rdtsc_core.h" #include "backend.h" #include "utils.h" #include "frontend.h" #include "tilemgr.h" #include "memory/tilingtraits.h" +#include "rasterizer_impl.h" -template -void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex); -template -void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers); -template -void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); +PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2]; -#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} -const __m256d gMaskToVecpd[] = -{ - MASKTOVEC(0, 0, 0, 0), - MASKTOVEC(0, 0, 0, 1), - MASKTOVEC(0, 0, 1, 0), - MASKTOVEC(0, 0, 1, 1), - MASKTOVEC(0, 1, 0, 0), - MASKTOVEC(0, 1, 0, 1), - MASKTOVEC(0, 1, 1, 0), - MASKTOVEC(0, 1, 1, 1), - MASKTOVEC(1, 0, 0, 0), - MASKTOVEC(1, 0, 0, 1), - MASKTOVEC(1, 0, 1, 0), - MASKTOVEC(1, 0, 1, 1), - MASKTOVEC(1, 1, 0, 0), - MASKTOVEC(1, 1, 0, 1), - MASKTOVEC(1, 1, 1, 0), - MASKTOVEC(1, 1, 1, 1), -}; - -struct POS -{ - int32_t x, y; -}; - -struct EDGE -{ - double a, b; // a, b edge coefficients in fix8 - double stepQuadX; // step to adjacent horizontal quad in fix16 - double stepQuadY; // step to adjacent vertical quad in fix16 - double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 - double stepRasterTileY; // step to adjacent vertical raster tile in fix16 - - __m256d vQuadOffsets; // offsets for 4 samples of a quad - __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief rasterize a raster tile partially covered by the triangle -/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile -/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) -/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. -/// Used to step between quads when sweeping over the raster tile. 
-template -INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges) -{ - uint64_t coverageMask = 0; - - __m256d vEdges[NumEdges]; - __m256d vStepX[NumEdges]; - __m256d vStepY[NumEdges]; - - for (uint32_t e = 0; e < NumEdges; ++e) - { - // Step to the pixel sample locations of the 1st quad - vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets); - - // compute step to next quad (mul by 2 in x and y direction) - vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX); - vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY); - } - - // fast unrolled version for 8x8 tile -#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 - int edgeMask[NumEdges]; - uint64_t mask; - - auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);}; - auto update_lambda = [&](int e){mask &= edgeMask[e];}; - auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);}; - auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);}; - auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);}; - -// evaluate which pixels in the quad are covered -#define EVAL \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda); - - // update coverage mask - // if edge 0 is degenerate and will be skipped; init the mask -#define UPDATE_MASK(bit) \ - if(std::is_same::value || std::is_same::value){\ - mask = 0xf;\ - }\ - else{\ - mask = edgeMask[0]; \ - }\ - UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \ - coverageMask |= (mask << bit); - - // step in the +x direction to the next quad -#define INCX \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda); - - // step in the +y direction to the next quad -#define INCY \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda); - - // step in the -x direction to the next quad -#define DECX \ - UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda); - - // sweep 2x2 quad back and forth through the raster tile, - // computing coverage masks for the entire tile - - // raster tile - // 0 1 2 3 4 5 6 7 - // x x - // x x ------------------> - // x x | - // <-----------------x x V - // .. 
- - // row 0 - EVAL; - UPDATE_MASK(0); - INCX; - EVAL; - UPDATE_MASK(4); - INCX; - EVAL; - UPDATE_MASK(8); - INCX; - EVAL; - UPDATE_MASK(12); - INCY; - - //row 1 - EVAL; - UPDATE_MASK(28); - DECX; - EVAL; - UPDATE_MASK(24); - DECX; - EVAL; - UPDATE_MASK(20); - DECX; - EVAL; - UPDATE_MASK(16); - INCY; - - // row 2 - EVAL; - UPDATE_MASK(32); - INCX; - EVAL; - UPDATE_MASK(36); - INCX; - EVAL; - UPDATE_MASK(40); - INCX; - EVAL; - UPDATE_MASK(44); - INCY; - - // row 3 - EVAL; - UPDATE_MASK(60); - DECX; - EVAL; - UPDATE_MASK(56); - DECX; - EVAL; - UPDATE_MASK(52); - DECX; - EVAL; - UPDATE_MASK(48); -#else - uint32_t bit = 0; - for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) - { - __m256d vStartOfRowEdge[NumEdges]; - for (uint32_t e = 0; e < NumEdges; ++e) - { - vStartOfRowEdge[e] = vEdges[e]; - } - - for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) - { - int edgeMask[NumEdges]; - for (uint32_t e = 0; e < NumEdges; ++e) - { - edgeMask[e] = _mm256_movemask_pd(vEdges[e]); - } - - uint64_t mask = edgeMask[0]; - for (uint32_t e = 1; e < NumEdges; ++e) - { - mask &= edgeMask[e]; - } - coverageMask |= (mask << bit); - - // step to the next pixel in the x - for (uint32_t e = 0; e < NumEdges; ++e) - { - vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); - } - bit+=4; - } - - // step to the next row - for (uint32_t e = 0; e < NumEdges; ++e) - { - vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]); - } - } -#endif - return coverageMask; - -} -// Top left rule: -// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge -// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge -// Top left: a sample is in if it is a top or left edge. -// Out: !(horizontal && above) = !horizontal && below -// Out: !horizontal && left = !(!horizontal && left) = horizontal and right -INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) -{ - // if vA < 0, vC-- - // if vA == 0 && vB < 0, vC-- - - __m256d vEdgeOut = vEdge; - __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); - - // if vA < 0 (line is not horizontal and below) - int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); - - // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) - __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); - int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); - msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); - - // if either of these are true and we're on the line (edge == 0), bump it outside the line - vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief calculates difference in precision between the result of manh -/// calculation and the edge precision, based on compile time trait values -template -constexpr int64_t ManhToEdgePrecisionAdjust() -{ - static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, - "Inadequate precision of result of manh calculation "); - return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value); -} - -////////////////////////////////////////////////////////////////////////// -/// @struct adjustEdgeConservative -/// @brief Primary template definition used for partially specializing -/// the adjustEdgeConservative function. This struct should never -/// be instantiated. 
-/// @tparam RT: rasterizer traits -/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting? -template -struct adjustEdgeConservative -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs calculations to adjust each edge of a triangle away - /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y - /// direction. - /// - /// Uncertainty regions arise from fixed point rounding, which - /// can snap a vertex +/- by min fixed point value. - /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. - /// This allows the rasterizer to test for coverage only at the pixel center, - /// instead of having to test individual pixel corners for conservative coverage - INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) - { - // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away - // from the pixel center (in the direction of the edge normal A/B) - - // edge = Ax + Bx + C - (manh/e) - // manh = manhattan distance = abs(A) + abs(B) - // e = absolute rounding error from snapping from float to fixed point precision - - // 'fixed point' multiply (in double to be avx1 friendly) - // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example - __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); - __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), - _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); - - static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, - "Inadequate precision of result of manh calculation "); - - // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision - // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right - manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust() * 0.5)); - - // move the edge away from the pixel center by the required conservative precision + 1/2 pixel - // this allows the rasterizer to do a single conservative coverage test to see if the primitive - // intersects the pixel at all - vEdge = _mm256_sub_pd(vEdge, manh); - }; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief adjustEdgeConservative specialization where no edge offset is needed -template -struct adjustEdgeConservative> -{ - INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief calculates the distance a degenerate BBox needs to be adjusted -/// for conservative rast based on compile time trait values -template -constexpr int64_t ConservativeScissorOffset() -{ - static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision"); - // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges - typedef std::integral_constant DegenerateEdgeOffsetT; - // 1/2 pixel edge offset + conservative offset - degenerateTriangle - return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); -} - 
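[Editor's sketch] The adjustEdgeConservative / ConservativeScissorOffset code above (relocated into rasterizer_impl.h by this patch) offsets each evaluated edge equation by a Manhattan-distance term so that a single coverage test at the pixel center becomes conservative. The scalar sketch below is illustrative only: the shipping code operates on __m256d vectors and derives the offset from compile-time precision traits, and the name conservativeOffset here is merely a stand-in for RT::ConservativeEdgeOffsetT::value.

#include <cstdint>
#include <cstdlib>

// Scalar sketch (not the shipping SIMD path): edge = A*x + B*y + C, evaluated
// at the pixel center in fixed point.  manh = (|A| + |B|) * offset is the
// worst-case movement of the edge over the uncertainty region
// (1/2 pixel + fixed-point snapping error).  Subtracting it pushes the edge
// outward, so the center test passes whenever the primitive touches the pixel.
static double AdjustEdgeConservativeScalar(double edge, int32_t a, int32_t b,
                                           double conservativeOffset)
{
    double manh = (std::abs(static_cast<int64_t>(a)) +
                   std::abs(static_cast<int64_t>(b))) * conservativeOffset;
    return edge - manh;
}
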
-////////////////////////////////////////////////////////////////////////// -/// @brief Performs calculations to adjust each a vector of evaluated edges out -/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y -/// direction. -template -INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge) -{ - int64_t aabs = std::abs(static_cast(a)), babs = std::abs(static_cast(b)); - int64_t manh = ((aabs * ConservativeScissorOffset()) + (babs * ConservativeScissorOffset())) >> ManhToEdgePrecisionAdjust(); - vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Performs calculations to adjust each a scalar evaluated edge out -/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y -/// direction. -template -INLINE double adjustScalarEdge(const double a, const double b, const double Edge) -{ - int64_t aabs = std::abs(static_cast(a)), babs = std::abs(static_cast(b)); - int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust(); - return (Edge - manh); -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Perform any needed adjustments to evaluated triangle edges -template -struct adjustEdgesFix16 -{ - INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) - { - static_assert(std::is_same>::value, - "Edge equation expected to be in x.16 fixed point"); - - static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled"); - - // need to apply any edge offsets before applying the top-left rule - adjustEdgeConservative(vAi, vBi, vEdge); - - adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Perform top left adjustments to evaluated triangle edges -template -struct adjustEdgesFix16> -{ - INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) - { - adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); - } -}; - -// max(abs(dz/dx), abs(dz,dy) -INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) -{ - /* - // evaluate i,j at (0,0) - float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; - float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; - - // evaluate i,j at (1,0) - float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; - float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; - - // compute dz/dx - float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; - float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; - float dzdx = abs(d10 - d00); - - // evaluate i,j at (0,1) - float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; - float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; - - float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; - float dzdy = abs(d01 - d00); - */ - - // optimized version of above - float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); - float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); - - return std::max(dzdx, dzdy); -} - -INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) -{ - if (pState->depthFormat == R24_UNORM_X8_TYPELESS) - { - return (1.0f / (1 << 24)); - } - else if (pState->depthFormat == R16_UNORM) - { - return 
(1.0f / (1 << 16)); - } - else - { - SWR_ASSERT(pState->depthFormat == R32_FLOAT); - - // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) - float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); - uint32_t zMaxInt = *(uint32_t*)&zMax; - zMaxInt &= 0x7f800000; - zMax = *(float*)&zMaxInt; - - return zMax * (1.0f / (1 << 23)); - } -} - -INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) -{ - if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) - { - return 0.0f; - } - - float scale = pState->slopeScaledDepthBias; - if (scale != 0.0f) - { - scale *= ComputeMaxDepthSlope(pTri); - } - - float bias = pState->depthBias; - if (!pState->depthBiasPreAdjusted) - { - bias *= ComputeBiasFactor(pState, pTri, z); - } - bias += scale; - - if (pState->depthBiasClamp > 0.0f) - { - bias = std::min(bias, pState->depthBiasClamp); - } - else if (pState->depthBiasClamp < 0.0f) - { - bias = std::max(bias, pState->depthBiasClamp); - } - - return bias; -} - -// Prevent DCE by writing coverage mask from rasterizer to volatile -#if KNOB_ENABLE_TOSS_POINTS -__declspec(thread) volatile uint64_t gToss; -#endif - -static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; -// try to avoid _chkstk insertions; make this thread local -static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib]; - -INLINE -void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) -{ - edge.a = a; - edge.b = b; - - // compute constant steps to adjacent quads - edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); - edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); - - // compute constant steps to adjacent raster tiles - edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); - edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); - - // compute quad offsets - const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); - const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); - - __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); - __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); - edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); - - // compute raster tile offsets - const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0); - const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0); - - __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); - __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); - edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); -} - -INLINE -void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) -{ - ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary template definition used for partially specializing -/// the UpdateEdgeMasks function. 
Offset evaluated edges from UL pixel -/// corner to sample position, and test for coverage -/// @tparam sampleCount: multisample count -template -INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, - int32_t &mask0, int32_t &mask1, int32_t &mask2) -{ - __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; - // evaluate edge equations at the tile multisample bounding box - vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); - vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); - vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); - mask0 = _mm256_movemask_pd(vSampleBboxTest0); - mask1 = _mm256_movemask_pd(vSampleBboxTest1); - mask2 = _mm256_movemask_pd(vSampleBboxTest2); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief UpdateEdgeMasks specialization, instantiated -/// when only rasterizing a single coverage test point -template <> -INLINE void UpdateEdgeMasks(const __m256d(&)[3], const __m256d* vEdgeFix16, - int32_t &mask0, int32_t &mask1, int32_t &mask2) -{ - mask0 = _mm256_movemask_pd(vEdgeFix16[0]); - mask1 = _mm256_movemask_pd(vEdgeFix16[1]); - mask2 = _mm256_movemask_pd(vEdgeFix16[2]); -} - -////////////////////////////////////////////////////////////////////////// -/// @struct ComputeScissorEdges -/// @brief Primary template definition. Allows the function to be generically -/// called. When paired with below specializations, will result in an empty -/// inlined function if scissor is not enabled -/// @tparam RasterScissorEdgesT: is scissor enabled? -/// @tparam IsConservativeT: is conservative rast enabled? -/// @tparam RT: rasterizer traits -template -struct ComputeScissorEdges -{ - INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ComputeScissorEdges partial -/// specialization. Instantiated when conservative rast and scissor are enabled -template -struct ComputeScissorEdges -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, - /// evaluate edge equations and offset them away from pixel center. 
- INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) - { - // if conservative rasterizing, triangle bbox intersected with scissor bbox is used - SWR_RECT scissor; - scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin); - scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax); - scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin); - scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax); - - POS topLeft{scissor.xmin, scissor.ymin}; - POS bottomLeft{scissor.xmin, scissor.ymax}; - POS topRight{scissor.xmax, scissor.ymin}; - POS bottomRight{scissor.xmax, scissor.ymax}; - - // construct 4 scissor edges in ccw direction - ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); - ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); - ComputeEdgeData(bottomRight, topRight, rastEdges[5]); - ComputeEdgeData(topRight, topLeft, rastEdges[6]); - - vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); - vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); - vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); - vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); - - // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing - adjustScissorEdge(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); - adjustScissorEdge(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); - adjustScissorEdge(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]); - adjustScissorEdge(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]); - - // Upper left rule for scissor - vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); - vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief ComputeScissorEdges partial -/// specialization. Instantiated when scissor is enabled and conservative rast -/// is disabled. 
-template -struct ComputeScissorEdges -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Compute scissor edge vectors and evaluate edge equations - INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, - EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) - { - const SWR_RECT &scissor = scissorBBox; - POS topLeft{scissor.xmin, scissor.ymin}; - POS bottomLeft{scissor.xmin, scissor.ymax}; - POS topRight{scissor.xmax, scissor.ymin}; - POS bottomRight{scissor.xmax, scissor.ymax}; - - // construct 4 scissor edges in ccw direction - ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); - ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); - ComputeEdgeData(bottomRight, topRight, rastEdges[5]); - ComputeEdgeData(topRight, topLeft, rastEdges[6]); - - vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); - vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); - vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); - vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); - - // Upper left rule for scissor - vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); - vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for TrivialRejectTest. Should -/// never be called, but TemplateUnroller instantiates a few unused values, -/// so it calls a runtime assert instead of a static_assert. -template -INLINE bool TrivialRejectTest(const int, const int, const int) -{ - SWR_INVALID("Primary templated function should never be called"); - return false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0 -/// and edge 1 for trivial coverage reject -template <> -INLINE bool TrivialRejectTest(const int mask0, const int mask1, const int) -{ - return (!(mask0 && mask1)) ? true : false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0 -/// and edge 2 for trivial coverage reject -template <> -INLINE bool TrivialRejectTest(const int mask0, const int, const int mask2) -{ - return (!(mask0 && mask2)) ? true : false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1 -/// and edge 2 for trivial coverage reject -template <> -INLINE bool TrivialRejectTest(const int, const int mask1, const int mask2) -{ - return (!(mask1 && mask2)) ? true : false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all -/// primitive edges for trivial coverage reject -template <> -INLINE bool TrivialRejectTest(const int mask0, const int mask1, const int mask2) -{ - return (!(mask0 && mask1 && mask2)) ? true : false;; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief NoEdgesValidT specialization of TrivialRejectTest. 
Degenerate -/// point, so return false and rasterize against conservative BBox -template <> -INLINE bool TrivialRejectTest(const int, const int, const int) -{ - return false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for TrivialAcceptTest. Always returns -/// false, since it will only be called for degenerate tris, and as such -/// will never cover the entire raster tile -template -INLINE bool TrivialAcceptTest(const int, const int, const int) -{ - return false; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all -/// edge masks for a fully covered raster tile -template <> -INLINE bool TrivialAcceptTest(const int mask0, const int mask1, const int mask2) -{ - return ((mask0 & mask1 & mask2) == 0xf); -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for GenerateSVInnerCoverage. Results -/// in an empty function call if SVInnerCoverage isn't requested -template -struct GenerateSVInnerCoverage -{ - INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Specialization of GenerateSVInnerCoverage where all edges -/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated -/// edge values from OuterConservative to InnerConservative and rasterizes. -template -struct GenerateSVInnerCoverage -{ - INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask) - { - SWR_CONTEXT *pContext = pDC->pContext; - - double startQuadEdgesAdj[RT::NumEdgesT::value]; - for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - startQuadEdgesAdj[e] = adjustScalarEdge(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); - } - - // not trivial accept or reject, must rasterize full tile - AR_BEGIN(BERasterizePartial, pDC->drawId); - innerCoverageMask = rasterizePartialTile(pDC, startQuadEdgesAdj, pRastEdges); - AR_END(BERasterizePartial, 0); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results -/// in an empty function call if SVInnerCoverage isn't requested -template -struct UpdateEdgeMasksInnerConservative -{ - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*, - const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){}; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges -/// are non-degenerate and SVInnerCoverage is requested. 
Offsets the edges -/// evaluated at raster tile corners to inner conservative position and -/// updates edge masks -template -struct UpdateEdgeMasksInnerConservative -{ - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, - const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2) - { - __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; - - // instead of keeping 2 copies of evaluated edges around, just compensate for the outer - // conservative evaluated edge when adjusting the edge in for inner conservative tests - adjustEdgeConservative(vAi, vBi, vTempEdge[0]); - adjustEdgeConservative(vAi, vBi, vTempEdge[1]); - adjustEdgeConservative(vAi, vBi, vTempEdge[2]); - - UpdateEdgeMasks(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); - } -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage -/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot -/// cover an entire raster tile, set mask0 to 0 to force it down the -/// rastierizePartialTile path -template -struct UpdateEdgeMasksInnerConservative -{ - INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*, - const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &) - { - // set one mask to zero to force the triangle down the rastierizePartialTile path - mask0 = 0; - } -}; - -template -void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) +void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) { SWR_CONTEXT *pContext = pDC->pContext; - const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); + const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); #if KNOB_ENABLE_TOSS_POINTS if (KNOB_TOSS_BIN_TRIS) { return; } #endif - AR_BEGIN(BERasterizeTriangle, pDC->drawId); - AR_BEGIN(BETriangleSetup, pDC->drawId); + + // bloat line to two tris and call the triangle rasterizer twice + AR_BEGIN(BERasterizeLine, pDC->drawId); const API_STATE &state = GetApiState(pDC); const SWR_RASTSTATE &rastState = state.rastState; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; - OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; - triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; - - __m128 vX, vY, vZ, vRecipW; - - // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care - // eg: vX = [x0 x1 x2 dc] - vX = _mm_load_ps(workDesc.pTriBuffer); - vY = _mm_load_ps(workDesc.pTriBuffer + 4); - vZ = _mm_load_ps(workDesc.pTriBuffer + 8); - vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); - - // convert to fixed point - static_assert(std::is_same>::value, "Rasterizer expects 16.8 fixed point precision"); - __m128i vXi = fpToFixedPoint(vX); - __m128i vYi = fpToFixedPoint(vY); - - // quantize floating point position to fixed point precision - // to prevent attribute creep around the triangle vertices - vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); - vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); - - // triangle setup - A and B edge equation coefs - __m128 vA, vB; - triangleSetupAB(vX, vY, vA, vB); - - __m128i vAi, vBi; - triangleSetupABInt(vXi, vYi, vAi, vBi); - - // determinant - float det = calcDeterminantInt(vAi, vBi); - - // Verts in Pixel Coordinate Space at this point - // Det > 0 = CW winding order - // Convert CW 
triangles to CCW - if (det > 0.0) - { - vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); - vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); - vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); - vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); - det = -det; - } - - __m128 vC; - // Finish triangle setup - C edge coef - triangleSetupC(vX, vY, vA, vB, vC); - - if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) - { - // If we have degenerate edge(s) to rasterize, set I and J coefs - // to 0 for constant interpolation of attributes - triDesc.I[0] = 0.0f; - triDesc.I[1] = 0.0f; - triDesc.I[2] = 0.0f; - triDesc.J[0] = 0.0f; - triDesc.J[1] = 0.0f; - triDesc.J[2] = 0.0f; - - // Degenerate triangles have no area - triDesc.recipDet = 0.0f; - } - else - { - // only extract coefs for 2 of the barycentrics; the 3rd can be - // determined from the barycentric equation: - // i + j + k = 1 <=> k = 1 - j - i - _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); - _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1); - _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1); - _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2); - _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); - _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); - - // compute recipDet, used to calculate barycentric i and j in the backend - triDesc.recipDet = 1.0f/det; - } - - OSALIGNSIMD(float) oneOverW[4]; - _mm_store_ps(oneOverW, vRecipW); - triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; - triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; - triDesc.OneOverW[2] = oneOverW[2]; - - // calculate perspective correct coefs per vertex attrib - float* pPerspAttribs = perspAttribsTLS; - float* pAttribs = workDesc.pAttribs; - triDesc.pPerspAttribs = pPerspAttribs; - triDesc.pAttribs = pAttribs; - float *pRecipW = workDesc.pTriBuffer + 12; - triDesc.pRecipW = pRecipW; - __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); - __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1); - __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1); - for(uint32_t i = 0; i < workDesc.numAttribs; i++) - { - __m128 attribA = _mm_load_ps(pAttribs); - __m128 attribB = _mm_load_ps(pAttribs+=4); - __m128 attribC = _mm_load_ps(pAttribs+=4); - pAttribs+=4; - - attribA = _mm_mul_ps(attribA, vOneOverWV0); - attribB = _mm_mul_ps(attribB, vOneOverWV1); - attribC = _mm_mul_ps(attribC, vOneOverWV2); - - _mm_store_ps(pPerspAttribs, attribA); - _mm_store_ps(pPerspAttribs+=4, attribB); - _mm_store_ps(pPerspAttribs+=4, attribC); - pPerspAttribs+=4; - } - - // compute bary Z - // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) - OSALIGNSIMD(float) a[4]; - _mm_store_ps(a, vZ); - triDesc.Z[0] = a[0] - a[2]; - triDesc.Z[1] = a[1] - a[2]; - triDesc.Z[2] = a[2]; - - // add depth bias - triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); - - // Calc bounding box of triangle - OSALIGNSIMD(SWR_RECT) bbox; - calcBoundingBoxInt(vXi, vYi, bbox); - - const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; - - if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) - { - // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid - bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++; - SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0, - "Conservative rast degenerate handling requires a valid scissor rect"); - } - - // Intersect with scissor/viewport - OSALIGNSIMD(SWR_RECT) intersect; - intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin); - intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax); - intersect.ymin = std::max(bbox.ymin, 
scissorInFixedPoint.ymin); - intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax); - - triDesc.triFlags = workDesc.triFlags; - - // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox + // macrotile dimensioning uint32_t macroX, macroY; MacroTileMgr::getTileIndices(macroTile, macroX, macroY); int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; @@ -1005,292 +66,251 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - intersect.xmin = std::max(intersect.xmin, macroBoxLeft); - intersect.ymin = std::max(intersect.ymin, macroBoxTop); - intersect.xmax = std::min(intersect.xmax, macroBoxRight); - intersect.ymax = std::min(intersect.ymax, macroBoxBottom); + const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; - SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0); + // create a copy of the triangle buffer to write our adjusted vertices to + OSALIGNSIMD(float) newTriBuffer[4 * 4]; + TRIANGLE_WORK_DESC newWorkDesc = workDesc; + newWorkDesc.pTriBuffer = &newTriBuffer[0]; - AR_END(BETriangleSetup, 0); + // create a copy of the attrib buffer to write our adjusted attribs to + OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS]; + newWorkDesc.pAttribs = &newAttribBuffer[0]; - // update triangle desc - uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - uint32_t numTilesX = maxTileX - minTileX + 1; - uint32_t numTilesY = maxTileY - minTileY + 1; + const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); + const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); - if (numTilesX == 0 || numTilesY == 0) + __m128 vX, vY, vZ, vRecipW; + + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); + + // triangle 0 + // v0,v1 -> v0,v0,v1 + __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); + + __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); + __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); + if (workDesc.triFlags.yMajor) { - RDTSC_EVENT(BEEmptyTriangle, 1, 0); - AR_END(BERasterizeTriangle, 1); - return; + vXa = _mm_add_ps(vAdjust, vXa); + } + else + { + vYa = _mm_add_ps(vAdjust, vYa); } - AR_BEGIN(BEStepSetup, pDC->drawId); + // Store triangle description for rasterizer + _mm_store_ps((float*)&newTriBuffer[0], vXa); + _mm_store_ps((float*)&newTriBuffer[4], vYa); + _mm_store_ps((float*)&newTriBuffer[8], vZa); + _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); - // Step to pixel center of top-left pixel of the triangle bbox - // Align intersect bbox (top/left) to raster tile's (top/left). 
- int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); - int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); - - // convenience typedef - typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT; - - // single sample rasterization evaluates edges at pixel center, - // multisample evaluates edges UL pixel corner and steps to each sample position - if(std::is_same::value) + // binner bins 3 edges for lines as v0, v1, v1 + // tri0 needs v0, v0, v1 + for (uint32_t a = 0; a < workDesc.numAttribs; ++a) { - // Add 0.5, in fixed point, to offset to pixel center - x += (FIXED_POINT_SCALE / 2); - y += (FIXED_POINT_SCALE / 2); + __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); + __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); + + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1); } - __m128i vTopLeftX = _mm_set1_epi32(x); - __m128i vTopLeftY = _mm_set1_epi32(y); - - // evaluate edge equations at top-left pixel using 64bit math - // - // line = Ax + By + C - // solving for C: - // C = -Ax - By - // we know x0 and y0 are on the line; plug them in: - // C = -Ax0 - By0 - // plug C back into line equation: - // line = Ax - By - Ax0 - By0 - // line = A(x - x0) + B(y - y0) - // dX = (x-x0), dY = (y-y0) - // so all this simplifies to - // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within - - __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); - __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); - - // evaluate A(dx) and B(dY) for all points - __m256d vAipd = _mm256_cvtepi32_pd(vAi); - __m256d vBipd = _mm256_cvtepi32_pd(vBi); - __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); - __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); - - __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); - __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); - __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); - - // apply any edge adjustments(top-left, crast, etc) - adjustEdgesFix16(vAi, vBi, vEdge); - - // broadcast respective edge results to all lanes - double* pEdge = (double*)&vEdge; - __m256d vEdgeFix16[7]; - vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); - vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]); - vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); - - OSALIGNSIMD(int32_t) aAi[4], aBi[4]; - _mm_store_si128((__m128i*)aAi, vAi); - _mm_store_si128((__m128i*)aBi, vBi); - EDGE rastEdges[RT::NumEdgesT::value]; - - // Compute and store triangle edge data - ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]); - ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]); - ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); - - // Compute and store triangle edge data if scissor needs to rasterized - ComputeScissorEdges - (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); - - // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile - // used to for testing if entire raster tile is inside a triangle - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) + // Store user clip distances for triangle 0 + float newClipBuffer[3 * 8]; + uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask); + if (numClipDist) { - vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets); - } + newWorkDesc.pUserClipBuffer = newClipBuffer; - // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox - // step sample 
positions to the raster tile bbox of multisample points - // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples) - // | | - // | | - // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) - __m256d vEdgeTileBbox[3]; - if (NumCoverageSamplesT::value > 1) - { - const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions; - const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX(); - const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY(); - - __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); - __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); - - // step edge equation tests from Tile - // used to for testing if entire raster tile is inside a triangle - for (uint32_t e = 0; e < 3; ++e) + float* pOldBuffer = workDesc.pUserClipBuffer; + float* pNewBuffer = newClipBuffer; + for (uint32_t i = 0; i < numClipDist; ++i) { - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); - vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + // read barycentric coeffs from binner + float a = *(pOldBuffer++); + float b = *(pOldBuffer++); - // adjust for msaa tile bbox edges outward for conservative rast, if enabled - adjustEdgeConservative(vAi, vBi, vEdgeTileBbox[e]); + // reconstruct original clip distance at vertices + float c0 = a + b; + float c1 = b; + + // construct triangle barycentrics + *(pNewBuffer++) = c0 - c1; + *(pNewBuffer++) = c0 - c1; + *(pNewBuffer++) = c1; } } - AR_END(BEStepSetup, 0); + // setup triangle rasterizer function + PFN_WORK_FUNC pfnTriRast; + // conservative rast not supported for points/lines + pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, + SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false)); - uint32_t tY = minTileY; - uint32_t tX = minTileX; - uint32_t maxY = maxTileY; - uint32_t maxX = maxTileX; + // make sure this macrotile intersects the triangle + __m128i vXai = fpToFixedPoint(vXa); + __m128i vYai = fpToFixedPoint(vYa); + OSALIGNSIMD(SWR_RECT) bboxA; + calcBoundingBoxInt(vXai, vYai, bboxA); - RenderOutputBuffers renderBuffers, currentRenderBufferRow; - GetRenderHotTiles(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex); - currentRenderBufferRow = renderBuffers; + if (!(bboxA.xmin > macroBoxRight || + bboxA.xmin > scissorInFixedPoint.xmax || + bboxA.xmax - 1 < macroBoxLeft || + bboxA.xmax - 1 < scissorInFixedPoint.xmin || + bboxA.ymin > macroBoxBottom || + bboxA.ymin > scissorInFixedPoint.ymax || + bboxA.ymax - 1 < macroBoxTop || + bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { + // rasterize triangle + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); + } - // rasterize and generate coverage masks per sample - for (uint32_t tileY = tY; tileY <= maxY; ++tileY) + // triangle 1 + // v0,v1 -> v1,v1,v0 + vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); + vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); + vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); + vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); + + vAdjust = _mm_mul_ps(vLineWidth, vBloat1); + if (workDesc.triFlags.yMajor) { - __m256d vStartOfRowEdge[RT::NumEdgesT::value]; - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) + vXa = _mm_add_ps(vAdjust, vXa); + } + else + { + vYa = _mm_add_ps(vAdjust, 
vYa); + } + + // Store triangle description for rasterizer + _mm_store_ps((float*)&newTriBuffer[0], vXa); + _mm_store_ps((float*)&newTriBuffer[4], vYa); + _mm_store_ps((float*)&newTriBuffer[8], vZa); + _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); + + // binner bins 3 edges for lines as v0, v1, v1 + // tri1 needs v1, v1, v0 + for (uint32_t a = 0; a < workDesc.numAttribs; ++a) + { + __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); + __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); + + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); + } + + // store user clip distance for triangle 1 + if (numClipDist) + { + float* pOldBuffer = workDesc.pUserClipBuffer; + float* pNewBuffer = newClipBuffer; + for (uint32_t i = 0; i < numClipDist; ++i) { - vStartOfRowEdge[e] = vEdgeFix16[e]; + // read barycentric coeffs from binner + float a = *(pOldBuffer++); + float b = *(pOldBuffer++); + + // reconstruct original clip distance at vertices + float c0 = a + b; + float c1 = b; + + // construct triangle barycentrics + *(pNewBuffer++) = c1 - c0; + *(pNewBuffer++) = c1 - c0; + *(pNewBuffer++) = c0; } + } - for (uint32_t tileX = tX; tileX <= maxX; ++tileX) - { - triDesc.anyCoveredSamples = 0; + vXai = fpToFixedPoint(vXa); + vYai = fpToFixedPoint(vYa); + calcBoundingBoxInt(vXai, vYai, bboxA); - // is the corner of the edge outside of the raster tile? (vEdge < 0) - int mask0, mask1, mask2; - UpdateEdgeMasks(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2); + if (!(bboxA.xmin > macroBoxRight || + bboxA.xmin > scissorInFixedPoint.xmax || + bboxA.xmax - 1 < macroBoxLeft || + bboxA.xmax - 1 < scissorInFixedPoint.xmin || + bboxA.ymin > macroBoxBottom || + bboxA.ymin > scissorInFixedPoint.ymax || + bboxA.ymax - 1 < macroBoxTop || + bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { + // rasterize triangle + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); + } - for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++) - { - // trivial reject, at least one edge has all 4 corners of raster tile outside - bool trivialReject = TrivialRejectTest(mask0, mask1, mask2); + AR_END(BERasterizeLine, 1); +} - if (!trivialReject) - { - // trivial accept mask - triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; - - // Update the raster tile edge masks based on inner conservative edge offsets, if enabled - UpdateEdgeMasksInnerConservative - (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2); - - // @todo Make this a bit smarter to allow use of trivial accept when: - // 1) scissor/vp intersection rect is raster tile aligned - // 2) raster tile is entirely within scissor/vp intersection rect - if (TrivialAcceptTest(mask0, mask1, mask2)) - { - // trivial accept, all 4 corners of all 3 edges are negative - // i.e. 
raster tile completely inside triangle - triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; - if(std::is_same::value) - { - triDesc.innerCoverageMask = 0xffffffffffffffffULL; - } - RDTSC_EVENT(BETrivialAccept, 1, 0); - } - else - { - __m256d vEdgeAtSample[RT::NumEdgesT::value]; - if(std::is_same::value) - { - // should get optimized out for single sample case (global value numbering or copy propagation) - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vEdgeAtSample[e] = vEdgeFix16[e]; - } - } - else - { - const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions; - __m128i vSampleOffsetXh = samplePos.vXi(sampleNum); - __m128i vSampleOffsetYh = samplePos.vYi(sampleNum); - __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); - __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); - - // step edge equation tests from UL tile corner to pixel sample position - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); - vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]); - } - } - - double startQuadEdges[RT::NumEdgesT::value]; - const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]); - } - - // not trivial accept or reject, must rasterize full tile - AR_BEGIN(BERasterizePartial, pDC->drawId); - triDesc.coverageMask[sampleNum] = rasterizePartialTile(pDC, startQuadEdges, rastEdges); - AR_END(BERasterizePartial, 0); - - triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; - - // Output SV InnerCoverage, if needed - GenerateSVInnerCoverage(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask); - } - } - else - { - // if we're calculating coverage per sample, need to store it off. 
otherwise no covered samples, don't need to do anything - if(NumCoverageSamplesT::value > 1) - { - triDesc.coverageMask[sampleNum] = 0; - } - RDTSC_EVENT(BETrivialReject, 1, 0); - } - } +void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +{ + SWR_CONTEXT *pContext = pDC->pContext; #if KNOB_ENABLE_TOSS_POINTS - if(KNOB_TOSS_RS) - { - gToss = triDesc.coverageMask[0]; - } - else -#endif - if(triDesc.anyCoveredSamples) - { - // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered - // copy conservative coverage result to all samples - if(RT::IsConservativeT::value) - { - auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; }; - UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage); - } - - AR_BEGIN(BEPixelBackend, pDC->drawId); - backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); - AR_END(BEPixelBackend, 0); - } - - // step to the next tile in X - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); - } - StepRasterTileX(state.psState.numRenderTargets, renderBuffers); - } - - // step to the next tile in Y - for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) - { - vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); - } - StepRasterTileY(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow); + if (KNOB_TOSS_BIN_TRIS) + { + return; } +#endif - AR_END(BERasterizeTriangle, 1); + const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + // map x,y relative offsets from start of raster tile to bit position in + // coverage mask for the point + static const uint32_t coverageMap[8][8] = { + { 0, 1, 4, 5, 8, 9, 12, 13 }, + { 2, 3, 6, 7, 10, 11, 14, 15 }, + { 16, 17, 20, 21, 24, 25, 28, 29 }, + { 18, 19, 22, 23, 26, 27, 30, 31 }, + { 32, 33, 36, 37, 40, 41, 44, 45 }, + { 34, 35, 38, 39, 42, 43, 46, 47 }, + { 48, 49, 52, 53, 56, 57, 60, 61 }, + { 50, 51, 54, 55, 58, 59, 62, 63 } + }; + + OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; + + // pull point information from triangle buffer + // @todo use structs for readability + uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; + uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); + float z = *(workDesc.pTriBuffer + 2); + + // construct triangle descriptor for point + // no interpolation, set up i,j for constant interpolation of z and attribs + // @todo implement an optimized backend that doesn't require triangle information + + // compute coverage mask from x,y packed into the coverageMask flag + // mask indices by the maximum valid index for x/y of coveragemap. + uint32_t tX = workDesc.triFlags.coverageMask & 0x7; + uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; + // todo: multisample points? 
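
A minimal standalone sketch (not part of the patch) of the bit layout that the coverageMap table above encodes: each 2x2 pixel quad of the 8x8 raster tile owns four consecutive bits of the 64-bit coverage mask, and quads are numbered row-major across the tile. The helper name below is illustrative only.

#include <cassert>
#include <cstdint>

// Reproduces coverageMap[y][x] arithmetically for x, y in [0, 7].
static uint32_t CoverageBitFromXY(uint32_t x, uint32_t y)
{
    const uint32_t quad   = (y >> 1) * 4 + (x >> 1);  // which 2x2 quad, row-major over the tile
    const uint32_t inQuad = (y & 1) * 2 + (x & 1);    // sample position inside the quad
    return quad * 4 + inQuad;                         // bit index in the 64-bit coverage mask
}

int main()
{
    // spot-check a few entries against the table in RasterizeSimplePoint
    assert(CoverageBitFromXY(0, 0) == 0);
    assert(CoverageBitFromXY(2, 0) == 4);
    assert(CoverageBitFromXY(0, 2) == 16);
    assert(CoverageBitFromXY(7, 7) == 63);

    const uint64_t coverageMask = 1ULL << CoverageBitFromXY(3, 5);  // a single covered pixel
    return coverageMask != 0 ? 0 : 1;
}
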
+ triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; + + // no persp divide needed for points + triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; + triDesc.triFlags = workDesc.triFlags; + triDesc.recipDet = 1.0f; + triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; + triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; + triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; + triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; + + RenderOutputBuffers renderBuffers; + GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, + renderBuffers, triDesc.triFlags.renderTargetArrayIndex); + + AR_BEGIN(BEPixelBackend, pDC->drawId); + backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); + AR_END(BEPixelBackend, 0); } void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) @@ -1337,13 +357,13 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, *pBuf++ = upperY; pBuf++; _mm_store_ps(pBuf, _mm_set1_ps(z)); - _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f)); + _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f)); // setup triangle rasterizer function PFN_WORK_FUNC pfnTriRast; // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, - SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false)); + pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, + SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false)); // overwrite texcoords for point sprites if (isPointSpriteTexCoordEnabled) @@ -1421,383 +441,27 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); } -void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +void InitRasterizerFunctions() { - SWR_CONTEXT *pContext = pDC->pContext; - -#if KNOB_ENABLE_TOSS_POINTS - if (KNOB_TOSS_BIN_TRIS) - { - return; - } -#endif - - const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; - - // map x,y relative offsets from start of raster tile to bit position in - // coverage mask for the point - static const uint32_t coverageMap[8][8] = { - { 0, 1, 4, 5, 8, 9, 12, 13 }, - { 2, 3, 6, 7, 10, 11, 14, 15 }, - { 16, 17, 20, 21, 24, 25, 28, 29 }, - { 18, 19, 22, 23, 26, 27, 30, 31 }, - { 32, 33, 36, 37, 40, 41, 44, 45 }, - { 34, 35, 38, 39, 42, 43, 46, 47 }, - { 48, 49, 52, 53, 56, 57, 60, 61 }, - { 50, 51, 54, 55, 58, 59, 62, 63 } - }; - - OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; - - // pull point information from triangle buffer - // @todo use structs for readability - uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; - uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); - float z = *(workDesc.pTriBuffer + 2); - - // construct triangle descriptor for point - // no interpolation, set up i,j for constant interpolation of z and attribs - // @todo implement an optimized backend that doesn't require triangle information - - // compute coverage mask from x,y packed into the coverageMask flag - // mask indices by the maximum valid index for x/y of coveragemap. 
- uint32_t tX = workDesc.triFlags.coverageMask & 0x7; - uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; - // todo: multisample points? - triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; - - // no persp divide needed for points - triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; - triDesc.triFlags = workDesc.triFlags; - triDesc.recipDet = 1.0f; - triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; - triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; - triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; - triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; - - RenderOutputBuffers renderBuffers; - GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, - renderBuffers, triDesc.triFlags.renderTargetArrayIndex); - - AR_BEGIN(BEPixelBackend, pDC->drawId); - backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); - AR_END(BEPixelBackend, 0); + InitRasterizerFuncs(); } -// Get pointers to hot tile memory for color RT, depth, stencil -template -void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex) -{ - const API_STATE& state = GetApiState(pDC); - SWR_CONTEXT *pContext = pDC->pContext; - - uint32_t mx, my; - MacroTileMgr::getTileIndices(macroID, mx, my); - tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; - tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; - - // compute tile offset for active hottile buffers - const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; - uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); - offset*=numSamples; - - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while(_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, - numSamples, renderTargetArrayIndex); - pColor->state = HOTTILE_DIRTY; - renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset; - - colorHottileEnableMask &= ~(1 << rtSlot); - } - if(state.depthHottileEnable) - { - const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; - uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); - offset*=numSamples; - HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, - numSamples, renderTargetArrayIndex); - pDepth->state = HOTTILE_DIRTY; - SWR_ASSERT(pDepth->pBuffer != nullptr); - renderBuffers.pDepth = pDepth->pBuffer + offset; - } - if(state.stencilHottileEnable) - { - const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; - uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); - offset*=numSamples; - HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, - numSamples, renderTargetArrayIndex); - pStencil->state = HOTTILE_DIRTY; - SWR_ASSERT(pStencil->pBuffer != nullptr); - renderBuffers.pStencil = pStencil->pBuffer + offset; - } -} - -template -INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers) -{ - for(uint32_t rt = 0; rt < NumRT; ++rt) - { - buffers.pColor[rt] += RT::colorRasterTileStep; - } - - buffers.pDepth += RT::depthRasterTileStep; - buffers.pStencil += RT::stencilRasterTileStep; -} - -template -INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, 
RenderOutputBuffers &startBufferRow) -{ - for(uint32_t rt = 0; rt < NumRT; ++rt) - { - startBufferRow.pColor[rt] += RT::colorRasterTileRowStep; - buffers.pColor[rt] = startBufferRow.pColor[rt]; - } - startBufferRow.pDepth += RT::depthRasterTileRowStep; - buffers.pDepth = startBufferRow.pDepth; - - startBufferRow.pStencil += RT::stencilRasterTileRowStep; - buffers.pStencil = startBufferRow.pStencil; -} - -void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -{ - SWR_CONTEXT *pContext = pDC->pContext; - const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); -#if KNOB_ENABLE_TOSS_POINTS - if (KNOB_TOSS_BIN_TRIS) - { - return; - } -#endif - - // bloat line to two tris and call the triangle rasterizer twice - AR_BEGIN(BERasterizeLine, pDC->drawId); - - const API_STATE &state = GetApiState(pDC); - const SWR_RASTSTATE &rastState = state.rastState; - - // macrotile dimensioning - uint32_t macroX, macroY; - MacroTileMgr::getTileIndices(macroTile, macroX, macroY); - int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; - int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; - int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; - int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - - const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; - - // create a copy of the triangle buffer to write our adjusted vertices to - OSALIGNSIMD(float) newTriBuffer[4 * 4]; - TRIANGLE_WORK_DESC newWorkDesc = workDesc; - newWorkDesc.pTriBuffer = &newTriBuffer[0]; - - // create a copy of the attrib buffer to write our adjusted attribs to - OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS]; - newWorkDesc.pAttribs = &newAttribBuffer[0]; - - const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); - const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); - - __m128 vX, vY, vZ, vRecipW; - - vX = _mm_load_ps(workDesc.pTriBuffer); - vY = _mm_load_ps(workDesc.pTriBuffer + 4); - vZ = _mm_load_ps(workDesc.pTriBuffer + 8); - vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); - - // triangle 0 - // v0,v1 -> v0,v0,v1 - __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); - __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); - - __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); - __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); - if (workDesc.triFlags.yMajor) - { - vXa = _mm_add_ps(vAdjust, vXa); - } - else - { - vYa = _mm_add_ps(vAdjust, vYa); - } - - // Store triangle description for rasterizer - _mm_store_ps((float*)&newTriBuffer[0], vXa); - _mm_store_ps((float*)&newTriBuffer[4], vYa); - _mm_store_ps((float*)&newTriBuffer[8], vZa); - _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); - - // binner bins 3 edges for lines as v0, v1, v1 - // tri0 needs v0, v0, v1 - for (uint32_t a = 0; a < workDesc.numAttribs; ++a) - { - __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]); - __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]); - - _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0); - _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0); - _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1); - } - - // Store user clip distances for triangle 0 - float newClipBuffer[3 * 8]; - uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask); - if 
(numClipDist) - { - newWorkDesc.pUserClipBuffer = newClipBuffer; - - float* pOldBuffer = workDesc.pUserClipBuffer; - float* pNewBuffer = newClipBuffer; - for (uint32_t i = 0; i < numClipDist; ++i) - { - // read barycentric coeffs from binner - float a = *(pOldBuffer++); - float b = *(pOldBuffer++); - - // reconstruct original clip distance at vertices - float c0 = a + b; - float c1 = b; - - // construct triangle barycentrics - *(pNewBuffer++) = c0 - c1; - *(pNewBuffer++) = c0 - c1; - *(pNewBuffer++) = c1; - } - } - - // setup triangle rasterizer function - PFN_WORK_FUNC pfnTriRast; - // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, - SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false)); - - // make sure this macrotile intersects the triangle - __m128i vXai = fpToFixedPoint(vXa); - __m128i vYai = fpToFixedPoint(vYa); - OSALIGNSIMD(SWR_RECT) bboxA; - calcBoundingBoxInt(vXai, vYai, bboxA); - - if (!(bboxA.xmin > macroBoxRight || - bboxA.xmin > scissorInFixedPoint.xmax || - bboxA.xmax - 1 < macroBoxLeft || - bboxA.xmax - 1 < scissorInFixedPoint.xmin || - bboxA.ymin > macroBoxBottom || - bboxA.ymin > scissorInFixedPoint.ymax || - bboxA.ymax - 1 < macroBoxTop || - bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { - // rasterize triangle - pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); - } - - // triangle 1 - // v0,v1 -> v1,v1,v0 - vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); - vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); - vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); - vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); - - vAdjust = _mm_mul_ps(vLineWidth, vBloat1); - if (workDesc.triFlags.yMajor) - { - vXa = _mm_add_ps(vAdjust, vXa); - } - else - { - vYa = _mm_add_ps(vAdjust, vYa); - } - - // Store triangle description for rasterizer - _mm_store_ps((float*)&newTriBuffer[0], vXa); - _mm_store_ps((float*)&newTriBuffer[4], vYa); - _mm_store_ps((float*)&newTriBuffer[8], vZa); - _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); - - // binner bins 3 edges for lines as v0, v1, v1 - // tri1 needs v1, v1, v0 - for (uint32_t a = 0; a < workDesc.numAttribs; ++a) - { - __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); - __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); - - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); - _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); - } - - // store user clip distance for triangle 1 - if (numClipDist) - { - float* pOldBuffer = workDesc.pUserClipBuffer; - float* pNewBuffer = newClipBuffer; - for (uint32_t i = 0; i < numClipDist; ++i) - { - // read barycentric coeffs from binner - float a = *(pOldBuffer++); - float b = *(pOldBuffer++); - - // reconstruct original clip distance at vertices - float c0 = a + b; - float c1 = b; - - // construct triangle barycentrics - *(pNewBuffer++) = c1 - c0; - *(pNewBuffer++) = c1 - c0; - *(pNewBuffer++) = c0; - } - } - - vXai = fpToFixedPoint(vXa); - vYai = fpToFixedPoint(vYa); - calcBoundingBoxInt(vXai, vYai, bboxA); - - if (!(bboxA.xmin > macroBoxRight || - bboxA.xmin > scissorInFixedPoint.xmax || - bboxA.xmax - 1 < macroBoxLeft || - bboxA.xmax - 1 < scissorInFixedPoint.xmin || - bboxA.ymin > macroBoxBottom || - bboxA.ymin > scissorInFixedPoint.ymax || - bboxA.ymax - 1 < macroBoxTop || - bboxA.ymax - 1 < 
scissorInFixedPoint.ymin)) { - // rasterize triangle - pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); - } - - AR_END(BERasterizeLine, 1); -} - -struct RasterizerChooser -{ - typedef PFN_WORK_FUNC FuncType; - - template - static FuncType GetFunc() - { - return RasterizeTriangle>; - } -}; - // Selector for correct templated RasterizeTriangle function PFN_WORK_FUNC GetRasterizerFunc( - uint32_t numSamples, + SWR_MULTISAMPLE_COUNT numSamples, bool IsCenter, bool IsConservative, - uint32_t InputCoverage, + SWR_INPUT_COVERAGE InputCoverage, uint32_t EdgeEnable, bool RasterizeScissorEdges ) { - return TemplateArgUnroller::GetFunc( - IntArg{numSamples}, - IsCenter, - IsConservative, - IntArg{InputCoverage}, - IntArg<0, STATE_VALID_TRI_EDGE_COUNT-1>{EdgeEnable}, - RasterizeScissorEdges); + SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT); + SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT); + SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT); + + PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges]; + SWR_ASSERT(func); + + return func; } diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h index e99920aef2e..414d0f07819 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h @@ -35,6 +35,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void InitRasterizerFunctions(); INLINE __m128i fpToFixedPoint(const __m128 vIn) @@ -43,15 +44,6 @@ __m128i fpToFixedPoint(const __m128 vIn) return _mm_cvtps_epi32(vFixed); } -// Selector for correct templated RasterizeTriangle function -PFN_WORK_FUNC GetRasterizerFunc( - uint32_t numSamples, - bool IsCenter, - bool IsConservative, - uint32_t InputCoverage, - uint32_t EdgeEnable, - bool RasterizeScissorEdges); - enum TriEdgesStates { STATE_NO_VALID_EDGES = 0, @@ -72,6 +64,15 @@ enum TriEdgesValues VALID_TRI_EDGE_COUNT, }; +// Selector for correct templated RasterizeTriangle function +PFN_WORK_FUNC GetRasterizerFunc( + SWR_MULTISAMPLE_COUNT numSamples, + bool IsCenter, + bool IsConservative, + SWR_INPUT_COVERAGE InputCoverage, + uint32_t EdgeEnable, + bool RasterizeScissorEdges); + ////////////////////////////////////////////////////////////////////////// /// @brief ValidTriEdges convenience typedefs used for templated function /// specialization supported Fixed Point precisions @@ -173,7 +174,7 @@ struct RasterEdgeTraits /// (only used with conservative rasterization) /// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor? 
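
The table lookup in the new GetRasterizerFunc above replaces the removed TemplateArgUnroller path: every template permutation is instantiated ahead of time and registered once (via InitRasterizerFuncs) into gRasterizerFuncs, which is indexed by sample count, center pattern, conservative mode, input coverage, valid-edge state, and scissor handling, so per-draw selection becomes a plain array index plus a null check. A minimal sketch of that dispatch pattern, with hypothetical names and only two dimensions instead of six:

#include <cassert>

using PFN_DEMO_FUNC = int (*)(int);

template <bool IsConservative, bool IsCenter>
int DemoRasterize(int x) { return x + (IsConservative ? 10 : 0) + (IsCenter ? 1 : 0); }

static PFN_DEMO_FUNC gDemoFuncs[2][2];   // [IsConservative][IsCenter]

// Register every instantiation once, up front.
static void InitDemoFuncs()
{
    gDemoFuncs[0][0] = DemoRasterize<false, false>;
    gDemoFuncs[0][1] = DemoRasterize<false, true>;
    gDemoFuncs[1][0] = DemoRasterize<true,  false>;
    gDemoFuncs[1][1] = DemoRasterize<true,  true>;
}

// Runtime selection is a table lookup plus a sanity check.
static PFN_DEMO_FUNC GetDemoFunc(bool isConservative, bool isCenter)
{
    PFN_DEMO_FUNC func = gDemoFuncs[isConservative][isCenter];
    assert(func);                        // mirrors the SWR_ASSERT(func) above
    return func;
}

int main()
{
    InitDemoFuncs();
    return GetDemoFunc(true, false)(0) == 10 ? 0 : 1;
}
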
template -struct RasterizerTraits final : public ConservativeRastBETraits, +struct _RasterizerTraits : public ConservativeRastBETraits, public RasterEdgeTraits { typedef MultisampleTraits(NumSamplesT::value), CenterPatternT::value> MT; @@ -197,3 +198,13 @@ struct RasterizerTraits final : public ConservativeRastBETraits +struct RasterizerTraits final : public _RasterizerTraits < + std::integral_constant, + std::integral_constant, + std::integral_constant, + std::integral_constant, + std::integral_constant, + std::integral_constant > +{}; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h new file mode 100644 index 00000000000..b73a99b4540 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h @@ -0,0 +1,1376 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rasterizer.cpp +* +* @brief Implementation for the rasterizer. 
+* +******************************************************************************/ + +#include +#include + +#include "rasterizer.h" +#include "rdtsc_core.h" +#include "backend.h" +#include "utils.h" +#include "frontend.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" + +extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2]; + +template +void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex); +template +void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers); +template +void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); + +#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} +static const __m256d gMaskToVecpd[] = +{ + MASKTOVEC(0, 0, 0, 0), + MASKTOVEC(0, 0, 0, 1), + MASKTOVEC(0, 0, 1, 0), + MASKTOVEC(0, 0, 1, 1), + MASKTOVEC(0, 1, 0, 0), + MASKTOVEC(0, 1, 0, 1), + MASKTOVEC(0, 1, 1, 0), + MASKTOVEC(0, 1, 1, 1), + MASKTOVEC(1, 0, 0, 0), + MASKTOVEC(1, 0, 0, 1), + MASKTOVEC(1, 0, 1, 0), + MASKTOVEC(1, 0, 1, 1), + MASKTOVEC(1, 1, 0, 0), + MASKTOVEC(1, 1, 0, 1), + MASKTOVEC(1, 1, 1, 0), + MASKTOVEC(1, 1, 1, 1), +}; + +struct POS +{ + int32_t x, y; +}; + +struct EDGE +{ + double a, b; // a, b edge coefficients in fix8 + double stepQuadX; // step to adjacent horizontal quad in fix16 + double stepQuadY; // step to adjacent vertical quad in fix16 + double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 + double stepRasterTileY; // step to adjacent vertical raster tile in fix16 + + __m256d vQuadOffsets; // offsets for 4 samples of a quad + __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief rasterize a raster tile partially covered by the triangle +/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile +/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) +/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. +/// Used to step between quads when sweeping over the raster tile. 
+template +INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges) +{ + uint64_t coverageMask = 0; + + __m256d vEdges[NumEdges]; + __m256d vStepX[NumEdges]; + __m256d vStepY[NumEdges]; + + for (uint32_t e = 0; e < NumEdges; ++e) + { + // Step to the pixel sample locations of the 1st quad + vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets); + + // compute step to next quad (mul by 2 in x and y direction) + vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX); + vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY); + } + + // fast unrolled version for 8x8 tile +#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 + int edgeMask[NumEdges]; + uint64_t mask; + + auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);}; + auto update_lambda = [&](int e){mask &= edgeMask[e];}; + auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);}; + auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);}; + auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);}; + +// evaluate which pixels in the quad are covered +#define EVAL \ + UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda); + + // update coverage mask + // if edge 0 is degenerate and will be skipped; init the mask +#define UPDATE_MASK(bit) \ + if(std::is_same::value || std::is_same::value){\ + mask = 0xf;\ + }\ + else{\ + mask = edgeMask[0]; \ + }\ + UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \ + coverageMask |= (mask << bit); + + // step in the +x direction to the next quad +#define INCX \ + UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda); + + // step in the +y direction to the next quad +#define INCY \ + UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda); + + // step in the -x direction to the next quad +#define DECX \ + UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda); + + // sweep 2x2 quad back and forth through the raster tile, + // computing coverage masks for the entire tile + + // raster tile + // 0 1 2 3 4 5 6 7 + // x x + // x x ------------------> + // x x | + // <-----------------x x V + // .. 
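
Before the unrolled row sweep below, a scalar model of what one EVAL/UPDATE_MASK pair does may help (illustrative sketch only, not the patch's code): each edge contributes a 4-bit sign mask for the 2x2 quad, where a set bit means the sample lies on the interior side of that edge (the _mm256_movemask_pd convention, after the CW-to-CCW flip and top-left adjustments), and the per-edge masks are ANDed together and ORed into the 64-bit tile mask at the quad's base bit.

#include <cstdint>

// One edge, four quad samples: build the 4-bit sign mask.
static uint32_t QuadMask(const double edgeAtSample[4])
{
    uint32_t mask = 0;
    for (uint32_t s = 0; s < 4; ++s)
    {
        if (edgeAtSample[s] < 0.0)                 // negative value: sample is inside this edge
        {
            mask |= 1u << s;
        }
    }
    return mask;
}

// AND the per-edge masks and OR the quad's coverage into the tile mask.
static void AccumulateQuad(uint64_t& coverageMask, uint32_t baseBit,
                           const double e0[4], const double e1[4], const double e2[4])
{
    const uint32_t quadCoverage = QuadMask(e0) & QuadMask(e1) & QuadMask(e2);
    coverageMask |= static_cast<uint64_t>(quadCoverage) << baseBit;
}

int main()
{
    uint64_t coverageMask = 0;
    const double e0[4] = { -1, -1, -1, -1 };       // edge 0 negative at all four samples
    const double e1[4] = { -2, -2,  1,  1 };       // edge 1 cuts the quad in half
    const double e2[4] = { -3, -3, -3, -3 };
    AccumulateQuad(coverageMask, 0, e0, e1, e2);
    return coverageMask == 0x3 ? 0 : 1;            // only samples 0 and 1 are covered
}
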
+ + // row 0 + EVAL; + UPDATE_MASK(0); + INCX; + EVAL; + UPDATE_MASK(4); + INCX; + EVAL; + UPDATE_MASK(8); + INCX; + EVAL; + UPDATE_MASK(12); + INCY; + + //row 1 + EVAL; + UPDATE_MASK(28); + DECX; + EVAL; + UPDATE_MASK(24); + DECX; + EVAL; + UPDATE_MASK(20); + DECX; + EVAL; + UPDATE_MASK(16); + INCY; + + // row 2 + EVAL; + UPDATE_MASK(32); + INCX; + EVAL; + UPDATE_MASK(36); + INCX; + EVAL; + UPDATE_MASK(40); + INCX; + EVAL; + UPDATE_MASK(44); + INCY; + + // row 3 + EVAL; + UPDATE_MASK(60); + DECX; + EVAL; + UPDATE_MASK(56); + DECX; + EVAL; + UPDATE_MASK(52); + DECX; + EVAL; + UPDATE_MASK(48); +#else + uint32_t bit = 0; + for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) + { + __m256d vStartOfRowEdge[NumEdges]; + for (uint32_t e = 0; e < NumEdges; ++e) + { + vStartOfRowEdge[e] = vEdges[e]; + } + + for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) + { + int edgeMask[NumEdges]; + for (uint32_t e = 0; e < NumEdges; ++e) + { + edgeMask[e] = _mm256_movemask_pd(vEdges[e]); + } + + uint64_t mask = edgeMask[0]; + for (uint32_t e = 1; e < NumEdges; ++e) + { + mask &= edgeMask[e]; + } + coverageMask |= (mask << bit); + + // step to the next pixel in the x + for (uint32_t e = 0; e < NumEdges; ++e) + { + vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); + } + bit+=4; + } + + // step to the next row + for (uint32_t e = 0; e < NumEdges; ++e) + { + vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]); + } + } +#endif + return coverageMask; + +} +// Top left rule: +// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge +// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge +// Top left: a sample is in if it is a top or left edge. +// Out: !(horizontal && above) = !horizontal && below +// Out: !horizontal && left = !(!horizontal && left) = horizontal and right +INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) +{ + // if vA < 0, vC-- + // if vA == 0 && vB < 0, vC-- + + __m256d vEdgeOut = vEdge; + __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); + + // if vA < 0 (line is not horizontal and below) + int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); + + // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) + __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); + int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); + msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); + + // if either of these are true and we're on the line (edge == 0), bump it outside the line + vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief calculates difference in precision between the result of manh +/// calculation and the edge precision, based on compile time trait values +template +constexpr int64_t ManhToEdgePrecisionAdjust() +{ + static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, + "Inadequate precision of result of manh calculation "); + return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value); +} + +////////////////////////////////////////////////////////////////////////// +/// @struct adjustEdgeConservative +/// @brief Primary template definition used for partially specializing +/// the adjustEdgeConservative function. This struct should never +/// be instantiated. 
+/// @tparam RT: rasterizer traits +/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting? +template +struct adjustEdgeConservative +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs calculations to adjust each edge of a triangle away + /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y + /// direction. + /// + /// Uncertainty regions arise from fixed point rounding, which + /// can snap a vertex +/- by min fixed point value. + /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. + /// This allows the rasterizer to test for coverage only at the pixel center, + /// instead of having to test individual pixel corners for conservative coverage + INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + { + // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away + // from the pixel center (in the direction of the edge normal A/B) + + // edge = Ax + Bx + C - (manh/e) + // manh = manhattan distance = abs(A) + abs(B) + // e = absolute rounding error from snapping from float to fixed point precision + + // 'fixed point' multiply (in double to be avx1 friendly) + // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example + __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); + __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), + _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); + + static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, + "Inadequate precision of result of manh calculation "); + + // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision + // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right + manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust() * 0.5)); + + // move the edge away from the pixel center by the required conservative precision + 1/2 pixel + // this allows the rasterizer to do a single conservative coverage test to see if the primitive + // intersects the pixel at all + vEdge = _mm256_sub_pd(vEdge, manh); + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief adjustEdgeConservative specialization where no edge offset is needed +template +struct adjustEdgeConservative> +{ + INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief calculates the distance a degenerate BBox needs to be adjusted +/// for conservative rast based on compile time trait values +template +constexpr int64_t ConservativeScissorOffset() +{ + static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision"); + // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges + typedef std::integral_constant DegenerateEdgeOffsetT; + // 1/2 pixel edge offset + conservative offset - degenerateTriangle + return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); +} + 
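
A simplified scalar sketch of the conservative offset being applied here (plain floating point, ignoring the driver's fixed-point precision bookkeeping; the helper name is illustrative): shifting the line A*x + B*y + C by `offset` in both x and y changes its value at a fixed point by at most (|A| + |B|) * offset, so subtracting that Manhattan term moves the edge outward and lets a single evaluation at the pixel center stand in for testing every pixel corner.

#include <cmath>

// Offsets an evaluated edge equation outward by a conservative uncertainty distance.
static double OffsetEdgeConservative(double a, double b, double edgeValue, double offset)
{
    const double manhattan = (std::fabs(a) + std::fabs(b)) * offset;  // worst-case change over +/- offset
    return edgeValue - manhattan;                                     // push the edge away from the pixel center
}

int main()
{
    // an edge value that fails a plain center test (positive, i.e. outside)
    // can still pass once the conservative offset is applied
    return OffsetEdgeConservative(3.0, -4.0, 1.0, 0.5) < 0.0 ? 0 : 1;
}
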
+////////////////////////////////////////////////////////////////////////// +/// @brief Performs calculations to adjust each a vector of evaluated edges out +/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y +/// direction. +template +INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge) +{ + int64_t aabs = std::abs(static_cast(a)), babs = std::abs(static_cast(b)); + int64_t manh = ((aabs * ConservativeScissorOffset()) + (babs * ConservativeScissorOffset())) >> ManhToEdgePrecisionAdjust(); + vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Performs calculations to adjust each a scalar evaluated edge out +/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y +/// direction. +template +INLINE double adjustScalarEdge(const double a, const double b, const double Edge) +{ + int64_t aabs = std::abs(static_cast(a)), babs = std::abs(static_cast(b)); + int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust(); + return (Edge - manh); +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Perform any needed adjustments to evaluated triangle edges +template +struct adjustEdgesFix16 +{ + INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + { + static_assert(std::is_same>::value, + "Edge equation expected to be in x.16 fixed point"); + + static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled"); + + // need to apply any edge offsets before applying the top-left rule + adjustEdgeConservative(vAi, vBi, vEdge); + + adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Perform top left adjustments to evaluated triangle edges +template +struct adjustEdgesFix16> +{ + INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + { + adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); + } +}; + +// max(abs(dz/dx), abs(dz,dy) +INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) +{ + /* + // evaluate i,j at (0,0) + float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; + float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; + + // evaluate i,j at (1,0) + float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; + float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; + + // compute dz/dx + float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; + float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; + float dzdx = abs(d10 - d00); + + // evaluate i,j at (0,1) + float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; + float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; + + float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; + float dzdy = abs(d01 - d00); + */ + + // optimized version of above + float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); + float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); + + return std::max(dzdx, dzdy); +} + +INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) +{ + if (pState->depthFormat == R24_UNORM_X8_TYPELESS) + { + return (1.0f / (1 << 24)); + } + else if (pState->depthFormat == R16_UNORM) + { + return 
(1.0f / (1 << 16)); + } + else + { + SWR_ASSERT(pState->depthFormat == R32_FLOAT); + + // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) + float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); + uint32_t zMaxInt = *(uint32_t*)&zMax; + zMaxInt &= 0x7f800000; + zMax = *(float*)&zMaxInt; + + return zMax * (1.0f / (1 << 23)); + } +} + +INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) +{ + if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) + { + return 0.0f; + } + + float scale = pState->slopeScaledDepthBias; + if (scale != 0.0f) + { + scale *= ComputeMaxDepthSlope(pTri); + } + + float bias = pState->depthBias; + if (!pState->depthBiasPreAdjusted) + { + bias *= ComputeBiasFactor(pState, pTri, z); + } + bias += scale; + + if (pState->depthBiasClamp > 0.0f) + { + bias = std::min(bias, pState->depthBiasClamp); + } + else if (pState->depthBiasClamp < 0.0f) + { + bias = std::max(bias, pState->depthBiasClamp); + } + + return bias; +} + +// Prevent DCE by writing coverage mask from rasterizer to volatile +#if KNOB_ENABLE_TOSS_POINTS +__declspec(thread) volatile uint64_t gToss; +#endif + +static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; +// try to avoid _chkstk insertions; make this thread local +static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib]; + +INLINE +void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) +{ + edge.a = a; + edge.b = b; + + // compute constant steps to adjacent quads + edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); + edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); + + // compute constant steps to adjacent raster tiles + edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); + edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); + + // compute quad offsets + const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); + const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); + + __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); + __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); + edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); + + // compute raster tile offsets + const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0); + const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0); + + __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); + __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); + edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); +} + +INLINE +void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) +{ + ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Primary template definition used for partially specializing +/// the UpdateEdgeMasks function. 
Offset evaluated edges from UL pixel +/// corner to sample position, and test for coverage +/// @tparam sampleCount: multisample count +template +INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, + int32_t &mask0, int32_t &mask1, int32_t &mask2) +{ + __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; + // evaluate edge equations at the tile multisample bounding box + vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); + vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); + vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); + mask0 = _mm256_movemask_pd(vSampleBboxTest0); + mask1 = _mm256_movemask_pd(vSampleBboxTest1); + mask2 = _mm256_movemask_pd(vSampleBboxTest2); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief UpdateEdgeMasks specialization, instantiated +/// when only rasterizing a single coverage test point +template <> +INLINE void UpdateEdgeMasks(const __m256d(&)[3], const __m256d* vEdgeFix16, + int32_t &mask0, int32_t &mask1, int32_t &mask2) +{ + mask0 = _mm256_movemask_pd(vEdgeFix16[0]); + mask1 = _mm256_movemask_pd(vEdgeFix16[1]); + mask2 = _mm256_movemask_pd(vEdgeFix16[2]); +} + +////////////////////////////////////////////////////////////////////////// +/// @struct ComputeScissorEdges +/// @brief Primary template definition. Allows the function to be generically +/// called. When paired with below specializations, will result in an empty +/// inlined function if scissor is not enabled +/// @tparam RasterScissorEdgesT: is scissor enabled? +/// @tparam IsConservativeT: is conservative rast enabled? +/// @tparam RT: rasterizer traits +template +struct ComputeScissorEdges +{ + INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief ComputeScissorEdges partial +/// specialization. Instantiated when conservative rast and scissor are enabled +template +struct ComputeScissorEdges +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, + /// evaluate edge equations and offset them away from pixel center. 
+ INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) + { + // if conservative rasterizing, triangle bbox intersected with scissor bbox is used + SWR_RECT scissor; + scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin); + scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax); + scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin); + scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax); + + POS topLeft{scissor.xmin, scissor.ymin}; + POS bottomLeft{scissor.xmin, scissor.ymax}; + POS topRight{scissor.xmax, scissor.ymin}; + POS bottomRight{scissor.xmax, scissor.ymax}; + + // construct 4 scissor edges in ccw direction + ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); + ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); + ComputeEdgeData(bottomRight, topRight, rastEdges[5]); + ComputeEdgeData(topRight, topLeft, rastEdges[6]); + + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); + + // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing + adjustScissorEdge(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); + adjustScissorEdge(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); + adjustScissorEdge(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]); + adjustScissorEdge(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]); + + // Upper left rule for scissor + vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); + vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief ComputeScissorEdges partial +/// specialization. Instantiated when scissor is enabled and conservative rast +/// is disabled. 
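
The specializations above and below fold the scissor into the same coverage machinery as the triangle itself: each scissor side becomes one more CCW edge equation (a = y0 - y1, b = x1 - x0, as in ComputeEdgeData), evaluated relative to its own start corner. A standalone sketch with illustrative names, using the same sign convention (negative means inside):

#include <cassert>

struct DemoEdge { double a, b, originX, originY; };

// Build an edge equation from p0 to p1, anchored at p0.
static DemoEdge MakeEdge(double x0, double y0, double x1, double y1)
{
    return DemoEdge{ y0 - y1, x1 - x0, x0, y0 };
}

static double EvalEdge(const DemoEdge& e, double px, double py)
{
    return e.a * (px - e.originX) + e.b * (py - e.originY);
}

int main()
{
    // scissor rect corners in CCW order (y grows downward): top-left, bottom-left,
    // bottom-right, top-right
    const double xmin = 0, ymin = 0, xmax = 16, ymax = 16;
    const DemoEdge edges[4] = {
        MakeEdge(xmin, ymin, xmin, ymax),   // left
        MakeEdge(xmin, ymax, xmax, ymax),   // bottom
        MakeEdge(xmax, ymax, xmax, ymin),   // right
        MakeEdge(xmax, ymin, xmin, ymin),   // top
    };

    // a point inside the scissor evaluates negative against all four edges,
    // so the same sign test used for the triangle edges also clips to the scissor
    for (const DemoEdge& e : edges)
    {
        assert(EvalEdge(e, 5.0, 7.0) < 0.0);
    }
    return 0;
}
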
+template +struct ComputeScissorEdges +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Compute scissor edge vectors and evaluate edge equations + INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) + { + const SWR_RECT &scissor = scissorBBox; + POS topLeft{scissor.xmin, scissor.ymin}; + POS bottomLeft{scissor.xmin, scissor.ymax}; + POS topRight{scissor.xmax, scissor.ymin}; + POS bottomRight{scissor.xmax, scissor.ymax}; + + // construct 4 scissor edges in ccw direction + ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); + ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); + ComputeEdgeData(bottomRight, topRight, rastEdges[5]); + ComputeEdgeData(topRight, topLeft, rastEdges[6]); + + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); + + // Upper left rule for scissor + vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); + vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Primary function template for TrivialRejectTest. Should +/// never be called, but TemplateUnroller instantiates a few unused values, +/// so it calls a runtime assert instead of a static_assert. +template +INLINE bool TrivialRejectTest(const int, const int, const int) +{ + SWR_INVALID("Primary templated function should never be called"); + return false; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0 +/// and edge 1 for trivial coverage reject +template <> +INLINE bool TrivialRejectTest(const int mask0, const int mask1, const int) +{ + return (!(mask0 && mask1)) ? true : false; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0 +/// and edge 2 for trivial coverage reject +template <> +INLINE bool TrivialRejectTest(const int mask0, const int, const int mask2) +{ + return (!(mask0 && mask2)) ? true : false; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1 +/// and edge 2 for trivial coverage reject +template <> +INLINE bool TrivialRejectTest(const int, const int mask1, const int mask2) +{ + return (!(mask1 && mask2)) ? true : false; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all +/// primitive edges for trivial coverage reject +template <> +INLINE bool TrivialRejectTest(const int mask0, const int mask1, const int mask2) +{ + return (!(mask0 && mask1 && mask2)) ? true : false;; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief NoEdgesValidT specialization of TrivialRejectTest. 
Degenerate +/// point, so return false and rasterize against conservative BBox +template <> +INLINE bool TrivialRejectTest(const int, const int, const int) +{ + return false; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Primary function template for TrivialAcceptTest. Always returns +/// false, since it will only be called for degenerate tris, and as such +/// will never cover the entire raster tile +template +INLINE bool TrivialAcceptTest(const int, const int, const int) +{ + return false; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all +/// edge masks for a fully covered raster tile +template <> +INLINE bool TrivialAcceptTest(const int mask0, const int mask1, const int mask2) +{ + return ((mask0 & mask1 & mask2) == 0xf); +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Primary function template for GenerateSVInnerCoverage. Results +/// in an empty function call if SVInnerCoverage isn't requested +template +struct GenerateSVInnerCoverage +{ + INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Specialization of GenerateSVInnerCoverage where all edges +/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated +/// edge values from OuterConservative to InnerConservative and rasterizes. +template +struct GenerateSVInnerCoverage +{ + INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask) + { + SWR_CONTEXT *pContext = pDC->pContext; + + double startQuadEdgesAdj[RT::NumEdgesT::value]; + for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e) + { + startQuadEdgesAdj[e] = adjustScalarEdge(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); + } + + // not trivial accept or reject, must rasterize full tile + AR_BEGIN(BERasterizePartial, pDC->drawId); + innerCoverageMask = rasterizePartialTile(pDC, startQuadEdgesAdj, pRastEdges); + AR_END(BERasterizePartial, 0); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results +/// in an empty function call if SVInnerCoverage isn't requested +template +struct UpdateEdgeMasksInnerConservative +{ + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*, + const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges +/// are non-degenerate and SVInnerCoverage is requested. 
Offsets the edges +/// evaluated at raster tile corners to inner conservative position and +/// updates edge masks +template +struct UpdateEdgeMasksInnerConservative +{ + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, + const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2) + { + __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; + + // instead of keeping 2 copies of evaluated edges around, just compensate for the outer + // conservative evaluated edge when adjusting the edge in for inner conservative tests + adjustEdgeConservative(vAi, vBi, vTempEdge[0]); + adjustEdgeConservative(vAi, vBi, vTempEdge[1]); + adjustEdgeConservative(vAi, vBi, vTempEdge[2]); + + UpdateEdgeMasks(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage +/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot +/// cover an entire raster tile, set mask0 to 0 to force it down the +/// rastierizePartialTile path +template +struct UpdateEdgeMasksInnerConservative +{ + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*, + const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &) + { + // set one mask to zero to force the triangle down the rastierizePartialTile path + mask0 = 0; + } +}; + +template +void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) +{ + SWR_CONTEXT *pContext = pDC->pContext; + const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_BIN_TRIS) + { + return; + } +#endif + AR_BEGIN(BERasterizeTriangle, pDC->drawId); + AR_BEGIN(BETriangleSetup, pDC->drawId); + + const API_STATE &state = GetApiState(pDC); + const SWR_RASTSTATE &rastState = state.rastState; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; + triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; + + __m128 vX, vY, vZ, vRecipW; + + // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care + // eg: vX = [x0 x1 x2 dc] + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); + + // convert to fixed point + static_assert(std::is_same>::value, "Rasterizer expects 16.8 fixed point precision"); + __m128i vXi = fpToFixedPoint(vX); + __m128i vYi = fpToFixedPoint(vY); + + // quantize floating point position to fixed point precision + // to prevent attribute creep around the triangle vertices + vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); + vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); + + // triangle setup - A and B edge equation coefs + __m128 vA, vB; + triangleSetupAB(vX, vY, vA, vB); + + __m128i vAi, vBi; + triangleSetupABInt(vXi, vYi, vAi, vBi); + + // determinant + float det = calcDeterminantInt(vAi, vBi); + + // Verts in Pixel Coordinate Space at this point + // Det > 0 = CW winding order + // Convert CW triangles to CCW + if (det > 0.0) + { + vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); + vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); + vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); + vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); + det = -det; + } + + __m128 vC; 
+    // Finish triangle setup - C edge coef
+    triangleSetupC(vX, vY, vA, vB, vC);
+
+    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    {
+        // If we have degenerate edge(s) to rasterize, set I and J coefs
+        // to 0 for constant interpolation of attributes
+        triDesc.I[0] = 0.0f;
+        triDesc.I[1] = 0.0f;
+        triDesc.I[2] = 0.0f;
+        triDesc.J[0] = 0.0f;
+        triDesc.J[1] = 0.0f;
+        triDesc.J[2] = 0.0f;
+
+        // Degenerate triangles have no area
+        triDesc.recipDet = 0.0f;
+    }
+    else
+    {
+        // only extract coefs for 2 of the barycentrics; the 3rd can be
+        // determined from the barycentric equation:
+        // i + j + k = 1 <=> k = 1 - j - i
+        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
+        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
+        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
+        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
+        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
+        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
+
+        // compute recipDet, used to calculate barycentric i and j in the backend
+        triDesc.recipDet = 1.0f/det;
+    }
+
+    OSALIGNSIMD(float) oneOverW[4];
+    _mm_store_ps(oneOverW, vRecipW);
+    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
+    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
+    triDesc.OneOverW[2] = oneOverW[2];
+
+    // calculate perspective correct coefs per vertex attrib
+    float* pPerspAttribs = perspAttribsTLS;
+    float* pAttribs = workDesc.pAttribs;
+    triDesc.pPerspAttribs = pPerspAttribs;
+    triDesc.pAttribs = pAttribs;
+    float *pRecipW = workDesc.pTriBuffer + 12;
+    triDesc.pRecipW = pRecipW;
+    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
+    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
+    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
+    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
+    {
+        __m128 attribA = _mm_load_ps(pAttribs);
+        __m128 attribB = _mm_load_ps(pAttribs+=4);
+        __m128 attribC = _mm_load_ps(pAttribs+=4);
+        pAttribs+=4;
+
+        attribA = _mm_mul_ps(attribA, vOneOverWV0);
+        attribB = _mm_mul_ps(attribB, vOneOverWV1);
+        attribC = _mm_mul_ps(attribC, vOneOverWV2);
+
+        _mm_store_ps(pPerspAttribs, attribA);
+        _mm_store_ps(pPerspAttribs+=4, attribB);
+        _mm_store_ps(pPerspAttribs+=4, attribC);
+        pPerspAttribs+=4;
+    }
+
+    // compute bary Z
+    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
+    OSALIGNSIMD(float) a[4];
+    _mm_store_ps(a, vZ);
+    triDesc.Z[0] = a[0] - a[2];
+    triDesc.Z[1] = a[1] - a[2];
+    triDesc.Z[2] = a[2];
+
+    // add depth bias
+    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
+
+    // Calc bounding box of triangle
+    OSALIGNSIMD(SWR_RECT) bbox;
+    calcBoundingBoxInt(vXi, vYi, bbox);
+
+    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
+    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    {
+        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
+        bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
+        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
+                   "Conservative rast degenerate handling requires a valid scissor rect");
+    }
+
+    // Intersect with scissor/viewport
+    OSALIGNSIMD(SWR_RECT) intersect;
+    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
+    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
+    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
+    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
+
+    triDesc.triFlags = workDesc.triFlags;
+
+    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
+    uint32_t macroX, macroY;
+    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
+    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
+
+    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
+    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
+    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
+    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
+
+    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
+               intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
+
+    AR_END(BETriangleSetup, 0);
+
+    // update triangle desc
+    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t numTilesX = maxTileX - minTileX + 1;
+    uint32_t numTilesY = maxTileY - minTileY + 1;
+
+    if (numTilesX == 0 || numTilesY == 0)
+    {
+        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
+        AR_END(BERasterizeTriangle, 1);
+        return;
+    }
+
+    AR_BEGIN(BEStepSetup, pDC->drawId);
+
+    // Step to pixel center of top-left pixel of the triangle bbox
+    // Align intersect bbox (top/left) to raster tile's (top/left).
+    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
+    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
+
+    // convenience typedef
+    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
+
+    // single sample rasterization evaluates edges at pixel center,
+    // multisample evaluates edges at UL pixel corner and steps to each sample position
+    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+    {
+        // Add 0.5, in fixed point, to offset to pixel center
+        x += (FIXED_POINT_SCALE / 2);
+        y += (FIXED_POINT_SCALE / 2);
+    }
+
+    __m128i vTopLeftX = _mm_set1_epi32(x);
+    __m128i vTopLeftY = _mm_set1_epi32(y);
+
+    // evaluate edge equations at top-left pixel using 64bit math
+    //
+    // line = Ax + By + C
+    // solving for C:
+    // C = -Ax - By
+    // we know x0 and y0 are on the line; plug them in:
+    // C = -Ax0 - By0
+    // plug C back into line equation:
+    // line = Ax + By - Ax0 - By0
+    // line = A(x - x0) + B(y - y0)
+    // dX = (x-x0), dY = (y-y0)
+    // so all this simplifies to
+    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
+
+    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
+    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
+
+    // evaluate A(dX) and B(dY) for all points
+    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
+    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
+    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
+    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
+
+    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
+    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
+    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
+
+    // apply any edge adjustments (top-left rule, conservative rast, etc)
+    adjustEdgesFix16(vAi, vBi, vEdge);
+
+    // broadcast respective edge results to all lanes
+    double* pEdge = (double*)&vEdge;
+    __m256d vEdgeFix16[7];
+    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
+    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
+    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
+
+    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
+    _mm_store_si128((__m128i*)aAi, vAi);
+    _mm_store_si128((__m128i*)aBi, vBi);
+    EDGE rastEdges[RT::NumEdgesT::value];
+
+    // Compute and store triangle edge data
+    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
+    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
+    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
+
+    // Compute and store scissor edge data if the scissor edges need to be rasterized
+    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
+        (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
+
+    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
+    // used for testing if the entire raster tile is inside a triangle
+    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+    {
+        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+    }
+
+    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
+    // step sample positions to the raster tile bbox of multisample points
+    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
+    //                             |      |
+    //                             |      |
+    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
+    __m256d vEdgeTileBbox[3];
+    if (NumCoverageSamplesT::value > 1)
+    {
+        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
+        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
+
+        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
+        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
+
+        // step edge equation tests from the tile corner to the sample bbox corners,
+        // used for testing if the entire raster tile is inside a triangle
+        for (uint32_t e = 0; e < 3; ++e)
+        {
+            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+
+            // adjust the msaa tile bbox edges outward for conservative rast, if enabled
+            adjustEdgeConservative(vAi, vBi, vEdgeTileBbox[e]);
+        }
+    }
+
+    AR_END(BEStepSetup, 0);
+
+    uint32_t tY = minTileY;
+    uint32_t tX = minTileX;
+    uint32_t maxY = maxTileY;
+    uint32_t maxX = maxTileX;
+
+    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
+    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+    currentRenderBufferRow = renderBuffers;
+
+    // rasterize and generate coverage masks per sample
+    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
+    {
+        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
+        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            vStartOfRowEdge[e] = vEdgeFix16[e];
+        }
+
+        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
+        {
+            triDesc.anyCoveredSamples = 0;
+
+            // is the corner of the edge outside of the raster tile? (vEdge < 0)
+            int mask0, mask1, mask2;
+            UpdateEdgeMasks(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
+
+            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
+            {
+                // trivial reject: at least one edge has all 4 corners of the raster tile outside
+                bool trivialReject = TrivialRejectTest(mask0, mask1, mask2);
+
+                if (!trivialReject)
+                {
+                    // trivial accept mask
+                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
+
+                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
+                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
+                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
+
+                    // @todo Make this a bit smarter to allow use of trivial accept when:
+                    //   1) scissor/vp intersection rect is raster tile aligned
+                    //   2) raster tile is entirely within scissor/vp intersection rect
+                    if (TrivialAcceptTest(mask0, mask1, mask2))
+                    {
+                        // trivial accept, all 4 corners of all 3 edges are negative
+                        // i.e. raster tile completely inside triangle
+                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
+                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
+                        {
+                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
+                        }
+                        RDTSC_EVENT(BETrivialAccept, 1, 0);
+                    }
+                    else
+                    {
+                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
+                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+                        {
+                            // should get optimized out for single sample case (global value numbering or copy propagation)
+                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                            {
+                                vEdgeAtSample[e] = vEdgeFix16[e];
+                            }
+                        }
+                        else
+                        {
+                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
+                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
+                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
+                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
+
+                            // step edge equation tests from UL tile corner to pixel sample position
+                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                            {
+                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+                            }
+                        }
+
+                        double startQuadEdges[RT::NumEdgesT::value];
+                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                        {
+                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
+                        }
+
+                        // not trivial accept or reject, must rasterize full tile
+                        AR_BEGIN(BERasterizePartial, pDC->drawId);
+                        triDesc.coverageMask[sampleNum] = rasterizePartialTile(pDC, startQuadEdges, rastEdges);
+                        AR_END(BERasterizePartial, 0);
+
+                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
+
+                        // Output SV InnerCoverage, if needed
+                        GenerateSVInnerCoverage(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
+                    }
+                }
+                else
+                {
+                    // if we're calculating coverage per sample, need to store it off; otherwise there are no covered samples and nothing to do
+                    if(NumCoverageSamplesT::value > 1)
+                    {
+                        triDesc.coverageMask[sampleNum] = 0;
+                    }
+                    RDTSC_EVENT(BETrivialReject, 1, 0);
+                }
+            }
+
+#if KNOB_ENABLE_TOSS_POINTS
+            if(KNOB_TOSS_RS)
+            {
+                gToss = triDesc.coverageMask[0];
+            }
+            else
+#endif
+            if(triDesc.anyCoveredSamples)
+            {
+                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
+                // copy conservative coverage result to all samples
+                if(RT::IsConservativeT::value)
+                {
+                    auto copyCoverage = [&](int sample){ triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
+                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
+                }
+
+                AR_BEGIN(BEPixelBackend, pDC->drawId);
+                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
+                AR_END(BEPixelBackend, 0);
+            }
+
+            // step to the next tile in X
+            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+            {
+                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
+            }
+            StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
+        }
+
+        // step to the next tile in Y
+        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
+        }
+        StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
+    }
+
+    AR_END(BERasterizeTriangle, 1);
+}
+
+// Get pointers to hot tile memory for color RT, depth, stencil
+template <uint32_t numSamples>
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
+{
+    const API_STATE& state = GetApiState(pDC);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    uint32_t mx, my;
+    MacroTileMgr::getTileIndices(macroID, mx, my);
+    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
+    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
+
+    // compute tile offset for active hottile buffers
+    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+    uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+    offset *= numSamples;
+
+    unsigned long rtSlot = 0;
+    uint32_t colorHottileEnableMask = state.colorHottileEnable;
+    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
+    {
+        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
+                                                            numSamples, renderTargetArrayIndex);
+        pColor->state = HOTTILE_DIRTY;
+        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
+
+        colorHottileEnableMask &= ~(1 << rtSlot);
+    }
+    if(state.depthHottileEnable)
+    {
+        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+        offset *= numSamples;
+        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
+                                                            numSamples, renderTargetArrayIndex);
+        pDepth->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pDepth->pBuffer != nullptr);
+        renderBuffers.pDepth = pDepth->pBuffer + offset;
+    }
+    if(state.stencilHottileEnable)
+    {
+        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+        offset *= numSamples;
+        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
+                                                              numSamples, renderTargetArrayIndex);
+        pStencil->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pStencil->pBuffer != nullptr);
+        renderBuffers.pStencil = pStencil->pBuffer + offset;
+    }
+}
+
+template <typename RT>
+INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
+{
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        buffers.pColor[rt] += RT::colorRasterTileStep;
+    }
+
+    buffers.pDepth += RT::depthRasterTileStep;
+    buffers.pStencil += RT::stencilRasterTileStep;
+}
+
+template <typename RT>
+INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
+{
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
+        buffers.pColor[rt] = startBufferRow.pColor[rt];
+    }
+    startBufferRow.pDepth += RT::depthRasterTileRowStep;
+    buffers.pDepth = startBufferRow.pDepth;
+
+    startBufferRow.pStencil += RT::stencilRasterTileRowStep;
+    buffers.pStencil = startBufferRow.pStencil;
+}
+
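[editor's note, not part of the patch] The per-tile trivial accept/reject logic in RasterizeTriangle above boils down to evaluating each edge equation at the four corners of a raster tile and classifying the tile from the resulting sign masks. The following standalone scalar sketch illustrates that idea under simplified assumptions: all names (MakeEdge, ClassifyTile, the 8x8 tile size) are hypothetical, the math is done in plain integers rather than 16.8 fixed point and AVX lanes, and the sign convention is flipped relative to the SWR code (here "inside" is non-negative, whereas the code above converts CW to CCW and treats inside as negative).

// trivial_tile_test.cpp - scalar illustration of per-tile trivial accept/reject
#include <cstdint>
#include <cstdio>

struct Edge { int64_t a, b, c; };   // edge function E(x,y) = a*x + b*y + c

// Build an edge function from two vertices; for a triangle with positive
// signed area (mathematically CCW), interior points evaluate >= 0.
static Edge MakeEdge(int64_t x0, int64_t y0, int64_t x1, int64_t y1)
{
    return Edge{ y0 - y1, x1 - x0, x0 * y1 - y0 * x1 };
}

static int64_t EvalEdge(const Edge &e, int64_t x, int64_t y)
{
    return e.a * x + e.b * y + e.c;
}

enum TileClass { TILE_TRIVIAL_REJECT, TILE_TRIVIAL_ACCEPT, TILE_PARTIAL };

// Classify a tile by testing each edge at the tile's 4 corners:
//  - one edge with all 4 corners outside  -> whole tile outside (trivial reject)
//  - all edges with all 4 corners inside  -> whole tile inside  (trivial accept)
//  - anything else                        -> partial, needs per-pixel rasterization
static TileClass ClassifyTile(const Edge edges[3], int64_t xmin, int64_t ymin, int64_t xmax, int64_t ymax)
{
    bool allInside = true;
    const int64_t xs[2] = { xmin, xmax };
    const int64_t ys[2] = { ymin, ymax };
    for (int e = 0; e < 3; ++e)
    {
        int insideCorners = 0;
        for (int cx = 0; cx < 2; ++cx)
        {
            for (int cy = 0; cy < 2; ++cy)
            {
                if (EvalEdge(edges[e], xs[cx], ys[cy]) >= 0)
                {
                    ++insideCorners;
                }
            }
        }
        if (insideCorners == 0)
        {
            return TILE_TRIVIAL_REJECT;   // one edge rejects the whole tile
        }
        if (insideCorners != 4)
        {
            allInside = false;
        }
    }
    return allInside ? TILE_TRIVIAL_ACCEPT : TILE_PARTIAL;
}

int main()
{
    // triangle with positive signed area, in integer pixel coordinates
    const Edge edges[3] = {
        MakeEdge(10, 10, 100, 20),
        MakeEdge(100, 20, 40, 90),
        MakeEdge(40, 90, 10, 10),
    };
    const char *names[] = { "reject", "accept", "partial" };

    // classify a row of 8x8 tiles crossing the triangle
    for (int tx = 0; tx < 8; ++tx)
    {
        TileClass c = ClassifyTile(edges, tx * 8, 32, tx * 8 + 7, 39);
        printf("tile %d: %s\n", tx, names[c]);
    }
    return 0;
}

In the patch, this classification is what UpdateEdgeMasks/TrivialRejectTest/TrivialAcceptTest perform with the corner evaluations packed into __m256d lanes; only tiles that land in the "partial" bucket fall through to rasterizePartialTile.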