swr/rast: Thread locked tiles improvement

- Change the tilemgr TILE_ID encoding to use Morton order (Z-order).
- Change the locked tiles set to a bitset, which makes clear, set, and get much faster.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2018-05-01 19:33:38 -05:00 · 2018-05-01 19:33:38 -05:00 · 4e52cb51b5
parent 8238c791dc
commit 4e52cb51b5
7 changed files with 153 additions and 25 deletions
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@ -42,6 +42,7 @@
 #include "core/tilemgr.h"
 #include "core/clip.h"
 #include "core/utils.h"
+#include "core/tileset.h"

 #include "common/os.h"

@ -139,6 +140,11 @@ HANDLE SwrCreateContext(
        BindApiThread(pContext, 0);
    }

+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        pContext->pSingleThreadLockedTiles = new TileSet();
+    }
+
    pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
    pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);

@ -245,7 +251,7 @@ void QueueWork(SWR_CONTEXT *pContext)
        {
            uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
            WorkOnFifoFE(pContext, 0, curDraw[0]);
-            WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
        }
        else
        {
@ -427,7 +433,8 @@ void SwrDestroyContext(HANDLE hContext)
    delete[] pContext->ppScratch;
    AlignedFree(pContext->pStats);

-    delete(pContext->pHotTileMgr);
+    delete pContext->pHotTileMgr;
+    delete pContext->pSingleThreadLockedTiles;

    pContext->~SWR_CONTEXT();
    AlignedFree(GetContext(hContext));
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@ -516,7 +516,7 @@ struct SWR_CONTEXT

    uint32_t lastFrameChecked;
    uint64_t lastDrawChecked;
-    TileSet singleThreadLockedTiles;
+    TileSet* pSingleThreadLockedTiles;

    // ArchRast thread contexts.
    HANDLE* pArContext;
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@ -49,6 +49,7 @@
 #include "rasterizer.h"
 #include "rdtsc_core.h"
 #include "tilemgr.h"
+#include "tileset.h"



@ -587,7 +588,7 @@ bool WorkOnFifoBE(
            }

            // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.find(tileID) != lockedTiles.end())
+            if (lockedTiles.get(tileID))
            {
                continue;
            }
@ -645,7 +646,7 @@ bool WorkOnFifoBE(
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-                lockedTiles.insert(tileID);
+                lockedTiles.set(tileID);
            }
        }
    }
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@ -62,7 +62,7 @@ struct THREAD_POOL
    THREAD_DATA *pApiThreadData;
 };

-typedef std::unordered_set<uint32_t> TileSet;
+struct TileSet;

 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@ -33,8 +33,6 @@
 #include "core/multisample.h"
 #include "rdtsc_core.h"

-#define TILE_ID(x,y) ((x << 16 | y))
-
 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
@ -50,26 +48,35 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
        return;
    }

-    uint32_t id = TILE_ID(x, y);
+    uint32_t id = getTileId(x, y);

-    MacroTileQueue &tile = mTiles[id];
-    tile.mWorkItemsFE++;
-    tile.mId = id;
-
-    if (tile.mWorkItemsFE == 1)
+    if (id >= mTiles.size())
    {
-        tile.clear(mArena);
-        mDirtyTiles.push_back(&tile);
+        mTiles.resize((16 + id) * 2);
+    }
+
+    MacroTileQueue *pTile = mTiles[id];
+    if (!pTile)
+    {
+        pTile = mTiles[id] = new MacroTileQueue();
+    }
+    pTile->mWorkItemsFE++;
+    pTile->mId = id;
+
+    if (pTile->mWorkItemsFE == 1)
+    {
+        pTile->clear(mArena);
+        mDirtyTiles.push_back(pTile);
    }

    mWorkItemsProduced++;
-    tile.enqueue_try_nosync(mArena, pWork);
+    pTile->enqueue_try_nosync(mArena, pWork);
 }

 void MacroTileMgr::markTileComplete(uint32_t id)
 {
-    SWR_ASSERT(mTiles.find(id) != mTiles.end());
-    MacroTileQueue &tile = mTiles[id];
+    SWR_ASSERT(mTiles.size() > id);
+    MacroTileQueue &tile = *mTiles[id];
    uint32_t numTiles = tile.mWorkItemsFE;
    InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);

--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@ -31,6 +31,7 @@
 #include <set>
 #include <unordered_map>
 #include "common/formats.h"
+#include "common/intrin.h"
 #include "fifo.hpp"
 #include "context.h"
 #include "format_traits.h"
@ -41,7 +42,7 @@
 struct MacroTileQueue
 {
    MacroTileQueue() { }
-    ~MacroTileQueue() { }
+    ~MacroTileQueue() { destroy(); }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Returns number of work items queued for this tile.
@ -110,9 +111,9 @@ public:
    MacroTileMgr(CachingArena& arena);
    ~MacroTileMgr()
    {
-        for (auto &tile : mTiles)
+        for (auto *pTile : mTiles)
        {
-            tile.second.destroy();
+            delete pTile;
        }
    }

@ -136,13 +137,20 @@ public:

    static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
    {
-        y = tileID & 0xffff;
-        x = (tileID >> 16) & 0xffff;
+        // Morton / Z order of tiles
+        x = pext_u32(tileID, 0x55555555);
+        y = pext_u32(tileID, 0xAAAAAAAA);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Builds the Morton / Z-order tile id for a macrotile coordinate.
+    /// @param x - macrotile x index; routed through the even-bit mask 0x55555555.
+    /// @param y - macrotile y index; routed through the odd-bit mask 0xAAAAAAAA.
+    /// @return The bitwise OR of the two masked deposits.
+    /// @note pdep_u32 is declared outside this block; presumably it mirrors the
+    ///       BMI2 PDEP bit-deposit operation - confirm against common/intrin.h.
+    static INLINE uint32_t getTileId(uint32_t x, uint32_t y)
+    {
+        const uint32_t evenBitsFromX = pdep_u32(x, 0x55555555);
+        const uint32_t oddBitsFromY  = pdep_u32(y, 0xAAAAAAAA);
+        return evenBitsFromX | oddBitsFromY;
+    }

 private:
    CachingArena& mArena;
-    std::unordered_map<uint32_t, MacroTileQueue> mTiles;
+    std::vector<MacroTileQueue*> mTiles;

    // Any tile that has work queued to it is a dirty tile.
    std::vector<MacroTileQueue*> mDirtyTiles;
--- a/src/gallium/drivers/swr/rasterizer/core/tileset.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tileset.h
@ -0,0 +1,105 @@
+/****************************************************************************
+* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tileset.h
+*
+* @brief Custom bitset class for managing locked tiles
+*
+******************************************************************************/
+#pragma once
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Growable bitset keyed by tile id, used to remember locked tiles.
+///
+/// Bits are packed into size_t words held in a buffer obtained from
+/// AlignedMalloc and released with AlignedFree. The buffer only ever grows;
+/// clear() zeroes the used part but keeps the allocation.
+///
+/// @note This header has no #include lines of its own: INLINE, AlignedMalloc,
+///       AlignedFree, memset, memcpy and std::max must already be visible in
+///       the translation unit that includes it.
+struct TileSet
+{
+    TileSet() = default;
+
+    // The set owns m_bits through a raw pointer and frees it in the destructor.
+    // A member-wise copy would leave two objects freeing the same buffer, so
+    // copying is disabled outright.
+    TileSet(const TileSet&) = delete;
+    TileSet& operator=(const TileSet&) = delete;
+
+    ~TileSet()
+    {
+        if (m_bits)
+        {
+            AlignedFree(m_bits);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Marks idx as a member of the set, growing the storage if needed.
+    /// @param idx - bit index to set.
+    INLINE void set(size_t idx)
+    {
+        _grow(idx);
+        size_t& word = _get_word(idx);
+        word |= (size_t(1) << (idx & BITS_OFFSET));
+
+        // Track the high-water mark so clear() only touches words that were used.
+        m_maxSet = std::max(m_maxSet, idx + 1);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Tests whether idx is a member of the set.
+    /// @param idx - bit index to query.
+    /// @return true if the bit is set; false if it is clear or lies beyond the
+    ///         currently allocated range.
+    INLINE bool get(size_t idx) const
+    {
+        if (idx >= m_size)
+        {
+            return false;
+        }
+        const size_t word = m_bits[idx / BITS_PER_WORD];
+        return 0 != (word & (size_t(1) << (idx & BITS_OFFSET)));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Removes every member. Only the words up to the highest index
+    ///        set since the previous clear() are zeroed; the buffer is kept.
+    INLINE void clear()
+    {
+        if (m_maxSet)
+        {
+            size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD;
+            memset(m_bits, 0, sizeof(size_t) * num_words);
+            m_maxSet = 0;
+        }
+    }
+
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
+    static const size_t BITS_OFFSET = BITS_PER_WORD - 1;    // mask selecting the bit within a word
+
+    size_t              m_size = 0;         // capacity in bits; always a multiple of BITS_PER_WORD
+    size_t              m_maxSet = 0;       // one past the highest index set since the last clear()
+    size_t*             m_bits = nullptr;   // word storage; null until the first set()
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Returns the storage word holding bit idx. The caller must make
+    ///        sure idx is below m_size.
+    INLINE size_t& _get_word(size_t idx)
+    {
+        return m_bits[idx / BITS_PER_WORD];
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Ensures the storage can hold bit idx.
+    ///
+    /// The new capacity is idx + 1 rounded up to a whole number of words.
+    /// Existing words are copied over and the newly added words are zeroed.
+    /// NOTE(review): the AlignedMalloc result is not checked for failure.
+    void _grow(size_t idx)
+    {
+        if (idx < m_size)
+        {
+            return;
+        }
+
+        size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET;
+        size_t num_words = new_size / BITS_PER_WORD;
+        size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64);
+        size_t copy_words = 0;
+
+        if (m_bits)
+        {
+            copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD;
+            num_words -= copy_words;    // from here on: number of freshly added words
+            memcpy(newBits, m_bits, copy_words * sizeof(size_t));
+
+            AlignedFree(m_bits);
+        }
+
+        m_bits = newBits;
+        m_size = new_size;
+
+        // Zero only the tail that was not filled from the old buffer.
+        memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words);
+    }
+};