swr/rast: Thread locked tiles improvement
- Change the tilemgr TILE_ID encoding to use Morton order (Z-order). - Change the locked tiles set to a bitset, which makes the clear, set, and get operations much faster. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
8238c791dc
commit
4e52cb51b5
|
@ -42,6 +42,7 @@
|
|||
#include "core/tilemgr.h"
|
||||
#include "core/clip.h"
|
||||
#include "core/utils.h"
|
||||
#include "core/tileset.h"
|
||||
|
||||
#include "common/os.h"
|
||||
|
||||
|
@ -139,6 +140,11 @@ HANDLE SwrCreateContext(
|
|||
BindApiThread(pContext, 0);
|
||||
}
|
||||
|
||||
if (pContext->threadInfo.SINGLE_THREADED)
|
||||
{
|
||||
pContext->pSingleThreadLockedTiles = new TileSet();
|
||||
}
|
||||
|
||||
pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
|
||||
pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
|
||||
|
||||
|
@ -245,7 +251,7 @@ void QueueWork(SWR_CONTEXT *pContext)
|
|||
{
|
||||
uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
|
||||
WorkOnFifoFE(pContext, 0, curDraw[0]);
|
||||
WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
|
||||
WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -427,7 +433,8 @@ void SwrDestroyContext(HANDLE hContext)
|
|||
delete[] pContext->ppScratch;
|
||||
AlignedFree(pContext->pStats);
|
||||
|
||||
delete(pContext->pHotTileMgr);
|
||||
delete pContext->pHotTileMgr;
|
||||
delete pContext->pSingleThreadLockedTiles;
|
||||
|
||||
pContext->~SWR_CONTEXT();
|
||||
AlignedFree(GetContext(hContext));
|
||||
|
|
|
@ -516,7 +516,7 @@ struct SWR_CONTEXT
|
|||
|
||||
uint32_t lastFrameChecked;
|
||||
uint64_t lastDrawChecked;
|
||||
TileSet singleThreadLockedTiles;
|
||||
TileSet* pSingleThreadLockedTiles;
|
||||
|
||||
// ArchRast thread contexts.
|
||||
HANDLE* pArContext;
|
||||
|
|
|
@ -49,6 +49,7 @@
|
|||
#include "rasterizer.h"
|
||||
#include "rdtsc_core.h"
|
||||
#include "tilemgr.h"
|
||||
#include "tileset.h"
|
||||
|
||||
|
||||
|
||||
|
@ -587,7 +588,7 @@ bool WorkOnFifoBE(
|
|||
}
|
||||
|
||||
// can only work on this draw if it's not in use by other threads
|
||||
if (lockedTiles.find(tileID) != lockedTiles.end())
|
||||
if (lockedTiles.get(tileID))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
@ -645,7 +646,7 @@ bool WorkOnFifoBE(
|
|||
else
|
||||
{
|
||||
// This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
|
||||
lockedTiles.insert(tileID);
|
||||
lockedTiles.set(tileID);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -62,7 +62,7 @@ struct THREAD_POOL
|
|||
THREAD_DATA *pApiThreadData;
|
||||
};
|
||||
|
||||
typedef std::unordered_set<uint32_t> TileSet;
|
||||
struct TileSet;
|
||||
|
||||
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
|
||||
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
|
||||
|
|
|
@ -33,8 +33,6 @@
|
|||
#include "core/multisample.h"
|
||||
#include "rdtsc_core.h"
|
||||
|
||||
#define TILE_ID(x,y) ((x << 16 | y))
|
||||
|
||||
MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
|
||||
{
|
||||
}
|
||||
|
@ -50,26 +48,35 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
|
|||
return;
|
||||
}
|
||||
|
||||
uint32_t id = TILE_ID(x, y);
|
||||
uint32_t id = getTileId(x, y);
|
||||
|
||||
MacroTileQueue &tile = mTiles[id];
|
||||
tile.mWorkItemsFE++;
|
||||
tile.mId = id;
|
||||
|
||||
if (tile.mWorkItemsFE == 1)
|
||||
if (id >= mTiles.size())
|
||||
{
|
||||
tile.clear(mArena);
|
||||
mDirtyTiles.push_back(&tile);
|
||||
mTiles.resize((16 + id) * 2);
|
||||
}
|
||||
|
||||
MacroTileQueue *pTile = mTiles[id];
|
||||
if (!pTile)
|
||||
{
|
||||
pTile = mTiles[id] = new MacroTileQueue();
|
||||
}
|
||||
pTile->mWorkItemsFE++;
|
||||
pTile->mId = id;
|
||||
|
||||
if (pTile->mWorkItemsFE == 1)
|
||||
{
|
||||
pTile->clear(mArena);
|
||||
mDirtyTiles.push_back(pTile);
|
||||
}
|
||||
|
||||
mWorkItemsProduced++;
|
||||
tile.enqueue_try_nosync(mArena, pWork);
|
||||
pTile->enqueue_try_nosync(mArena, pWork);
|
||||
}
|
||||
|
||||
void MacroTileMgr::markTileComplete(uint32_t id)
|
||||
{
|
||||
SWR_ASSERT(mTiles.find(id) != mTiles.end());
|
||||
MacroTileQueue &tile = mTiles[id];
|
||||
SWR_ASSERT(mTiles.size() > id);
|
||||
MacroTileQueue &tile = *mTiles[id];
|
||||
uint32_t numTiles = tile.mWorkItemsFE;
|
||||
InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include "common/formats.h"
|
||||
#include "common/intrin.h"
|
||||
#include "fifo.hpp"
|
||||
#include "context.h"
|
||||
#include "format_traits.h"
|
||||
|
@ -41,7 +42,7 @@
|
|||
struct MacroTileQueue
|
||||
{
|
||||
MacroTileQueue() { }
|
||||
~MacroTileQueue() { }
|
||||
~MacroTileQueue() { destroy(); }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Returns number of work items queued for this tile.
|
||||
|
@ -110,9 +111,9 @@ public:
|
|||
MacroTileMgr(CachingArena& arena);
|
||||
~MacroTileMgr()
|
||||
{
|
||||
for (auto &tile : mTiles)
|
||||
for (auto *pTile : mTiles)
|
||||
{
|
||||
tile.second.destroy();
|
||||
delete pTile;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -136,13 +137,20 @@ public:
|
|||
|
||||
static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
|
||||
{
|
||||
y = tileID & 0xffff;
|
||||
x = (tileID >> 16) & 0xffff;
|
||||
// Morton / Z order of tiles
|
||||
x = pext_u32(tileID, 0x55555555);
|
||||
y = pext_u32(tileID, 0xAAAAAAAA);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Combines a tile's x/y coordinates into a single tile ID.
/// @param x - tile x coordinate; passed to pdep_u32 with mask 0x55555555.
/// @param y - tile y coordinate; passed to pdep_u32 with mask 0xAAAAAAAA.
/// @return Bitwise OR of the two pdep_u32 results.
/// @note pdep_u32 is defined elsewhere; presumably a parallel bit deposit,
///       so x lands in the even bits and y in the odd bits - confirm.
static INLINE uint32_t getTileId(uint32_t x, uint32_t y)
{
    // Morton / Z order of tiles
    const uint32_t xBits = pdep_u32(x, 0x55555555);
    const uint32_t yBits = pdep_u32(y, 0xAAAAAAAA);

    return xBits | yBits;
}
|
||||
|
||||
private:
|
||||
CachingArena& mArena;
|
||||
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
|
||||
std::vector<MacroTileQueue*> mTiles;
|
||||
|
||||
// Any tile that has work queued to it is a dirty tile.
|
||||
std::vector<MacroTileQueue*> mDirtyTiles;
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file tileset.h
|
||||
*
|
||||
* @brief Custom bitset class for managing locked tiles
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
struct TileSet
|
||||
{
|
||||
~TileSet()
|
||||
{
|
||||
if (m_bits)
|
||||
{
|
||||
AlignedFree(m_bits);
|
||||
}
|
||||
}
|
||||
INLINE void set(size_t idx)
|
||||
{
|
||||
_grow(idx);
|
||||
size_t& word = _get_word(idx);
|
||||
word |= (size_t(1) << (idx & BITS_OFFSET));
|
||||
m_maxSet = std::max(m_maxSet, idx + 1);
|
||||
}
|
||||
INLINE bool get(size_t idx)
|
||||
{
|
||||
if (idx >= m_size)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
size_t word = _get_word(idx);
|
||||
return 0 != (word & (size_t(1) << (idx & BITS_OFFSET)));
|
||||
}
|
||||
|
||||
INLINE void clear()
|
||||
{
|
||||
if (m_maxSet)
|
||||
{
|
||||
size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD;
|
||||
memset(m_bits, 0, sizeof(size_t) * num_words);
|
||||
m_maxSet = 0;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
|
||||
static const size_t BITS_OFFSET = BITS_PER_WORD - 1;
|
||||
|
||||
size_t m_size = 0;
|
||||
size_t m_maxSet = 0;
|
||||
size_t* m_bits = nullptr;
|
||||
|
||||
INLINE size_t& _get_word(size_t idx)
|
||||
{
|
||||
return m_bits[idx / BITS_PER_WORD];
|
||||
}
|
||||
|
||||
void _grow(size_t idx)
|
||||
{
|
||||
if (idx < m_size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET;
|
||||
size_t num_words = new_size / BITS_PER_WORD;
|
||||
size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64);
|
||||
size_t copy_words = 0;
|
||||
|
||||
if (m_bits)
|
||||
{
|
||||
copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD;
|
||||
num_words -= copy_words;
|
||||
memcpy(newBits, m_bits, copy_words * sizeof(size_t));
|
||||
|
||||
AlignedFree(m_bits);
|
||||
}
|
||||
|
||||
m_bits = newBits;
|
||||
m_size = new_size;
|
||||
|
||||
memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words);
|
||||
}
|
||||
};
|
Loading…
Reference in New Issue