swr/rast: Thread locked tiles improvement
- Change tilemgr TILE_ID encoding to use Morton-order (Z-order).
- Change locked tiles set to bitset. Makes clear, set, get much faster.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
8238c791dc
commit
4e52cb51b5
|
@ -42,6 +42,7 @@
|
||||||
#include "core/tilemgr.h"
|
#include "core/tilemgr.h"
|
||||||
#include "core/clip.h"
|
#include "core/clip.h"
|
||||||
#include "core/utils.h"
|
#include "core/utils.h"
|
||||||
|
#include "core/tileset.h"
|
||||||
|
|
||||||
#include "common/os.h"
|
#include "common/os.h"
|
||||||
|
|
||||||
|
@ -139,6 +140,11 @@ HANDLE SwrCreateContext(
|
||||||
BindApiThread(pContext, 0);
|
BindApiThread(pContext, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (pContext->threadInfo.SINGLE_THREADED)
|
||||||
|
{
|
||||||
|
pContext->pSingleThreadLockedTiles = new TileSet();
|
||||||
|
}
|
||||||
|
|
||||||
pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
|
pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
|
||||||
pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
|
pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
|
||||||
|
|
||||||
|
@ -245,7 +251,7 @@ void QueueWork(SWR_CONTEXT *pContext)
|
||||||
{
|
{
|
||||||
uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
|
uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
|
||||||
WorkOnFifoFE(pContext, 0, curDraw[0]);
|
WorkOnFifoFE(pContext, 0, curDraw[0]);
|
||||||
WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
|
WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -427,7 +433,8 @@ void SwrDestroyContext(HANDLE hContext)
|
||||||
delete[] pContext->ppScratch;
|
delete[] pContext->ppScratch;
|
||||||
AlignedFree(pContext->pStats);
|
AlignedFree(pContext->pStats);
|
||||||
|
|
||||||
delete(pContext->pHotTileMgr);
|
delete pContext->pHotTileMgr;
|
||||||
|
delete pContext->pSingleThreadLockedTiles;
|
||||||
|
|
||||||
pContext->~SWR_CONTEXT();
|
pContext->~SWR_CONTEXT();
|
||||||
AlignedFree(GetContext(hContext));
|
AlignedFree(GetContext(hContext));
|
||||||
|
|
|
@ -516,7 +516,7 @@ struct SWR_CONTEXT
|
||||||
|
|
||||||
uint32_t lastFrameChecked;
|
uint32_t lastFrameChecked;
|
||||||
uint64_t lastDrawChecked;
|
uint64_t lastDrawChecked;
|
||||||
TileSet singleThreadLockedTiles;
|
TileSet* pSingleThreadLockedTiles;
|
||||||
|
|
||||||
// ArchRast thread contexts.
|
// ArchRast thread contexts.
|
||||||
HANDLE* pArContext;
|
HANDLE* pArContext;
|
||||||
|
|
|
@ -49,6 +49,7 @@
|
||||||
#include "rasterizer.h"
|
#include "rasterizer.h"
|
||||||
#include "rdtsc_core.h"
|
#include "rdtsc_core.h"
|
||||||
#include "tilemgr.h"
|
#include "tilemgr.h"
|
||||||
|
#include "tileset.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -587,7 +588,7 @@ bool WorkOnFifoBE(
|
||||||
}
|
}
|
||||||
|
|
||||||
// can only work on this draw if it's not in use by other threads
|
// can only work on this draw if it's not in use by other threads
|
||||||
if (lockedTiles.find(tileID) != lockedTiles.end())
|
if (lockedTiles.get(tileID))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -645,7 +646,7 @@ bool WorkOnFifoBE(
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
|
// This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
|
||||||
lockedTiles.insert(tileID);
|
lockedTiles.set(tileID);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -62,7 +62,7 @@ struct THREAD_POOL
|
||||||
THREAD_DATA *pApiThreadData;
|
THREAD_DATA *pApiThreadData;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef std::unordered_set<uint32_t> TileSet;
|
struct TileSet;
|
||||||
|
|
||||||
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
|
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
|
||||||
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
|
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
|
||||||
|
|
|
@ -33,8 +33,6 @@
|
||||||
#include "core/multisample.h"
|
#include "core/multisample.h"
|
||||||
#include "rdtsc_core.h"
|
#include "rdtsc_core.h"
|
||||||
|
|
||||||
#define TILE_ID(x,y) ((x << 16 | y))
|
|
||||||
|
|
||||||
MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
|
MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
@ -50,26 +48,35 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t id = TILE_ID(x, y);
|
uint32_t id = getTileId(x, y);
|
||||||
|
|
||||||
MacroTileQueue &tile = mTiles[id];
|
if (id >= mTiles.size())
|
||||||
tile.mWorkItemsFE++;
|
|
||||||
tile.mId = id;
|
|
||||||
|
|
||||||
if (tile.mWorkItemsFE == 1)
|
|
||||||
{
|
{
|
||||||
tile.clear(mArena);
|
mTiles.resize((16 + id) * 2);
|
||||||
mDirtyTiles.push_back(&tile);
|
}
|
||||||
|
|
||||||
|
MacroTileQueue *pTile = mTiles[id];
|
||||||
|
if (!pTile)
|
||||||
|
{
|
||||||
|
pTile = mTiles[id] = new MacroTileQueue();
|
||||||
|
}
|
||||||
|
pTile->mWorkItemsFE++;
|
||||||
|
pTile->mId = id;
|
||||||
|
|
||||||
|
if (pTile->mWorkItemsFE == 1)
|
||||||
|
{
|
||||||
|
pTile->clear(mArena);
|
||||||
|
mDirtyTiles.push_back(pTile);
|
||||||
}
|
}
|
||||||
|
|
||||||
mWorkItemsProduced++;
|
mWorkItemsProduced++;
|
||||||
tile.enqueue_try_nosync(mArena, pWork);
|
pTile->enqueue_try_nosync(mArena, pWork);
|
||||||
}
|
}
|
||||||
|
|
||||||
void MacroTileMgr::markTileComplete(uint32_t id)
|
void MacroTileMgr::markTileComplete(uint32_t id)
|
||||||
{
|
{
|
||||||
SWR_ASSERT(mTiles.find(id) != mTiles.end());
|
SWR_ASSERT(mTiles.size() > id);
|
||||||
MacroTileQueue &tile = mTiles[id];
|
MacroTileQueue &tile = *mTiles[id];
|
||||||
uint32_t numTiles = tile.mWorkItemsFE;
|
uint32_t numTiles = tile.mWorkItemsFE;
|
||||||
InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
|
InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include "common/formats.h"
|
#include "common/formats.h"
|
||||||
|
#include "common/intrin.h"
|
||||||
#include "fifo.hpp"
|
#include "fifo.hpp"
|
||||||
#include "context.h"
|
#include "context.h"
|
||||||
#include "format_traits.h"
|
#include "format_traits.h"
|
||||||
|
@ -41,7 +42,7 @@
|
||||||
struct MacroTileQueue
|
struct MacroTileQueue
|
||||||
{
|
{
|
||||||
MacroTileQueue() { }
|
MacroTileQueue() { }
|
||||||
~MacroTileQueue() { }
|
~MacroTileQueue() { destroy(); }
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
/// @brief Returns number of work items queued for this tile.
|
/// @brief Returns number of work items queued for this tile.
|
||||||
|
@ -110,9 +111,9 @@ public:
|
||||||
MacroTileMgr(CachingArena& arena);
|
MacroTileMgr(CachingArena& arena);
|
||||||
~MacroTileMgr()
|
~MacroTileMgr()
|
||||||
{
|
{
|
||||||
for (auto &tile : mTiles)
|
for (auto *pTile : mTiles)
|
||||||
{
|
{
|
||||||
tile.second.destroy();
|
delete pTile;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -136,13 +137,20 @@ public:
|
||||||
|
|
||||||
static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
|
static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
|
||||||
{
|
{
|
||||||
y = tileID & 0xffff;
|
// Morton / Z order of tiles
|
||||||
x = (tileID >> 16) & 0xffff;
|
x = pext_u32(tileID, 0x55555555);
|
||||||
|
y = pext_u32(tileID, 0xAAAAAAAA);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE uint32_t getTileId(uint32_t x, uint32_t y)
|
||||||
|
{
|
||||||
|
// Morton / Z order of tiles
|
||||||
|
return pdep_u32(x, 0x55555555) | pdep_u32(y, 0xAAAAAAAA);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
CachingArena& mArena;
|
CachingArena& mArena;
|
||||||
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
|
std::vector<MacroTileQueue*> mTiles;
|
||||||
|
|
||||||
// Any tile that has work queued to it is a dirty tile.
|
// Any tile that has work queued to it is a dirty tile.
|
||||||
std::vector<MacroTileQueue*> mDirtyTiles;
|
std::vector<MacroTileQueue*> mDirtyTiles;
|
||||||
|
|
|
@ -0,0 +1,105 @@
|
||||||
|
/****************************************************************************
|
||||||
|
* Copyright (C) 2018 Intel Corporation. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||||
|
* IN THE SOFTWARE.
|
||||||
|
*
|
||||||
|
* @file tileset.h
|
||||||
|
*
|
||||||
|
* @brief Custom bitset class for managing locked tiles
|
||||||
|
*
|
||||||
|
******************************************************************************/
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
struct TileSet
|
||||||
|
{
|
||||||
|
~TileSet()
|
||||||
|
{
|
||||||
|
if (m_bits)
|
||||||
|
{
|
||||||
|
AlignedFree(m_bits);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
INLINE void set(size_t idx)
|
||||||
|
{
|
||||||
|
_grow(idx);
|
||||||
|
size_t& word = _get_word(idx);
|
||||||
|
word |= (size_t(1) << (idx & BITS_OFFSET));
|
||||||
|
m_maxSet = std::max(m_maxSet, idx + 1);
|
||||||
|
}
|
||||||
|
INLINE bool get(size_t idx)
|
||||||
|
{
|
||||||
|
if (idx >= m_size)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
size_t word = _get_word(idx);
|
||||||
|
return 0 != (word & (size_t(1) << (idx & BITS_OFFSET)));
|
||||||
|
}
|
||||||
|
|
||||||
|
INLINE void clear()
|
||||||
|
{
|
||||||
|
if (m_maxSet)
|
||||||
|
{
|
||||||
|
size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD;
|
||||||
|
memset(m_bits, 0, sizeof(size_t) * num_words);
|
||||||
|
m_maxSet = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
|
||||||
|
static const size_t BITS_OFFSET = BITS_PER_WORD - 1;
|
||||||
|
|
||||||
|
size_t m_size = 0;
|
||||||
|
size_t m_maxSet = 0;
|
||||||
|
size_t* m_bits = nullptr;
|
||||||
|
|
||||||
|
INLINE size_t& _get_word(size_t idx)
|
||||||
|
{
|
||||||
|
return m_bits[idx / BITS_PER_WORD];
|
||||||
|
}
|
||||||
|
|
||||||
|
void _grow(size_t idx)
|
||||||
|
{
|
||||||
|
if (idx < m_size)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET;
|
||||||
|
size_t num_words = new_size / BITS_PER_WORD;
|
||||||
|
size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64);
|
||||||
|
size_t copy_words = 0;
|
||||||
|
|
||||||
|
if (m_bits)
|
||||||
|
{
|
||||||
|
copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD;
|
||||||
|
num_words -= copy_words;
|
||||||
|
memcpy(newBits, m_bits, copy_words * sizeof(size_t));
|
||||||
|
|
||||||
|
AlignedFree(m_bits);
|
||||||
|
}
|
||||||
|
|
||||||
|
m_bits = newBits;
|
||||||
|
m_size = new_size;
|
||||||
|
|
||||||
|
memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words);
|
||||||
|
}
|
||||||
|
};
|
Loading…
Reference in New Issue