swr/rast: Rework thread binding parameters for machine partitioning
Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to SwrCreateContext. Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to control reservation of API threads. Add SwrBindApiThread() function to allow binding of API threads to reserved HW threads. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
182cc51a50
commit
20f9006603
|
@ -62,15 +62,33 @@ KNOBS = [
|
|||
'category' : 'perf',
|
||||
}],
|
||||
|
||||
['MAX_NUMA_NODES', {
|
||||
['BASE_NUMA_NODE', {
|
||||
'type' : 'uint32_t',
|
||||
'default' : '0',
|
||||
'desc' : ['Starting NUMA node index to use when allocating compute resources.',
|
||||
'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
|
||||
'category' : 'perf',
|
||||
'advanced' : True,
|
||||
}],
|
||||
|
||||
['MAX_NUMA_NODES', {
|
||||
'type' : 'uint32_t',
|
||||
'default' : '1' if sys.platform == 'win32' else '0',
|
||||
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
|
||||
' 0 == ALL NUMA-nodes in the system',
|
||||
' N == Use at most N NUMA-nodes for rendering'],
|
||||
'category' : 'perf',
|
||||
}],
|
||||
|
||||
['BASE_CORE', {
|
||||
'type' : 'uint32_t',
|
||||
'default' : '0',
|
||||
'desc' : ['Starting core index to use when allocating compute resources.',
|
||||
'Setting this to a non-zero value will reduce the maximum # of cores used.'],
|
||||
'category' : 'perf',
|
||||
'advanced' : True,
|
||||
}],
|
||||
|
||||
['MAX_CORES_PER_NUMA_NODE', {
|
||||
'type' : 'uint32_t',
|
||||
'default' : '0',
|
||||
|
@ -80,6 +98,15 @@ KNOBS = [
|
|||
'category' : 'perf',
|
||||
}],
|
||||
|
||||
['BASE_THREAD', {
|
||||
'type' : 'uint32_t',
|
||||
'default' : '0',
|
||||
'desc' : ['Starting thread index to use when allocating compute resources.',
|
||||
'Setting this to a non-zero value will reduce the maximum # of threads used.'],
|
||||
'category' : 'perf',
|
||||
'advanced' : True,
|
||||
}],
|
||||
|
||||
['MAX_THREADS_PER_CORE', {
|
||||
'type' : 'uint32_t',
|
||||
'default' : '1',
|
||||
|
|
|
@ -95,16 +95,32 @@ HANDLE SwrCreateContext(
|
|||
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
|
||||
}
|
||||
|
||||
pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
|
||||
pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
|
||||
pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
|
||||
pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
|
||||
pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
|
||||
|
||||
if (pCreateInfo->pThreadInfo)
|
||||
{
|
||||
pContext->threadInfo = *pCreateInfo->pThreadInfo;
|
||||
}
|
||||
else
|
||||
{
|
||||
pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
|
||||
pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE;
|
||||
pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE;
|
||||
pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD;
|
||||
pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
|
||||
pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
|
||||
pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
|
||||
pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
|
||||
}
|
||||
|
||||
if (pCreateInfo->pApiThreadInfo)
|
||||
{
|
||||
pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
|
||||
}
|
||||
else
|
||||
{
|
||||
pContext->apiThreadInfo.bindAPIThread0 = true;
|
||||
pContext->apiThreadInfo.numAPIReservedThreads = 1;
|
||||
pContext->apiThreadInfo.numAPIThreadsPerCore = 1;
|
||||
}
|
||||
|
||||
memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
|
||||
memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
|
||||
|
@ -113,6 +129,11 @@ HANDLE SwrCreateContext(
|
|||
|
||||
CreateThreadPool(pContext, &pContext->threadPool);
|
||||
|
||||
if (pContext->apiThreadInfo.bindAPIThread0)
|
||||
{
|
||||
BindApiThread(pContext, 0);
|
||||
}
|
||||
|
||||
pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
|
||||
pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
|
||||
|
||||
|
@ -407,6 +428,12 @@ void SwrDestroyContext(HANDLE hContext)
|
|||
AlignedFree(GetContext(hContext));
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Bind current thread to an API reserved HW thread
/// @param hContext - Handle passed back from SwrCreateContext
/// @param apiThreadId - index of reserved HW thread to bind to.
void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
{
    // Public entry point: resolve the opaque handle and forward to the
    // thread-pool level implementation.
    BindApiThread(GetContext(hContext), apiThreadId);
}
|
||||
|
||||
void SWR_API SwrSaveState(
|
||||
HANDLE hContext,
|
||||
void* pOutputStateBlock,
|
||||
|
@ -1688,6 +1715,7 @@ void SwrGetInterface(SWR_INTERFACE &out_funcs)
|
|||
{
|
||||
out_funcs.pfnSwrCreateContext = SwrCreateContext;
|
||||
out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
|
||||
out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
|
||||
out_funcs.pfnSwrSaveState = SwrSaveState;
|
||||
out_funcs.pfnSwrRestoreState = SwrRestoreState;
|
||||
out_funcs.pfnSwrSync = SwrSync;
|
||||
|
|
|
@ -181,6 +181,9 @@ class BucketManager;
|
|||
/////////////////////////////////////////////////////////////////////////
|
||||
struct SWR_THREADING_INFO
|
||||
{
|
||||
uint32_t BASE_NUMA_NODE;
|
||||
uint32_t BASE_CORE;
|
||||
uint32_t BASE_THREAD;
|
||||
uint32_t MAX_WORKER_THREADS;
|
||||
uint32_t MAX_NUMA_NODES;
|
||||
uint32_t MAX_CORES_PER_NUMA_NODE;
|
||||
|
@ -188,6 +191,24 @@ struct SWR_THREADING_INFO
|
|||
bool SINGLE_THREADED;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// SWR_API_THREADING_INFO
/// Describes the HW threads that should be set aside for API use.
/// API threads are taken from the same NUMA nodes / cores that are used
/// for SWR worker threads, so reserving threads here can reduce the
/// total number of SWR worker threads.
//////////////////////////////////////////////////////////////////////////
struct SWR_API_THREADING_INFO
{
    /// Number of HW threads to reserve for the API.
    /// Defaults to 1 when no SWR_API_THREADING_INFO is passed to
    /// SwrCreateContext.
    uint32_t numAPIReservedThreads;

    /// When non-zero, the thread that calls SwrCreateContext is bound to
    /// API reserved thread 0. Defaults to true when no
    /// SWR_API_THREADING_INFO is passed to SwrCreateContext.
    uint32_t bindAPIThread0;

    /// Number of HW threads per core that may be handed to the API.
    /// 0 means use all threads of a core; any other value is a clamp.
    /// Independent of KNOB_MAX_THREADS_PER_CORE.
    uint32_t numAPIThreadsPerCore;
};
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_CREATECONTEXT_INFO
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
@ -219,6 +240,9 @@ struct SWR_CREATECONTEXT_INFO
|
|||
// Input (optional): Threading info that overrides any set KNOB values.
|
||||
SWR_THREADING_INFO* pThreadInfo;
|
||||
|
||||
// Input (optional): Info for reserving API threads
|
||||
SWR_API_THREADING_INFO* pApiThreadInfo;
|
||||
|
||||
// Input: if set to non-zero value, overrides KNOB value for maximum
|
||||
// number of draws in flight
|
||||
uint32_t MAX_DRAWS_IN_FLIGHT;
|
||||
|
@ -236,6 +260,14 @@ SWR_FUNC(HANDLE, SwrCreateContext,
|
|||
SWR_FUNC(void, SwrDestroyContext,
|
||||
HANDLE hContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Bind current thread to an API reserved HW thread
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param apiThreadId - index of reserved HW thread to bind to.
|
||||
SWR_FUNC(void, SwrBindApiThread,
|
||||
HANDLE hContext,
|
||||
uint32_t apiThreadId);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Saves API state associated with hContext
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
|
@ -720,6 +752,7 @@ struct SWR_INTERFACE
|
|||
{
|
||||
PFNSwrCreateContext pfnSwrCreateContext;
|
||||
PFNSwrDestroyContext pfnSwrDestroyContext;
|
||||
PFNSwrBindApiThread pfnSwrBindApiThread;
|
||||
PFNSwrSaveState pfnSwrSaveState;
|
||||
PFNSwrRestoreState pfnSwrRestoreState;
|
||||
PFNSwrSync pfnSwrSync;
|
||||
|
|
|
@ -480,6 +480,7 @@ struct SWR_CONTEXT
|
|||
|
||||
THREAD_POOL threadPool; // Thread pool associated with this context
|
||||
SWR_THREADING_INFO threadInfo;
|
||||
SWR_API_THREADING_INFO apiThreadInfo;
|
||||
|
||||
uint32_t MAX_DRAWS_IN_FLIGHT;
|
||||
|
||||
|
|
|
@ -284,13 +284,20 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId =
|
|||
{
|
||||
// If MAX_WORKER_THREADS is set, or bindProcGroup is requested, only bind
|
||||
// to the proc group, not the individual HW thread.
|
||||
if (!pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
{
|
||||
affinity.Mask = KAFFINITY(1) << threadId;
|
||||
}
|
||||
else
|
||||
{
|
||||
affinity.Mask = KAFFINITY(0);
|
||||
}
|
||||
}
|
||||
|
||||
SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
|
||||
if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
|
||||
{
|
||||
SWR_INVALID("Failed to set Thread Affinity");
|
||||
}
|
||||
|
||||
#elif defined(__linux__) || defined(__gnu_linux__)
|
||||
|
||||
|
@ -727,6 +734,29 @@ void WorkOnCompute(
|
|||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Binds the calling thread to one of the HW threads reserved for
///        API use in the context's thread pool.
/// @param pContext - pointer to context; a null pointer makes this a no-op.
/// @param apiThreadId - index into the pool's reserved API thread table.
///        An index at or beyond numReservedThreads falls back to binding
///        to the process group of reserved API thread 0 (when at least
///        one thread is reserved), otherwise nothing is bound.
void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
{
    if (!pContext)
    {
        return;
    }

    const auto &pool = pContext->threadPool;
    const uint32_t reservedCount = pool.numReservedThreads;

    if (apiThreadId < reservedCount)
    {
        // Valid reserved slot: bind using the data recorded for it.
        const THREAD_DATA &slot = pool.pApiThreadData[apiThreadId];
        bindThread(pContext, slot.threadId, slot.procGroupId, slot.forceBindProcGroup);
        return;
    }

    if (reservedCount != 0)
    {
        // Out-of-range id: just bind to the process group used for API thread 0
        const THREAD_DATA &firstSlot = pool.pApiThreadData[0];
        bindThread(pContext, 0, firstSlot.procGroupId, true);
    }
}
|
||||
|
||||
template<bool IsFEThread, bool IsBEThread>
|
||||
DWORD workerThreadMain(LPVOID pData)
|
||||
{
|
||||
|
@ -752,7 +782,8 @@ DWORD workerThreadMain(LPVOID pData)
|
|||
|
||||
RDTSC_INIT(threadId);
|
||||
|
||||
uint32_t numaNode = pThreadData->numaId;
|
||||
// Only need offset numa index from base for correct masking
|
||||
uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
|
||||
uint32_t numaMask = pContext->threadPool.numaMask;
|
||||
|
||||
// flush denormals to 0
|
||||
|
@ -861,28 +892,50 @@ DWORD workerThreadInit(LPVOID pData)
|
|||
}
|
||||
template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Allocates and zeroes the per-thread stats array of every
///        DRAW_CONTEXT in the context's draw ring.
/// @param pContext - pointer to context; MAX_DRAWS_IN_FLIGHT entries of
///        dcRing are initialized.
/// @param numThreads - number of SWR_STATS entries to allocate per draw
///        context.
static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
{
    // Every draw context gets an identically sized, 64-byte aligned array.
    const size_t statsBytes = sizeof(SWR_STATS) * numThreads;

    for (uint32_t slot = 0; slot < pContext->MAX_DRAWS_IN_FLIGHT; ++slot)
    {
        auto& pStats = pContext->dcRing[slot].dynState.pStats;

        pStats = (SWR_STATS*)AlignedMalloc(statsBytes, 64);
        memset(pStats, 0, statsBytes);
    }
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Creates thread pool info but doesn't launch threads.
|
||||
/// @param pContext - pointer to context
|
||||
/// @param pPool - pointer to thread pool object.
|
||||
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
|
||||
{
|
||||
bindThread(pContext, 0);
|
||||
|
||||
CPUNumaNodes nodes;
|
||||
uint32_t numThreadsPerProcGroup = 0;
|
||||
CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
|
||||
|
||||
// Assumption, for asymmetric topologies, multi-threaded cores will appear
|
||||
// in the list before single-threaded cores. This appears to be true for
|
||||
// Windows when the total HW threads is limited to 64.
|
||||
uint32_t numHWNodes = (uint32_t)nodes.size();
|
||||
uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
|
||||
uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
|
||||
|
||||
#if defined(_WIN32) && !defined(_WIN64)
|
||||
if (!pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
{
|
||||
// Limit 32-bit windows to bindable HW threads only
|
||||
if ((numHWCoresPerNode * numHWHyperThreads) > 32)
|
||||
{
|
||||
numHWCoresPerNode = 32 / numHWHyperThreads;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Calculate num HW threads. Due to asymmetric topologies, this is not
|
||||
// a trivial multiplication.
|
||||
uint32_t numHWThreads = 0;
|
||||
for (auto& node : nodes)
|
||||
for (auto const& node : nodes)
|
||||
{
|
||||
for (auto& core : node.cores)
|
||||
for (auto const& core : node.cores)
|
||||
{
|
||||
numHWThreads += (uint32_t)core.threadIds.size();
|
||||
}
|
||||
|
@ -892,14 +945,19 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
|
|||
uint32_t numCoresPerNode = numHWCoresPerNode;
|
||||
uint32_t numHyperThreads = numHWHyperThreads;
|
||||
|
||||
if (pContext->threadInfo.MAX_NUMA_NODES)
|
||||
// Calc used threads per-core
|
||||
if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
|
||||
{
|
||||
numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
|
||||
numHyperThreads -= pContext->threadInfo.BASE_THREAD;
|
||||
}
|
||||
|
||||
if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
|
||||
else
|
||||
{
|
||||
numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
|
||||
SWR_ASSERT(
|
||||
false,
|
||||
"Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
|
||||
pContext->threadInfo.BASE_THREAD,
|
||||
numHyperThreads);
|
||||
pContext->threadInfo.BASE_THREAD = 0;
|
||||
}
|
||||
|
||||
if (pContext->threadInfo.MAX_THREADS_PER_CORE)
|
||||
|
@ -907,93 +965,139 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
|
|||
numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
|
||||
}
|
||||
|
||||
#if defined(_WIN32) && !defined(_WIN64)
|
||||
if (!pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
// Prune any cores that don't support the number of threads
|
||||
if (numHyperThreads > 1)
|
||||
{
|
||||
// Limit 32-bit windows to bindable HW threads only
|
||||
if ((numCoresPerNode * numHWHyperThreads) > 32)
|
||||
for (auto& node : nodes)
|
||||
{
|
||||
numCoresPerNode = 32 / numHWHyperThreads;
|
||||
uint32_t numUsableCores = 0;
|
||||
for (auto& core : node.cores)
|
||||
{
|
||||
numUsableCores += (core.threadIds.size() >= numHyperThreads);
|
||||
}
|
||||
numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Calculate numThreads
|
||||
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
|
||||
numThreads = std::min(numThreads, numHWThreads);
|
||||
|
||||
if (pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
// Calc used cores per NUMA node
|
||||
if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
|
||||
{
|
||||
uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
|
||||
numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
|
||||
}
|
||||
|
||||
uint32_t numAPIReservedThreads = 1;
|
||||
|
||||
|
||||
if (numThreads == 1)
|
||||
{
|
||||
// If only 1 worker threads, try to move it to an available
|
||||
// HW thread. If that fails, use the API thread.
|
||||
if (numCoresPerNode < numHWCoresPerNode)
|
||||
{
|
||||
numCoresPerNode++;
|
||||
}
|
||||
else if (numHyperThreads < numHWHyperThreads)
|
||||
{
|
||||
numHyperThreads++;
|
||||
}
|
||||
else if (numNodes < numHWNodes)
|
||||
{
|
||||
numNodes++;
|
||||
}
|
||||
else
|
||||
{
|
||||
pContext->threadInfo.SINGLE_THREADED = true;
|
||||
}
|
||||
numCoresPerNode -= pContext->threadInfo.BASE_CORE;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Save HW threads for the API if we can
|
||||
if (numThreads > numAPIReservedThreads)
|
||||
{
|
||||
numThreads -= numAPIReservedThreads;
|
||||
}
|
||||
else
|
||||
{
|
||||
numAPIReservedThreads = 0;
|
||||
}
|
||||
SWR_ASSERT(
|
||||
false,
|
||||
"Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
|
||||
pContext->threadInfo.BASE_CORE,
|
||||
numCoresPerNode);
|
||||
pContext->threadInfo.BASE_CORE = 0;
|
||||
}
|
||||
|
||||
if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
|
||||
{
|
||||
numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
|
||||
}
|
||||
|
||||
// Calc used NUMA nodes
|
||||
if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
|
||||
{
|
||||
numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
|
||||
}
|
||||
else
|
||||
{
|
||||
SWR_ASSERT(
|
||||
false,
|
||||
"Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
|
||||
pContext->threadInfo.BASE_NUMA_NODE,
|
||||
numNodes);
|
||||
pContext->threadInfo.BASE_NUMA_NODE = 0;
|
||||
}
|
||||
|
||||
if (pContext->threadInfo.MAX_NUMA_NODES)
|
||||
{
|
||||
numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
|
||||
}
|
||||
|
||||
// Calculate numThreads - at this point everything should be symmetric
|
||||
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
|
||||
SWR_REL_ASSERT(numThreads <= numHWThreads);
|
||||
|
||||
uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
|
||||
uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
|
||||
uint32_t numRemovedThreads = 0;
|
||||
|
||||
if (pContext->threadInfo.SINGLE_THREADED)
|
||||
{
|
||||
numAPIReservedThreads = 0;
|
||||
numThreads = 1;
|
||||
}
|
||||
|
||||
// Initialize DRAW_CONTEXT's per-thread stats
|
||||
for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
|
||||
{
|
||||
pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
|
||||
memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
|
||||
}
|
||||
|
||||
if (pContext->threadInfo.SINGLE_THREADED)
|
||||
{
|
||||
pContext->NumWorkerThreads = 1;
|
||||
pContext->NumFEThreads = 1;
|
||||
pContext->NumBEThreads = 1;
|
||||
pPool->numThreads = 0;
|
||||
}
|
||||
else if (pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
{
|
||||
numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
|
||||
pContext->threadInfo.BASE_NUMA_NODE = 0;
|
||||
pContext->threadInfo.BASE_CORE = 0;
|
||||
pContext->threadInfo.BASE_THREAD = 0;
|
||||
numAPIReservedThreads = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (numAPIReservedThreads >= numThreads)
|
||||
{
|
||||
numAPIReservedThreads = 0;
|
||||
}
|
||||
else if (numAPIReservedThreads)
|
||||
{
|
||||
numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
|
||||
|
||||
if (0 == numAPIThreadsPerCore)
|
||||
{
|
||||
numAPIThreadsPerCore = numHWHyperThreads;
|
||||
}
|
||||
|
||||
numRemovedThreads = numAPIReservedThreads;
|
||||
if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
|
||||
{
|
||||
// Adjust removed threads to make logic below work
|
||||
numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
|
||||
}
|
||||
|
||||
numThreads -= numRemovedThreads;
|
||||
}
|
||||
}
|
||||
|
||||
InitPerThreadStats(pContext, numThreads);
|
||||
|
||||
if (pContext->threadInfo.SINGLE_THREADED)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (numAPIReservedThreads)
|
||||
{
|
||||
pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
|
||||
SWR_ASSERT(pPool->pApiThreadData);
|
||||
if (!pPool->pApiThreadData)
|
||||
{
|
||||
numAPIReservedThreads = 0;
|
||||
}
|
||||
}
|
||||
pPool->numReservedThreads = numAPIReservedThreads;
|
||||
|
||||
pPool->numThreads = numThreads;
|
||||
pContext->NumWorkerThreads = pPool->numThreads;
|
||||
|
||||
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
|
||||
pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
|
||||
SWR_ASSERT(pPool->pThreadData);
|
||||
pPool->numaMask = 0;
|
||||
|
||||
pPool->pThreads = new THREAD_PTR[pPool->numThreads];
|
||||
|
||||
pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
|
||||
SWR_ASSERT(pPool->pThreads);
|
||||
|
||||
if (pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
{
|
||||
|
@ -1021,37 +1125,72 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
|
|||
// numa distribution assumes workers on all nodes
|
||||
bool useNuma = true;
|
||||
if (numCoresPerNode * numHyperThreads == 1)
|
||||
{
|
||||
useNuma = false;
|
||||
}
|
||||
|
||||
if (useNuma) {
|
||||
if (useNuma)
|
||||
{
|
||||
pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
pPool->numaMask = 0;
|
||||
}
|
||||
|
||||
uint32_t workerId = 0;
|
||||
uint32_t numReservedThreads = numAPIReservedThreads;
|
||||
for (uint32_t n = 0; n < numNodes; ++n)
|
||||
{
|
||||
auto& node = nodes[n];
|
||||
if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
|
||||
{
|
||||
break;
|
||||
}
|
||||
auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
|
||||
uint32_t numCores = numCoresPerNode;
|
||||
for (uint32_t c = 0; c < numCores; ++c)
|
||||
{
|
||||
if (c >= node.cores.size())
|
||||
if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
auto& core = node.cores[c];
|
||||
auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
|
||||
for (uint32_t t = 0; t < numHyperThreads; ++t)
|
||||
{
|
||||
if (t >= core.threadIds.size())
|
||||
if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (numAPIReservedThreads)
|
||||
if (numRemovedThreads)
|
||||
{
|
||||
--numAPIReservedThreads;
|
||||
--numRemovedThreads;
|
||||
SWR_REL_ASSERT(numReservedThreads);
|
||||
--numReservedThreads;
|
||||
pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
|
||||
pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
|
||||
pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
|
||||
pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
|
||||
pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
|
||||
pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
|
||||
pPool->pApiThreadData[numReservedThreads].pContext = pContext;
|
||||
pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
|
||||
|
||||
|
||||
if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
|
||||
{
|
||||
--numReservedThreads;
|
||||
pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
|
||||
pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
|
||||
pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
|
||||
pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
|
||||
pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
|
||||
pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
|
||||
pPool->pApiThreadData[numReservedThreads].pContext = pContext;
|
||||
pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -1059,11 +1198,12 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
|
|||
|
||||
pPool->pThreadData[workerId].workerId = workerId;
|
||||
pPool->pThreadData[workerId].procGroupId = core.procGroup;
|
||||
pPool->pThreadData[workerId].threadId = core.threadIds[t];
|
||||
pPool->pThreadData[workerId].numaId = useNuma ? n : 0;
|
||||
pPool->pThreadData[workerId].coreId = c;
|
||||
pPool->pThreadData[workerId].htId = t;
|
||||
pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
|
||||
pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
|
||||
pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
|
||||
pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
|
||||
pPool->pThreadData[workerId].pContext = pContext;
|
||||
pPool->pThreadData[workerId].forceBindProcGroup = false;
|
||||
|
||||
pContext->NumBEThreads++;
|
||||
pContext->NumFEThreads++;
|
||||
|
@ -1113,9 +1253,10 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
|||
delete(pPool->pThreads[t]);
|
||||
}
|
||||
|
||||
delete [] pPool->pThreads;
|
||||
delete[] pPool->pThreads;
|
||||
|
||||
// Clean up data used by threads
|
||||
free(pPool->pThreadData);
|
||||
delete[] pPool->pThreadData;
|
||||
delete[] pPool->pApiThreadData;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -55,6 +55,8 @@ struct THREAD_POOL
|
|||
uint32_t numThreads;
|
||||
uint32_t numaMask;
|
||||
THREAD_DATA *pThreadData;
|
||||
uint32_t numReservedThreads; // Number of threads reserved for API use
|
||||
THREAD_DATA *pApiThreadData;
|
||||
};
|
||||
|
||||
typedef std::unordered_set<uint32_t> TileSet;
|
||||
|
@ -68,3 +70,5 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
|
|||
bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
|
||||
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
|
||||
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
|
||||
|
||||
void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId);
|
||||
|
|
|
@ -100,7 +100,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
|
|||
{
|
||||
uint32_t size = numSamples * mHotTileSize[attachment];
|
||||
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
|
||||
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
|
||||
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
|
||||
hotTile.state = HOTTILE_INVALID;
|
||||
hotTile.numSamples = numSamples;
|
||||
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
|
||||
|
@ -124,7 +124,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
|
|||
|
||||
uint32_t size = numSamples * mHotTileSize[attachment];
|
||||
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
|
||||
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
|
||||
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
|
||||
hotTile.state = HOTTILE_INVALID;
|
||||
hotTile.numSamples = numSamples;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue