swr/rast: Rework thread binding parameters for machine partitioning

Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to
SwrCreateContext.

Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to
control reservation of API threads.

Add SwrBindApiThread() function to allow binding of API threads to
reserved HW threads.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
Author: Tim Rowley
Date:   2017-12-11 17:45:58 -06:00
parent 182cc51a50
commit 20f9006603
7 changed files with 329 additions and 95 deletions


@@ -62,15 +62,33 @@ KNOBS = [
'category' : 'perf',
}],
['MAX_NUMA_NODES', {
['BASE_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting NUMA node index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
'category' : 'perf',
'advanced' : True,
}],
['MAX_NUMA_NODES', {
'type' : 'uint32_t',
'default' : '1' if sys.platform == 'win32' else '0',
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
' 0 == ALL NUMA-nodes in the system',
' N == Use at most N NUMA-nodes for rendering'],
'category' : 'perf',
}],
['BASE_CORE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting core index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of cores used.'],
'category' : 'perf',
'advanced' : True,
}],
['MAX_CORES_PER_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
@@ -80,6 +98,15 @@ KNOBS = [
'category' : 'perf',
}],
['BASE_THREAD', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting thread index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of threads used.'],
'category' : 'perf',
'advanced' : True,
}],
['MAX_THREADS_PER_CORE', {
'type' : 'uint32_t',
'default' : '1',

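As an illustration of how the BASE_*/MAX_* knobs partition a machine (a sketch
under an assumed topology, not part of the change itself):

    // On a 2-node machine with 8 cores per node and 2 threads per core,
    //     KNOB_BASE_CORE = 4, KNOB_MAX_CORES_PER_NUMA_NODE = 4,
    //     KNOB_BASE_THREAD = 0, KNOB_MAX_THREADS_PER_CORE = 1
    // gives the worker pool cores 4..7 on each node, one thread per core,
    // leaving cores 0..3 free for reserved API threads or another process.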

@@ -95,16 +95,32 @@ HANDLE SwrCreateContext(
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
}
pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
if (pCreateInfo->pThreadInfo)
{
pContext->threadInfo = *pCreateInfo->pThreadInfo;
}
else
{
pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE;
pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE;
pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD;
pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
}
if (pCreateInfo->pApiThreadInfo)
{
pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
}
else
{
pContext->apiThreadInfo.bindAPIThread0 = true;
pContext->apiThreadInfo.numAPIReservedThreads = 1;
pContext->apiThreadInfo.numAPIThreadsPerCore = 1;
}
memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@@ -113,6 +129,11 @@ HANDLE SwrCreateContext(
CreateThreadPool(pContext, &pContext->threadPool);
if (pContext->apiThreadInfo.bindAPIThread0)
{
BindApiThread(pContext, 0);
}
pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
@@ -407,6 +428,12 @@ void SwrDestroyContext(HANDLE hContext)
AlignedFree(GetContext(hContext));
}
void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
{
SWR_CONTEXT *pContext = GetContext(hContext);
BindApiThread(pContext, apiThreadId);
}
void SWR_API SwrSaveState(
HANDLE hContext,
void* pOutputStateBlock,
@@ -1688,6 +1715,7 @@ void SwrGetInterface(SWR_INTERFACE &out_funcs)
{
out_funcs.pfnSwrCreateContext = SwrCreateContext;
out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
out_funcs.pfnSwrSaveState = SwrSaveState;
out_funcs.pfnSwrRestoreState = SwrRestoreState;
out_funcs.pfnSwrSync = SwrSync;


@@ -181,6 +181,9 @@ class BucketManager;
/////////////////////////////////////////////////////////////////////////
struct SWR_THREADING_INFO
{
uint32_t BASE_NUMA_NODE;
uint32_t BASE_CORE;
uint32_t BASE_THREAD;
uint32_t MAX_WORKER_THREADS;
uint32_t MAX_NUMA_NODES;
uint32_t MAX_CORES_PER_NUMA_NODE;
@@ -188,6 +191,24 @@ struct SWR_THREADING_INFO
bool SINGLE_THREADED;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_API_THREADING_INFO
/// Data used to reserve HW threads for API use.
/// API threads are reserved from the NUMA nodes / cores used for
/// SWR worker threads; specifying reserved threads here can reduce
/// the total number of SWR worker threads.
/////////////////////////////////////////////////////////////////////////
struct SWR_API_THREADING_INFO
{
uint32_t numAPIReservedThreads; // Default is 1 if SWR_API_THREADING_INFO is not provided
uint32_t bindAPIThread0; // Default is true if numAPIReservedThreads > 0;
// binds the thread calling SwrCreateContext to API reserved
// thread 0
uint32_t numAPIThreadsPerCore; // 0 means use all threads per core; otherwise clamp to this number.
// Independent of KNOB_MAX_THREADS_PER_CORE.
};
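When pApiThreadInfo is omitted, SwrCreateContext behaves as if the following
had been passed (mirroring the defaults shown in the SwrCreateContext hunk
above):

    SWR_API_THREADING_INFO defaults = {};
    defaults.numAPIReservedThreads = 1;    // reserve one HW thread for the API
    defaults.bindAPIThread0        = true; // bind the SwrCreateContext caller to it
    defaults.numAPIThreadsPerCore  = 1;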
//////////////////////////////////////////////////////////////////////////
/// SWR_CREATECONTEXT_INFO
/////////////////////////////////////////////////////////////////////////
@@ -219,6 +240,9 @@ struct SWR_CREATECONTEXT_INFO
// Input (optional): Threading info that overrides any set KNOB values.
SWR_THREADING_INFO* pThreadInfo;
// Input (optional): Info for reserving API threads
SWR_API_THREADING_INFO* pApiThreadInfo;
// Input: if set to non-zero value, overrides KNOB value for maximum
// number of draws in flight
uint32_t MAX_DRAWS_IN_FLIGHT;
@@ -236,6 +260,14 @@ SWR_FUNC(HANDLE, SwrCreateContext,
SWR_FUNC(void, SwrDestroyContext,
HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Bind current thread to an API reserved HW thread
/// @param hContext - Handle passed back from SwrCreateContext
/// @param apiThreadId - index of reserved HW thread to bind to.
SWR_FUNC(void, SwrBindApiThread,
HANDLE hContext,
uint32_t apiThreadId);
//////////////////////////////////////////////////////////////////////////
/// @brief Saves API state associated with hContext
/// @param hContext - Handle passed back from SwrCreateContext
@@ -720,6 +752,7 @@ struct SWR_INTERFACE
{
PFNSwrCreateContext pfnSwrCreateContext;
PFNSwrDestroyContext pfnSwrDestroyContext;
PFNSwrBindApiThread pfnSwrBindApiThread;
PFNSwrSaveState pfnSwrSaveState;
PFNSwrRestoreState pfnSwrRestoreState;
PFNSwrSync pfnSwrSync;


@@ -480,6 +480,7 @@ struct SWR_CONTEXT
THREAD_POOL threadPool; // Thread pool associated with this context
SWR_THREADING_INFO threadInfo;
SWR_API_THREADING_INFO apiThreadInfo;
uint32_t MAX_DRAWS_IN_FLIGHT;


@@ -284,13 +284,20 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId =
{
// If MAX_WORKER_THREADS is set, only bind to the proc group,
// Not the individual HW thread.
if (!pContext->threadInfo.MAX_WORKER_THREADS)
if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
{
affinity.Mask = KAFFINITY(1) << threadId;
}
else
{
affinity.Mask = KAFFINITY(0);
}
}
SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
{
SWR_INVALID("Failed to set Thread Affinity");
}
#elif defined(__linux__) || defined(__gnu_linux__)
@@ -727,6 +734,29 @@ void WorkOnCompute(
}
}
void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
{
if (nullptr == pContext)
{
return;
}
if (apiThreadId >= pContext->threadPool.numReservedThreads)
{
if (pContext->threadPool.numReservedThreads)
{
const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
// Just bind to the proc group used for API thread 0
bindThread(pContext, 0, threadData.procGroupId, true);
}
return;
}
const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];
bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
}
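Note the fallback above: an out-of-range apiThreadId (or a pool with no
reserved threads) degrades to binding the caller to reserved thread 0's proc
group instead of failing. A sketch of what callers can expect (hContext as
returned by SwrCreateContext):

    SwrBindApiThread(hContext, 0); // binds to reserved HW thread 0
    SwrBindApiThread(hContext, 7); // out of range if numAPIReservedThreads <= 7;
                                   // falls back to reserved thread 0's proc group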
template<bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
@@ -752,7 +782,8 @@ DWORD workerThreadMain(LPVOID pData)
RDTSC_INIT(threadId);
uint32_t numaNode = pThreadData->numaId;
// Only need offset numa index from base for correct masking
uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
uint32_t numaMask = pContext->threadPool.numaMask;
// flush denormals to 0
@@ -861,28 +892,50 @@ DWORD workerThreadInit(LPVOID pData)
}
template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
{
// Initialize DRAW_CONTEXT's per-thread stats
for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
{
pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
}
}
//////////////////////////////////////////////////////////////////////////
/// @brief Creates thread pool info but doesn't launch threads.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
bindThread(pContext, 0);
CPUNumaNodes nodes;
uint32_t numThreadsPerProcGroup = 0;
CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
// Assumption: for asymmetric topologies, multi-threaded cores will appear
// in the list before single-threaded cores. This appears to be true for
// Windows when the total number of HW threads is limited to 64.
uint32_t numHWNodes = (uint32_t)nodes.size();
uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
#if defined(_WIN32) && !defined(_WIN64)
if (!pContext->threadInfo.MAX_WORKER_THREADS)
{
// Limit 32-bit windows to bindable HW threads only
if ((numHWCoresPerNode * numHWHyperThreads) > 32)
{
numHWCoresPerNode = 32 / numHWHyperThreads;
}
}
#endif
// Calculate num HW threads. Due to asymmetric topologies, this is not
// a trivial multiplication.
uint32_t numHWThreads = 0;
for (auto& node : nodes)
for (auto const& node : nodes)
{
for (auto& core : node.cores)
for (auto const& core : node.cores)
{
numHWThreads += (uint32_t)core.threadIds.size();
}
@@ -892,14 +945,19 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
uint32_t numCoresPerNode = numHWCoresPerNode;
uint32_t numHyperThreads = numHWHyperThreads;
if (pContext->threadInfo.MAX_NUMA_NODES)
// Calc used threads per-core
if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
{
numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
numHyperThreads -= pContext->threadInfo.BASE_THREAD;
}
if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
else
{
numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
SWR_ASSERT(
false,
"Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
pContext->threadInfo.BASE_THREAD,
numHyperThreads);
pContext->threadInfo.BASE_THREAD = 0;
}
if (pContext->threadInfo.MAX_THREADS_PER_CORE)
@@ -907,93 +965,139 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
}
#if defined(_WIN32) && !defined(_WIN64)
if (!pContext->threadInfo.MAX_WORKER_THREADS)
// Prune any cores that don't support the number of threads
if (numHyperThreads > 1)
{
// Limit 32-bit windows to bindable HW threads only
if ((numCoresPerNode * numHWHyperThreads) > 32)
for (auto& node : nodes)
{
numCoresPerNode = 32 / numHWHyperThreads;
uint32_t numUsableCores = 0;
for (auto& core : node.cores)
{
numUsableCores += (core.threadIds.size() >= numHyperThreads);
}
numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
}
}
#endif
// Calculate numThreads
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
numThreads = std::min(numThreads, numHWThreads);
if (pContext->threadInfo.MAX_WORKER_THREADS)
// Calc used cores per NUMA node
if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
{
uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
}
uint32_t numAPIReservedThreads = 1;
if (numThreads == 1)
{
// If there is only 1 worker thread, try to move it to an available
// HW thread. If that fails, use the API thread.
if (numCoresPerNode < numHWCoresPerNode)
{
numCoresPerNode++;
}
else if (numHyperThreads < numHWHyperThreads)
{
numHyperThreads++;
}
else if (numNodes < numHWNodes)
{
numNodes++;
}
else
{
pContext->threadInfo.SINGLE_THREADED = true;
}
numCoresPerNode -= pContext->threadInfo.BASE_CORE;
}
else
{
// Save HW threads for the API if we can
if (numThreads > numAPIReservedThreads)
{
numThreads -= numAPIReservedThreads;
}
else
{
numAPIReservedThreads = 0;
}
SWR_ASSERT(
false,
"Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
pContext->threadInfo.BASE_CORE,
numCoresPerNode);
pContext->threadInfo.BASE_CORE = 0;
}
if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
{
numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
}
// Calc used NUMA nodes
if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
{
numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
}
else
{
SWR_ASSERT(
false,
"Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
pContext->threadInfo.BASE_NUMA_NODE,
numNodes);
pContext->threadInfo.BASE_NUMA_NODE = 0;
}
if (pContext->threadInfo.MAX_NUMA_NODES)
{
numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
}
// Calculate numThreads - at this point everything should be symmetric
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
SWR_REL_ASSERT(numThreads <= numHWThreads);
uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
uint32_t numRemovedThreads = 0;
if (pContext->threadInfo.SINGLE_THREADED)
{
numAPIReservedThreads = 0;
numThreads = 1;
}
// Initialize DRAW_CONTEXT's per-thread stats
for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
{
pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
}
if (pContext->threadInfo.SINGLE_THREADED)
{
pContext->NumWorkerThreads = 1;
pContext->NumFEThreads = 1;
pContext->NumBEThreads = 1;
pPool->numThreads = 0;
}
else if (pContext->threadInfo.MAX_WORKER_THREADS)
{
numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
pContext->threadInfo.BASE_NUMA_NODE = 0;
pContext->threadInfo.BASE_CORE = 0;
pContext->threadInfo.BASE_THREAD = 0;
numAPIReservedThreads = 0;
}
else
{
if (numAPIReservedThreads >= numThreads)
{
numAPIReservedThreads = 0;
}
else if (numAPIReservedThreads)
{
numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
if (0 == numAPIThreadsPerCore)
{
numAPIThreadsPerCore = numHWHyperThreads;
}
numRemovedThreads = numAPIReservedThreads;
if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
{
// Adjust removed threads to make logic below work
numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
}
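// Worked example of the adjustment above: if workers use one thread per
// core (numHyperThreads == 1) but API threads pack two per core
// (numAPIThreadsPerCore == 2), each displaced core donates both of its HW
// threads to the API, so reserving 4 API threads removes only
// ceil(4 / 2) == 2 worker threads.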
numThreads -= numRemovedThreads;
}
}
InitPerThreadStats(pContext, numThreads);
if (pContext->threadInfo.SINGLE_THREADED)
{
return;
}
if (numAPIReservedThreads)
{
pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
SWR_ASSERT(pPool->pApiThreadData);
if (!pPool->pApiThreadData)
{
numAPIReservedThreads = 0;
}
}
pPool->numReservedThreads = numAPIReservedThreads;
pPool->numThreads = numThreads;
pContext->NumWorkerThreads = pPool->numThreads;
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
SWR_ASSERT(pPool->pThreadData);
pPool->numaMask = 0;
pPool->pThreads = new THREAD_PTR[pPool->numThreads];
pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
SWR_ASSERT(pPool->pThreads);
if (pContext->threadInfo.MAX_WORKER_THREADS)
{
@@ -1021,37 +1125,72 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
// numa distribution assumes workers on all nodes
bool useNuma = true;
if (numCoresPerNode * numHyperThreads == 1)
{
useNuma = false;
}
if (useNuma) {
if (useNuma)
{
pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
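// e.g. numNodes == 4 -> numaMask == 0x3; hot-tile placement hashes
// (x ^ y) against this mask to spread tiles across the worker nodes.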
} else {
}
else
{
pPool->numaMask = 0;
}
uint32_t workerId = 0;
uint32_t numReservedThreads = numAPIReservedThreads;
for (uint32_t n = 0; n < numNodes; ++n)
{
auto& node = nodes[n];
if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
{
break;
}
auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
uint32_t numCores = numCoresPerNode;
for (uint32_t c = 0; c < numCores; ++c)
{
if (c >= node.cores.size())
if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
{
break;
}
auto& core = node.cores[c];
auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
for (uint32_t t = 0; t < numHyperThreads; ++t)
{
if (t >= core.threadIds.size())
if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
{
break;
}
if (numAPIReservedThreads)
if (numRemovedThreads)
{
--numAPIReservedThreads;
--numRemovedThreads;
SWR_REL_ASSERT(numReservedThreads);
--numReservedThreads;
pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
pPool->pApiThreadData[numReservedThreads].pContext = pContext;
pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
{
--numReservedThreads;
pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
pPool->pApiThreadData[numReservedThreads].pContext = pContext;
pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
}
continue;
}
@@ -1059,11 +1198,12 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
pPool->pThreadData[workerId].workerId = workerId;
pPool->pThreadData[workerId].procGroupId = core.procGroup;
pPool->pThreadData[workerId].threadId = core.threadIds[t];
pPool->pThreadData[workerId].numaId = useNuma ? n : 0;
pPool->pThreadData[workerId].coreId = c;
pPool->pThreadData[workerId].htId = t;
pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
pPool->pThreadData[workerId].pContext = pContext;
pPool->pThreadData[workerId].forceBindProcGroup = false;
pContext->NumBEThreads++;
pContext->NumFEThreads++;
@@ -1113,9 +1253,10 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
delete(pPool->pThreads[t]);
}
delete [] pPool->pThreads;
delete[] pPool->pThreads;
// Clean up data used by threads
free(pPool->pThreadData);
delete[] pPool->pThreadData;
delete[] pPool->pApiThreadData;
}
}


@@ -55,6 +55,8 @@ struct THREAD_POOL
uint32_t numThreads;
uint32_t numaMask;
THREAD_DATA *pThreadData;
uint32_t numReservedThreads; // Number of threads reserved for API use
THREAD_DATA *pApiThreadData;
};
typedef std::unordered_set<uint32_t> TileSet;
@@ -68,3 +70,5 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId);


@@ -100,7 +100,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
{
uint32_t size = numSamples * mHotTileSize[attachment];
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
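// numaNode from the mask hash is a relative index; adding BASE_NUMA_NODE
// converts it to the absolute node the workers are actually bound to.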
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -124,7 +124,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
uint32_t size = numSamples * mHotTileSize[attachment];
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
}