swr/rast: Rework thread binding parameters for machine partitioning

Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to SwrCreateContext. Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to control reservation of API threads. Add SwrBindApiThread() function to allow binding of API threads to reserved HW threads. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2017-12-11 17:45:58 -06:00 · 2017-12-11 17:45:58 -06:00 · 20f9006603
parent 182cc51a50
commit 20f9006603
7 changed files with 329 additions and 95 deletions
--- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
@ -62,15 +62,33 @@ KNOBS = [
        'category'  : 'perf',
    }],

-    ['MAX_NUMA_NODES', {
+    ['BASE_NUMA_NODE', {
        'type'      : 'uint32_t',
        'default'   : '0',
+        'desc'      : ['Starting NUMA node index to use when allocating compute resources.',
+                       'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
+        'category'  : 'perf',
+        'advanced'  : True,
+    }],
+
+    ['MAX_NUMA_NODES', {
+        'type'      : 'uint32_t',
+        'default'   : '1' if sys.platform == 'win32' else '0',
        'desc'      : ['Maximum # of NUMA-nodes per system used for worker threads',
                       '  0 == ALL NUMA-nodes in the system',
                       '  N == Use at most N NUMA-nodes for rendering'],
        'category'  : 'perf',
    }],

+    ['BASE_CORE', {
+        'type'      : 'uint32_t',
+        'default'   : '0',
+        'desc'      : ['Starting core index to use when allocating compute resources.',
+                       'Setting this to a non-zero value will reduce the maximum # of cores used.'],
+        'category'  : 'perf',
+        'advanced'  : True,
+    }],
+
    ['MAX_CORES_PER_NUMA_NODE', {
        'type'      : 'uint32_t',
        'default'   : '0',
@ -80,6 +98,15 @@ KNOBS = [
        'category'  : 'perf',
    }],

+    ['BASE_THREAD', {
+        'type'      : 'uint32_t',
+        'default'   : '0',
+        'desc'      : ['Starting thread index to use when allocating compute resources.',
+                       'Setting this to a non-zero value will reduce the maximum # of threads used.'],
+        'category'  : 'perf',
+        'advanced'  : True,
+    }],
+
    ['MAX_THREADS_PER_CORE', {
        'type'      : 'uint32_t',
        'default'   : '1',
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@ -95,16 +95,32 @@ HANDLE SwrCreateContext(
        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
    }

-    pContext->threadInfo.MAX_WORKER_THREADS        = KNOB_MAX_WORKER_THREADS;
-    pContext->threadInfo.MAX_NUMA_NODES            = KNOB_MAX_NUMA_NODES;
-    pContext->threadInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
-    pContext->threadInfo.MAX_THREADS_PER_CORE      = KNOB_MAX_THREADS_PER_CORE;
-    pContext->threadInfo.SINGLE_THREADED           = KNOB_SINGLE_THREADED;
-
    if (pCreateInfo->pThreadInfo)
    {
        pContext->threadInfo = *pCreateInfo->pThreadInfo;
    }
+    else
+    {
+        pContext->threadInfo.MAX_WORKER_THREADS         = KNOB_MAX_WORKER_THREADS;
+        pContext->threadInfo.BASE_NUMA_NODE             = KNOB_BASE_NUMA_NODE;
+        pContext->threadInfo.BASE_CORE                  = KNOB_BASE_CORE;
+        pContext->threadInfo.BASE_THREAD                = KNOB_BASE_THREAD;
+        pContext->threadInfo.MAX_NUMA_NODES             = KNOB_MAX_NUMA_NODES;
+        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE    = KNOB_MAX_CORES_PER_NUMA_NODE;
+        pContext->threadInfo.MAX_THREADS_PER_CORE       = KNOB_MAX_THREADS_PER_CORE;
+        pContext->threadInfo.SINGLE_THREADED            = KNOB_SINGLE_THREADED;
+    }
+
+    if (pCreateInfo->pApiThreadInfo)
+    {
+        pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
+    }
+    else
+    {
+        pContext->apiThreadInfo.bindAPIThread0          = true;
+        pContext->apiThreadInfo.numAPIReservedThreads   = 1;
+        pContext->apiThreadInfo.numAPIThreadsPerCore    = 1;
+    }

    memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
    memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@ -113,6 +129,11 @@ HANDLE SwrCreateContext(

    CreateThreadPool(pContext, &pContext->threadPool);

+    if (pContext->apiThreadInfo.bindAPIThread0)
+    {
+        BindApiThread(pContext, 0);
+    }
+
    pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
    pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);

@ -407,6 +428,12 @@ void SwrDestroyContext(HANDLE hContext)
    AlignedFree(GetContext(hContext));
 }

+void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId) // API entry point: bind the current thread to API reserved HW thread `apiThreadId`.
+{
+    SWR_CONTEXT *pContext = GetContext(hContext); // Resolve the handle to the internal context.
+    BindApiThread(pContext, apiThreadId); // Null contexts and out-of-range ids are handled inside BindApiThread.
+}
+
 void SWR_API SwrSaveState(
    HANDLE hContext,
    void* pOutputStateBlock,
@ -1688,6 +1715,7 @@ void SwrGetInterface(SWR_INTERFACE &out_funcs)
 {
    out_funcs.pfnSwrCreateContext = SwrCreateContext;
    out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
+    out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
    out_funcs.pfnSwrSaveState = SwrSaveState;
    out_funcs.pfnSwrRestoreState = SwrRestoreState;
    out_funcs.pfnSwrSync = SwrSync;
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@ -181,6 +181,9 @@ class BucketManager;
 /////////////////////////////////////////////////////////////////////////
 struct SWR_THREADING_INFO
 {
+    uint32_t    BASE_NUMA_NODE;
+    uint32_t    BASE_CORE;
+    uint32_t    BASE_THREAD;
    uint32_t    MAX_WORKER_THREADS;
    uint32_t    MAX_NUMA_NODES;
    uint32_t    MAX_CORES_PER_NUMA_NODE;
@ -188,6 +191,24 @@ struct SWR_THREADING_INFO
    bool        SINGLE_THREADED;
 };

+//////////////////////////////////////////////////////////////////////////
+/// SWR_API_THREADING_INFO
+/// Data used to reserve HW threads for API use.
+/// API threads are reserved from the NUMA nodes / cores used for
+/// SWR worker threads.  Specifying reserved threads here can reduce
+/// the total number of SWR worker threads.  SwrCreateContext copies this struct into the context, and thread pool creation may clamp or zero the copied values.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_API_THREADING_INFO
+{
+    uint32_t numAPIReservedThreads; // Number of HW threads to reserve for API use.  Default is 1 if SWR_API_THREADING_INFO is not sent
+    uint32_t bindAPIThread0;        // Non-zero binds the thread used in SwrCreateContext to API reserved
+                                    // thread 0.  Default is true if SWR_API_THREADING_INFO is not sent;
+                                    // the bind is skipped when no threads end up reserved
+    uint32_t numAPIThreadsPerCore;  // 0 - means use all threads per core, else clamp to this number.
+                                    // Independent of KNOB_MAX_THREADS_PER_CORE.  Default is 1 if SWR_API_THREADING_INFO is not sent.
+};
+
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_CREATECONTEXT_INFO
 /////////////////////////////////////////////////////////////////////////
@ -219,6 +240,9 @@ struct SWR_CREATECONTEXT_INFO
    // Input (optional): Threading info that overrides any set KNOB values.
    SWR_THREADING_INFO* pThreadInfo;

+    // Input (optional): Info for reserving API threads
+    SWR_API_THREADING_INFO* pApiThreadInfo;
+
    // Input: if set to non-zero value, overrides KNOB value for maximum
    // number of draws in flight
    uint32_t MAX_DRAWS_IN_FLIGHT;
@ -236,6 +260,14 @@ SWR_FUNC(HANDLE, SwrCreateContext,
 SWR_FUNC(void, SwrDestroyContext,
    HANDLE hContext);

+//////////////////////////////////////////////////////////////////////////
+/// @brief Bind current thread to an API reserved HW thread.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param apiThreadId - index of reserved HW thread to bind to.  An index at or beyond the reserved thread count only binds to the processor group of reserved thread 0, and does nothing when no threads are reserved.
+SWR_FUNC(void, SwrBindApiThread,
+    HANDLE hContext,
+    uint32_t apiThreadId);
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Saves API state associated with hContext
 /// @param hContext - Handle passed back from SwrCreateContext
@ -720,6 +752,7 @@ struct SWR_INTERFACE
 {
    PFNSwrCreateContext pfnSwrCreateContext;
    PFNSwrDestroyContext pfnSwrDestroyContext;
+    PFNSwrBindApiThread pfnSwrBindApiThread;
    PFNSwrSaveState pfnSwrSaveState;
    PFNSwrRestoreState pfnSwrRestoreState;
    PFNSwrSync pfnSwrSync;
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@ -480,6 +480,7 @@ struct SWR_CONTEXT

    THREAD_POOL threadPool; // Thread pool associated with this context
    SWR_THREADING_INFO threadInfo;
+    SWR_API_THREADING_INFO apiThreadInfo;

    uint32_t MAX_DRAWS_IN_FLIGHT;

--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@ -284,13 +284,20 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId =
    {
        // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // Not the individual HW thread.
-        if (!pContext->threadInfo.MAX_WORKER_THREADS)
+        if (!bindProcGroup  && !pContext->threadInfo.MAX_WORKER_THREADS)
        {
            affinity.Mask = KAFFINITY(1) << threadId;
        }
+        else
+        {
+            affinity.Mask = KAFFINITY(0);
+        }
    }

-    SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
+    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
+    {
+        SWR_INVALID("Failed to set Thread Affinity");
+    }

 #elif defined(__linux__) || defined(__gnu_linux__)

@ -727,6 +734,29 @@ void WorkOnCompute(
    }
 }

+void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId) // Binds the current thread using the placement recorded for API reserved thread `apiThreadId`.
+{
+    if (nullptr == pContext) // No context means there is no thread pool to look up; do nothing.
+    {
+        return;
+    }
+
+    if (apiThreadId >= pContext->threadPool.numReservedThreads) // Id is outside the reserved set (always true when nothing was reserved).
+    {
+        if (pContext->threadPool.numReservedThreads) // At least one reservation exists, so entry 0 can be read.
+        {
+            const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
+            // Just bind to the process group used for API thread 0; the final `true` asks bindThread for a group-level bind rather than a single HW thread
+            bindThread(pContext, 0, threadData.procGroupId, true);
+        }
+        return; // Out-of-range ids never index pApiThreadData past entry 0.
+    }
+
+    const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId]; // In range: use the entry filled in by CreateThreadPool.
+
+    bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
+}
+
 template<bool IsFEThread, bool IsBEThread>
 DWORD workerThreadMain(LPVOID pData)
 {
@ -752,7 +782,8 @@ DWORD workerThreadMain(LPVOID pData)

    RDTSC_INIT(threadId);

-    uint32_t numaNode = pThreadData->numaId;
+    // Only need offset numa index from base for correct masking
+    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
@ -861,28 +892,50 @@ DWORD workerThreadInit(LPVOID pData)
 }
 template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;

+static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads) // Gives every dcRing entry its own zeroed SWR_STATS array with numThreads elements.
+{
+    // Initialize DRAW_CONTEXT's per-thread stats: one array per draw context, for all MAX_DRAWS_IN_FLIGHT entries of dcRing
+    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
+    {
+        pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64); // presumably 64 is the byte alignment — confirm against AlignedMalloc
+        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads); // NOTE(review): allocation result is not checked before this memset — confirm AlignedMalloc cannot return null here.
+    }
+}
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Creates thread pool info but doesn't launch threads.
 /// @param pContext - pointer to context
 /// @param pPool - pointer to thread pool object.
 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 {
-    bindThread(pContext, 0);
-
    CPUNumaNodes nodes;
    uint32_t numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

+    // Assumption, for asymmetric topologies, multi-threaded cores will appear
+    // in the list before single-threaded cores.  This appears to be true for
+    // Windows when the total HW threads is limited to 64.
    uint32_t numHWNodes         = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode  = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads  = (uint32_t)nodes[0].cores[0].threadIds.size();

+#if defined(_WIN32) && !defined(_WIN64)
+    if (!pContext->threadInfo.MAX_WORKER_THREADS)
+    {
+        // Limit 32-bit windows to bindable HW threads only
+        if ((numHWCoresPerNode * numHWHyperThreads) > 32)
+        {
+            numHWCoresPerNode = 32 / numHWHyperThreads;
+        }
+    }
+#endif
+
    // Calculate num HW threads.  Due to asymmetric topologies, this is not
    // a trivial multiplication.
    uint32_t numHWThreads = 0;
-    for (auto& node : nodes)
+    for (auto const& node : nodes)
    {
-        for (auto& core : node.cores)
+        for (auto const& core : node.cores)
        {
            numHWThreads += (uint32_t)core.threadIds.size();
        }
@ -892,14 +945,19 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
    uint32_t numCoresPerNode    = numHWCoresPerNode;
    uint32_t numHyperThreads    = numHWHyperThreads;

-    if (pContext->threadInfo.MAX_NUMA_NODES)
+    // Calc used threads per-core
+    if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
    {
-        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
+        numHyperThreads -= pContext->threadInfo.BASE_THREAD;
    }
-
-    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
+    else
    {
-        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
+        SWR_ASSERT(
+            false,
+            "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
+            pContext->threadInfo.BASE_THREAD,
+            numHyperThreads);
+        pContext->threadInfo.BASE_THREAD = 0;
    }

    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
@ -907,93 +965,139 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
    }

-#if defined(_WIN32) && !defined(_WIN64)
-    if (!pContext->threadInfo.MAX_WORKER_THREADS)
+    // Prune any cores that don't support the number of threads
+    if (numHyperThreads > 1)
    {
-        // Limit 32-bit windows to bindable HW threads only
-        if ((numCoresPerNode * numHWHyperThreads) > 32)
+        for (auto& node : nodes)
        {
-            numCoresPerNode = 32 / numHWHyperThreads;
+            uint32_t numUsableCores = 0;
+            for (auto& core : node.cores)
+            {
+                numUsableCores += (core.threadIds.size() >= numHyperThreads);
+            }
+            numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
        }
    }
-#endif

-    // Calculate numThreads
-    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
-    numThreads = std::min(numThreads, numHWThreads);
-
-    if (pContext->threadInfo.MAX_WORKER_THREADS)
+    // Calc used cores per NUMA node
+    if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
    {
-        uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
-        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
-    }
-
-    uint32_t numAPIReservedThreads = 1;
-
-
-    if (numThreads == 1)
-    {
-        // If only 1 worker threads, try to move it to an available
-        // HW thread.  If that fails, use the API thread.
-        if (numCoresPerNode < numHWCoresPerNode)
-        {
-            numCoresPerNode++;
-        }
-        else if (numHyperThreads < numHWHyperThreads)
-        {
-            numHyperThreads++;
-        }
-        else if (numNodes < numHWNodes)
-        {
-            numNodes++;
-        }
-        else
-        {
-            pContext->threadInfo.SINGLE_THREADED = true;
-        }
+        numCoresPerNode -= pContext->threadInfo.BASE_CORE;
    }
    else
    {
-        // Save HW threads for the API if we can
-        if (numThreads > numAPIReservedThreads)
-        {
-            numThreads -= numAPIReservedThreads;
-        }
-        else
-        {
-            numAPIReservedThreads = 0;
-        }
+        SWR_ASSERT(
+            false,
+            "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
+            pContext->threadInfo.BASE_CORE,
+            numCoresPerNode);
+        pContext->threadInfo.BASE_CORE = 0;
    }

+    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
+    {
+        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
+    }
+
+    // Calc used NUMA nodes
+    if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
+    {
+        numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
+    }
+    else
+    {
+        SWR_ASSERT(
+            false,
+            "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
+            pContext->threadInfo.BASE_NUMA_NODE,
+            numNodes);
+        pContext->threadInfo.BASE_NUMA_NODE = 0;
+    }
+
+    if (pContext->threadInfo.MAX_NUMA_NODES)
+    {
+        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
+    }
+
+    // Calculate numThreads - at this point everything should be symmetric
+    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
+    SWR_REL_ASSERT(numThreads <= numHWThreads);
+
+    uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
+    uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
+    uint32_t numRemovedThreads = 0;
+
    if (pContext->threadInfo.SINGLE_THREADED)
    {
+        numAPIReservedThreads = 0;
        numThreads = 1;
-    }
-
-    // Initialize DRAW_CONTEXT's per-thread stats
-    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
-    {
-        pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
-        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
-    }
-
-    if (pContext->threadInfo.SINGLE_THREADED)
-    {
        pContext->NumWorkerThreads = 1;
        pContext->NumFEThreads = 1;
        pContext->NumBEThreads = 1;
        pPool->numThreads = 0;
+    }
+    else if (pContext->threadInfo.MAX_WORKER_THREADS)
+    {
+        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
+        pContext->threadInfo.BASE_NUMA_NODE = 0;
+        pContext->threadInfo.BASE_CORE = 0;
+        pContext->threadInfo.BASE_THREAD = 0;
+        numAPIReservedThreads = 0;
+    }
+    else
+    {
+        if (numAPIReservedThreads >= numThreads)
+        {
+            numAPIReservedThreads = 0;
+        }
+        else if (numAPIReservedThreads)
+        {
+            numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);

+            if (0 == numAPIThreadsPerCore)
+            {
+                numAPIThreadsPerCore = numHWHyperThreads;
+            }
+
+            numRemovedThreads = numAPIReservedThreads;
+            if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
+            {
+                // Adjust removed threads to make logic below work
+                numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
+            }
+
+            numThreads -= numRemovedThreads;
+        }
+    }
+
+    InitPerThreadStats(pContext, numThreads);
+
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
        return;
    }

+    if (numAPIReservedThreads)
+    {
+        pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
+        SWR_ASSERT(pPool->pApiThreadData);
+        if (!pPool->pApiThreadData)
+        {
+            numAPIReservedThreads = 0;
+        }
+    }
+    pPool->numReservedThreads = numAPIReservedThreads;
+
    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

-    pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+    pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
+    SWR_ASSERT(pPool->pThreadData);
    pPool->numaMask = 0;

-    pPool->pThreads = new THREAD_PTR[pPool->numThreads];
+
+    pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
+    SWR_ASSERT(pPool->pThreads);

    if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
@ -1021,37 +1125,72 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
        // numa distribution assumes workers on all nodes
        bool useNuma = true;
        if (numCoresPerNode * numHyperThreads == 1)
+        {
            useNuma = false;
+        }

-        if (useNuma) {
+        if (useNuma)
+        {
            pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
-        } else {
+        }
+        else
+        {
            pPool->numaMask = 0;
        }

        uint32_t workerId = 0;
+        uint32_t numReservedThreads = numAPIReservedThreads;
        for (uint32_t n = 0; n < numNodes; ++n)
        {
-            auto& node = nodes[n];
+            if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
+            {
+                break;
+            }
+            auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
            uint32_t numCores = numCoresPerNode;
            for (uint32_t c = 0; c < numCores; ++c)
            {
-                if (c >= node.cores.size())
+                if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
                {
                    break;
                }

-                auto& core = node.cores[c];
+                auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
                for (uint32_t t = 0; t < numHyperThreads; ++t)
                {
-                    if (t >= core.threadIds.size())
+                    if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
                    {
                        break;
                    }

-                    if (numAPIReservedThreads)
+                    if (numRemovedThreads)
                    {
-                        --numAPIReservedThreads;
+                        --numRemovedThreads;
+                        SWR_REL_ASSERT(numReservedThreads);
+                        --numReservedThreads;
+                        pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+                        pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
+                        pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
+                        pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+                        pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
+                        pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
+                        pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+                        pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
+
+
+                        if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
+                        {
+                            --numReservedThreads;
+                            pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+                            pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
+                            pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
+                            pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+                            pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
+                            pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
+                            pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+                            pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
+                        }
+
                        continue;
                    }

@ -1059,11 +1198,12 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)

                    pPool->pThreadData[workerId].workerId = workerId;
                    pPool->pThreadData[workerId].procGroupId = core.procGroup;
-                    pPool->pThreadData[workerId].threadId = core.threadIds[t];
-                    pPool->pThreadData[workerId].numaId = useNuma ? n : 0;
-                    pPool->pThreadData[workerId].coreId = c;
-                    pPool->pThreadData[workerId].htId = t;
+                    pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
+                    pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+                    pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
+                    pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
                    pPool->pThreadData[workerId].pContext = pContext;
+                    pPool->pThreadData[workerId].forceBindProcGroup = false;

                    pContext->NumBEThreads++;
                    pContext->NumFEThreads++;
@ -1113,9 +1253,10 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
            delete(pPool->pThreads[t]);
        }

-        delete [] pPool->pThreads;
+        delete[] pPool->pThreads;

        // Clean up data used by threads
-        free(pPool->pThreadData);
+        delete[] pPool->pThreadData;
+        delete[] pPool->pApiThreadData;
    }
 }
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@ -55,6 +55,8 @@ struct THREAD_POOL
    uint32_t numThreads;
    uint32_t numaMask;
    THREAD_DATA *pThreadData;
+    uint32_t numReservedThreads; // Number of threads reserved for API use
+    THREAD_DATA *pApiThreadData;
 };

 typedef std::unordered_set<uint32_t> TileSet;
@ -68,3 +70,5 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
 bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
+
+void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId);
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@ -100,7 +100,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
        {
            uint32_t size = numSamples * mHotTileSize[attachment];
            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
            hotTile.state = HOTTILE_INVALID;
            hotTile.numSamples = numSamples;
            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@ -124,7 +124,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32

            uint32_t size = numSamples * mHotTileSize[attachment];
            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
            hotTile.state = HOTTILE_INVALID;
            hotTile.numSamples = numSamples;
        }