swr: [rasterizer core] allow override of KNOB thread settings

- Remove HYPERTHREADED_FE support
- Add threading info as optional data passed to SwrCreateContext.
  If supplied, this data will override any KNOB thread settings.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
Tim Rowley 2016-08-03 17:59:37 -06:00
parent e0c10306f5
commit 29e1c4a8a9
6 changed files with 53 additions and 70 deletions
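
The new override path can be exercised roughly as below. This is a minimal sketch based only on the fields added by this commit: the limit values are illustrative, and the rest of the SWR_CREATECONTEXT_INFO setup (driver callbacks and other required fields) is elided.

    // Sketch: override KNOB-based thread settings at context creation.
    // A zero limit field means "no override" for that dimension.
    SWR_THREADING_INFO threadInfo = {};
    threadInfo.MAX_WORKER_THREADS = 0;      // 0: size the pool from CPU topology
    threadInfo.MAX_NUMA_NODES = 1;          // use at most one NUMA node
    threadInfo.MAX_CORES_PER_NUMA_NODE = 4; // use at most 4 cores per node
    threadInfo.MAX_THREADS_PER_CORE = 1;    // no hyperthread workers
    threadInfo.SINGLE_THREADED = false;

    SWR_CREATECONTEXT_INFO createInfo = {};
    // ... fill in the required driver callbacks and other createInfo fields ...
    createInfo.pThreadInfo = &threadInfo;   // optional; leave null to keep KNOB values

    HANDLE hContext = SwrCreateContext(&createInfo);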

View File

@@ -75,6 +75,17 @@ HANDLE SwrCreateContext(
     pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
     pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
 
+    pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
+    pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
+    pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
+    pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
+    pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
+
+    if (pCreateInfo->pThreadInfo)
+    {
+        pContext->threadInfo = *pCreateInfo->pThreadInfo;
+    }
+
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
@@ -84,7 +95,7 @@ HANDLE SwrCreateContext(
         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
 
-    if (!KNOB_SINGLE_THREADED)
+    if (!pContext->threadInfo.SINGLE_THREADED)
     {
         memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
         memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@@ -95,9 +106,8 @@ HANDLE SwrCreateContext(
     }
 
     // Calling createThreadPool() above can set SINGLE_THREADED
-    if (KNOB_SINGLE_THREADED)
+    if (pContext->threadInfo.SINGLE_THREADED)
     {
-        SET_KNOB(HYPERTHREADED_FE, false);
         pContext->NumWorkerThreads = 1;
         pContext->NumFEThreads = 1;
         pContext->NumBEThreads = 1;
@@ -218,7 +228,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         pContext->dcRing.Enqueue();
     }
 
-    if (KNOB_SINGLE_THREADED)
+    if (pContext->threadInfo.SINGLE_THREADED)
     {
         // flush denormals to 0
         uint32_t mxcsr = _mm_getcsr();

View File

@@ -90,6 +90,18 @@ typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
 class BucketManager;
 
+//////////////////////////////////////////////////////////////////////////
+/// SWR_THREADING_INFO
+/////////////////////////////////////////////////////////////////////////
+struct SWR_THREADING_INFO
+{
+    uint32_t MAX_WORKER_THREADS;
+    uint32_t MAX_NUMA_NODES;
+    uint32_t MAX_CORES_PER_NUMA_NODE;
+    uint32_t MAX_THREADS_PER_CORE;
+    bool SINGLE_THREADED;
+};
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_CREATECONTEXT_INFO
 /////////////////////////////////////////////////////////////////////////
@@ -113,6 +125,9 @@ struct SWR_CREATECONTEXT_INFO
     // Output: size required memory passed to for SwrSaveState / SwrRestoreState
     size_t contextSaveSize;
+
+    // Input (optional): Threading info that overrides any set KNOB values.
+    SWR_THREADING_INFO* pThreadInfo;
 };
 
 //////////////////////////////////////////////////////////////////////////

View File

@@ -464,6 +464,7 @@ struct SWR_CONTEXT
     uint32_t NumBEThreads;
 
     THREAD_POOL threadPool; // Thread pool associated with this context
+    SWR_THREADING_INFO threadInfo;
 
     std::condition_variable FifosNotEmpty;
     std::mutex WaitLock;

View File

@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -239,10 +239,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
 }
 
-void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
+void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
 {
     // Only bind threads when MAX_WORKER_THREADS isn't set.
-    if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false)
+    if (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false)
     {
         return;
     }
@@ -267,9 +267,9 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
     else
 #endif
     {
-        // If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group,
+        // If MAX_WORKER_THREADS is set, only bind to the proc group,
         // Not the individual HW thread.
-        if (!KNOB_MAX_WORKER_THREADS)
+        if (!pContext->threadInfo.MAX_WORKER_THREADS)
         {
             affinity.Mask = KAFFINITY(1) << threadId;
         }
@@ -648,7 +648,7 @@ DWORD workerThreadMain(LPVOID pData)
     uint32_t threadId = pThreadData->threadId;
     uint32_t workerId = pThreadData->workerId;
 
-    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
+    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
 
     RDTSC_INIT(threadId);
@@ -771,7 +771,7 @@ template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
-    bindThread(0);
+    bindThread(pContext, 0);
 
     CPUNumaNodes nodes;
     uint32_t numThreadsPerProcGroup = 0;
@@ -796,33 +796,23 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     uint32_t numCoresPerNode = numHWCoresPerNode;
     uint32_t numHyperThreads = numHWHyperThreads;
 
-    if (KNOB_MAX_WORKER_THREADS)
+    if (pContext->threadInfo.MAX_NUMA_NODES)
     {
-        SET_KNOB(HYPERTHREADED_FE, false);
+        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
     }
 
-    if (KNOB_HYPERTHREADED_FE)
+    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
     {
-        SET_KNOB(MAX_THREADS_PER_CORE, 0);
+        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
     }
 
-    if (KNOB_MAX_NUMA_NODES)
+    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
     {
-        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
-    }
-
-    if (KNOB_MAX_CORES_PER_NUMA_NODE)
-    {
-        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
-    }
-
-    if (KNOB_MAX_THREADS_PER_CORE)
-    {
-        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
+        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
     }
 
 #if defined(_WIN32) && !defined(_WIN64)
-    if (!KNOB_MAX_WORKER_THREADS)
+    if (!pContext->threadInfo.MAX_WORKER_THREADS)
    {
         // Limit 32-bit windows to bindable HW threads only
         if ((numCoresPerNode * numHWHyperThreads) > 32)
@@ -832,19 +822,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     }
 #endif
 
-    if (numHyperThreads < 2)
-    {
-        SET_KNOB(HYPERTHREADED_FE, false);
-    }
-
     // Calculate numThreads
     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
     numThreads = std::min(numThreads, numHWThreads);
 
-    if (KNOB_MAX_WORKER_THREADS)
+    if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
         uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
-        numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads);
+        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
     }
 
     if (numThreads > KNOB_MAX_NUM_THREADS)
@@ -900,7 +885,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
     pPool->numaMask = 0;
 
-    if (KNOB_MAX_WORKER_THREADS)
+    if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
         bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
@@ -962,25 +947,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                 pPool->pThreadData[workerId].htId = t;
                 pPool->pThreadData[workerId].pContext = pContext;
 
-                if (KNOB_HYPERTHREADED_FE)
-                {
-                    if (t == 0)
-                    {
-                        pContext->NumBEThreads++;
-                        pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
-                    }
-                    else
-                    {
-                        pContext->NumFEThreads++;
-                        pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
-                    }
-                }
-                else
-                {
-                    pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
-                    pContext->NumBEThreads++;
-                    pContext->NumFEThreads++;
-                }
+                pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+                pContext->NumBEThreads++;
+                pContext->NumFEThreads++;
 
                 ++workerId;
             }
@@ -991,7 +960,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
-    if (!KNOB_SINGLE_THREADED)
+    if (!pContext->threadInfo.SINGLE_THREADED)
     {
         // Inform threads to finish up
         std::unique_lock<std::mutex> lock(pContext->WaitLock);
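
As a standalone illustration of how the new clamping in CreateThreadPool behaves (the topology and override numbers below are assumptions, not part of the commit):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Assumed detected topology: 2 NUMA nodes x 8 cores x 2 hyperthreads.
        uint32_t numNodes = 2, numCoresPerNode = 8, numHyperThreads = 2;

        // Hypothetical override values a driver might pass via pThreadInfo.
        uint32_t MAX_NUMA_NODES = 1, MAX_CORES_PER_NUMA_NODE = 4, MAX_THREADS_PER_CORE = 1;

        // Same clamping structure as the new CreateThreadPool code: zero means "no limit".
        if (MAX_NUMA_NODES)          numNodes        = std::min(numNodes, MAX_NUMA_NODES);
        if (MAX_CORES_PER_NUMA_NODE) numCoresPerNode = std::min(numCoresPerNode, MAX_CORES_PER_NUMA_NODE);
        if (MAX_THREADS_PER_CORE)    numHyperThreads = std::min(numHyperThreads, MAX_THREADS_PER_CORE);

        uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
        printf("worker threads: %u\n", numThreads); // 1 * 4 * 1 = 4
        return 0;
    }

A nonzero MAX_WORKER_THREADS instead sets the worker count directly (capped at the total HW thread count) and, as in the KNOB path it replaces, skips per-thread affinity binding.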

View File

@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -45,7 +45,7 @@ struct THREAD_DATA
     uint32_t htId; // Hyperthread id
     uint32_t workerId;
     SWR_CONTEXT *pContext;
-    bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
+    bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set.
 };

View File

@@ -30,18 +30,6 @@ KNOBS = [
         'category' : 'debug',
     }],
 
-    ['HYPERTHREADED_FE', {
-        'type' : 'bool',
-        'default' : 'false',
-        'desc' : ['EXPERIMENTAL!!',
-            'If enabled will attempt to use secondary threads per core to perform',
-            'front-end (VS/GS) work.',
-            '',
-            'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
-        'category' : 'perf',
-        'advanced' : 'true',
-    }],
-
     ['DUMP_SHADER_IR', {
         'type' : 'bool',
         'default' : 'false',