swr/rasterizer: Add memory tracking support

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Jan Zielinski 2019-07-26 09:37:12 +02:00
parent 5dd9ad1570
commit 4d2890e8f7
12 changed files with 252 additions and 26 deletions

View File

@ -26,6 +26,7 @@
*
******************************************************************************/
#include <atomic>
#include <map>
#include "common/os.h"
#include "archrast/archrast.h"
@ -85,6 +86,74 @@ namespace ArchRast
uint32_t alphaBlendCount = 0;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Accumulates memory access statistics, bucketed by address range.
///        Two accesses land in the same bucket when their addresses are
///        equal after being AND-ed with their respective masks.
struct MemoryStats
{
    /// Map key. `address` is the raw address of the access that created the
    /// bucket; `mask` is the mask that was supplied with that access.
    struct MemoryTrackerKey
    {
        uint64_t address;
        uint64_t mask;
    };

    /// Per-bucket counters and the timestamp window of the recorded accesses.
    struct MemoryTrackerData
    {
        uint32_t accessCountRead;
        uint32_t accessCountWrite;
        uint64_t tscMin;
        uint64_t tscMax;
    };

    /// Strict weak ordering on the masked address, so every address inside
    /// the same range compares equivalent.
    struct AddressRangeComparator
    {
        bool operator()(const MemoryTrackerKey& a, const MemoryTrackerKey& b) const
        {
            return (a.address & a.mask) < (b.address & b.mask);
        }
    };

    typedef std::map<MemoryTrackerKey, MemoryTrackerData, AddressRangeComparator> MemoryTrackerMap;

    MemoryTrackerMap trackedMemory = {};

    /// @brief Record one memory access.
    /// @param address     Address that was accessed.
    /// @param addressMask Mask selecting the address bits that identify a bucket.
    /// @param isRead      Non-zero counts the access as a read, zero as a write.
    /// @param tsc         Timestamp of the access.
    void TrackMemoryAccess(uint64_t address, uint64_t addressMask, uint8_t isRead, uint64_t tsc)
    {
        MemoryTrackerKey key;
        key.address = address;
        key.mask    = addressMask;

        // lower_bound yields the first bucket that is not less than the key;
        // it is a match only if the key is not less than that bucket either.
        MemoryTrackerMap::iterator pos = trackedMemory.lower_bound(key);
        const bool found = (pos != trackedMemory.end()) && !trackedMemory.key_comp()(key, pos->first);

        if (found)
        {
            MemoryTrackerData& stats = pos->second;
            if (isRead)
            {
                stats.accessCountRead++;
            }
            else
            {
                stats.accessCountWrite++;
            }

            // Widen the window in both directions so that tscMin/tscMax stay
            // correct even when timestamps are not delivered in increasing order.
            if (tsc < stats.tscMin)
            {
                stats.tscMin = tsc;
            }
            if (tsc > stats.tscMax)
            {
                stats.tscMax = tsc;
            }
            return;
        }

        // First access to this range: start a new bucket.
        MemoryTrackerData stats;
        stats.accessCountRead  = isRead ? 1 : 0;
        stats.accessCountWrite = isRead ? 0 : 1;
        stats.tscMin           = tsc;
        stats.tscMax           = tsc;

        // pos is the correct insertion hint returned by lower_bound.
        trackedMemory.insert(pos, MemoryTrackerMap::value_type(key, stats));
    }
};
//////////////////////////////////////////////////////////////////////////
/// @brief Event handler that handles API thread events. This is shared
/// between the API and its caller (e.g. driver shim) but typically
@ -180,6 +249,16 @@ namespace ArchRast
/// @brief Construct the worker stats handler: zero the shader stats and
///        derive the address mask used to bucket tracked memory accesses.
/// @param id Forwarded to the EventHandlerFile base constructor.
EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
{
    memset(mShaderStats, 0, sizeof(mShaderStats));

    // compute address mask for memory tracking
    // For a power-of-two range size N, (N - 1) has exactly the log2(N) low
    // offset bits set; inverting it keeps only the bits that select the range.
    // (Shifting in one bit per halving of N until it reaches zero would set
    // log2(N) + 1 bits, i.e. produce 128-byte ranges for N = 64.)
    const uint64_t addressRangeBytes = 64;
    static_assert((addressRangeBytes & (addressRangeBytes - 1)) == 0,
                  "addressRangeBytes must be a power of two");
    mAddressMask = ~(addressRangeBytes - 1);
}
virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
@ -585,6 +664,28 @@ namespace ArchRast
mGS = {};
}
/// @brief Fold a single memory access event into the running memory stats,
///        using this handler's address mask to select the bucket.
virtual void Handle(const MemoryAccessEvent& event)
{
    const auto& access = event.data;
    mMemoryStats.TrackMemoryAccess(access.ptr, mAddressMask, access.isRead, access.tsc);
}
virtual void Handle(const MemoryStatsEndEvent& event)
{
MemoryStats::MemoryTrackerMap::iterator i = mMemoryStats.trackedMemory.begin();
while (i != mMemoryStats.trackedMemory.end())
{
MemoryStatsEvent mse(event.data.drawId,
i->first.address & mAddressMask,
i->second.accessCountRead,
i->second.accessCountWrite,
i->second.tscMin,
i->second.tscMax);
EventHandlerFile::Handle(mse);
i++;
}
mMemoryStats.trackedMemory.clear();
}
virtual void Handle(const GSPrimInfo& event)
{
mGS.inputPrimCount += event.data.inputPrimCount;
@ -631,6 +732,9 @@ namespace ArchRast
SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];
MemoryStats mMemoryStats = {};
uint64_t mAddressMask = 0;
};
static EventManager* FromHandle(HANDLE hThreadContext)

View File

@ -463,4 +463,23 @@ event SWTagFlushEvent
uint32_t swTagFlushCounter;
char swTagFlushReason[256];
uint32_t swTagFlushType;
};
};
event SWTagApiCallEvent
{
uint64_t swTagFrame;
uint32_t swTagDrawOrDispatch;
uint32_t swTagDraw;
uint32_t swTagDispatch;
char swTagApiCall[256];
};
event MemoryStatsEvent
{
uint32_t drawId;
uint64_t baseAddr;
uint32_t accessCountRead;
uint32_t accessCountWrite;
uint64_t tscMin;
uint64_t tscMax;
};

View File

@ -90,6 +90,21 @@ event FrontendDrawEndEvent
uint32_t drawId;
};
event MemoryAccessEvent
{
uint32_t drawId;
uint64_t tsc;
uint64_t ptr;
uint32_t size;
uint8_t isRead;
uint8_t client;
};
event MemoryStatsEndEvent
{
uint32_t drawId;
};
event TessPrimCount
{
uint64_t primCount;

View File

@ -181,7 +181,12 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
#if defined(KNOB_ENABLE_AR)
// Initialize worker thread context for ArchRast.
pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
pWorkerData->hArContext = pContext->pArContext[i];
#endif
}
#if defined(KNOB_ENABLE_AR)

View File

@ -219,10 +219,17 @@ struct SWR_API_THREADING_INFO
// Independent of KNOB_MAX_THREADS_PER_CORE.
};
//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_DATA
/// Data stored per worker thread; currently only carries the handle to
/// that worker's archrast context.
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_DATA
{
HANDLE hArContext; // handle to the archrast context
};
//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_PRIVATE_STATE
/// Data used to allocate per-worker thread private data. A pointer
/// to this data will be passed in to each shader function.
/// The first field of this private data must be SWR_WORKER_DATA, and
/// perWorkerPrivateStateSize must be >= sizeof(SWR_WORKER_DATA).
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_PRIVATE_STATE
{

View File

@ -520,6 +520,8 @@ static void StreamOut(
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
const API_STATE& state = GetApiState(pDC);
const SWR_STREAMOUT_STATE& soState = state.soState;
@ -575,7 +577,7 @@ static void StreamOut(
// Call SOS
SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
"Trying to execute uninitialized streamout jit function.");
state.pfnSoFunc[streamIndex](GetPrivateState(pDC), soContext);
state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
}
// Update SO write offset. The driver provides memory for the update.

View File

@ -233,6 +233,7 @@ struct SWR_SHADER_STATS
uint32_t numLodExecuted;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_VS_CONTEXT
/// @brief Input to vertex shader
@ -905,7 +906,7 @@ typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateDat
typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext);
typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext);
typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext);
typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*);

View File

@ -458,6 +458,9 @@ INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId,
{
ExecuteCallbacks(pContext, workerId, pDC);
// Report accumulated memory access stats
AR_EVENT(MemoryStatsEndEvent(pDC->drawId));
// Cleanup memory allocations
pDC->pArena->Reset(true);
if (!pDC->isCompute)
@ -1193,26 +1196,31 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
// Allocate worker private data
pPool->pWorkerPrivateDataArray = nullptr;
if (pContext->workerPrivateState.perWorkerPrivateStateSize)
if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
{
size_t perWorkerSize =
AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
size_t totalSize = perWorkerSize * pPool->numThreads;
if (totalSize)
{
pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
SWR_ASSERT(pPool->pWorkerPrivateDataArray);
pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
pContext->workerPrivateState.pfnInitWorkerData = nullptr;
pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
}
// initialize contents of SWR_WORKER_DATA
size_t perWorkerSize =
AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
size_t totalSize = perWorkerSize * pPool->numThreads;
if (totalSize)
{
pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
SWR_ASSERT(pPool->pWorkerPrivateDataArray);
void* pWorkerData = pPool->pWorkerPrivateDataArray;
for (uint32_t i = 0; i < pPool->numThreads; ++i)
void* pWorkerData = pPool->pWorkerPrivateDataArray;
for (uint32_t i = 0; i < pPool->numThreads; ++i)
{
pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
if (pContext->workerPrivateState.pfnInitWorkerData)
{
pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
if (pContext->workerPrivateState.pfnInitWorkerData)
{
pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
}
pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
}
pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
}
}

View File

@ -42,7 +42,9 @@ namespace SwrJit
mpTranslationFuncTy = nullptr;
mpfnTranslateGfxAddressForRead = nullptr;
mpfnTranslateGfxAddressForWrite = nullptr;
mpfnTrackMemAccess = nullptr;
mpParamSimDC = nullptr;
mpWorkerData = nullptr;
}
@ -167,9 +169,57 @@ namespace SwrJit
return Ptr;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Emit a call to the memory-access tracking function for an access
///        through Ptr. Does nothing unless KNOB_ENABLE_AR is defined and
///        KNOB_TRACK_MEMORY_WORKING_SET is set.
/// @param Ptr     Address being accessed; either an int64 value or a pointer.
/// @param Ty      Type used to size the access when Ptr is an int64 value.
/// @param usage   Memory client performing the access.
/// @param isRead  true for loads, false for stores.
void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead)
{
#if defined(KNOB_ENABLE_AR)
    if (KNOB_TRACK_MEMORY_WORKING_SET)
    {
        DataLayout dataLayout(JM()->mpCurrentModule);

        // Addresses that are already int64 are sized by Ty; actual pointers
        // are sized by their own type and converted to int64.
        // NOTE(review): for actual pointers this measures the pointer type
        // itself, not the pointee - confirm that this is intended.
        const bool isInt64Address = (Ptr->getType() == mInt64Ty);
        Type*      pSizedTy       = isInt64Address ? Ty : Ptr->getType();
        uint32_t   size           = (uint32_t)dataLayout.getTypeAllocSize(pSizedTy);
        Value*     pAddress       = isInt64Address ? Ptr : PTR_TO_INT(Ptr, mInt64Ty);

        // There are some shader compile setups where there's no translation functions set up.
        // This would be a situation where the accesses are to internal rasterizer memory and won't
        // be logged.
        // TODO: we may wish to revisit this for URB reads/writes, though.
        if (mpfnTrackMemAccess)
        {
            SWR_ASSERT(mpWorkerData != nullptr);
            CALL(mpfnTrackMemAccess,
                 {mpParamSimDC,
                  mpWorkerData,
                  pAddress,
                  C(size),
                  C(static_cast<uint8_t>(isRead)),
                  C(static_cast<uint32_t>(usage))});
        }
    }
#endif

    return;
}
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, Name);
@ -178,6 +228,7 @@ namespace SwrJit
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, Name);
@ -188,6 +239,7 @@ namespace SwrJit
Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, isVolatile, Name);
@ -232,6 +284,7 @@ namespace SwrJit
JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
@ -241,6 +294,7 @@ namespace SwrJit
BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, false);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
@ -253,6 +307,7 @@ namespace SwrJit
JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(BasePtr, usage);
TrackerHelper(BasePtr, Ty, usage, false);
BasePtr = TranslationHelper(BasePtr, Ty);
return Builder::STORE(Val, BasePtr, offset, Ty, usage);
@ -263,6 +318,8 @@ namespace SwrJit
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, false);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage);
}

View File

@ -110,7 +110,7 @@ namespace SwrJit
Type* PtrTy = nullptr,
const Twine& Name = "",
JIT_MEM_CLIENT usage = JIT_MEM_CLIENT::MEM_CLIENT_INTERNAL);
protected:
void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage);
@ -120,6 +120,8 @@ namespace SwrJit
virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);
Value* TranslationHelper(Value* Ptr, Type* Ty);
void TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead);
FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; }
Value* GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; }
@ -127,10 +129,14 @@ namespace SwrJit
Value* GetParamSimDC() { return mpParamSimDC; }
Value* mpWorkerData;
private:
FunctionType* mpTranslationFuncTy;
Value* mpfnTranslateGfxAddressForRead;
Value* mpfnTranslateGfxAddressForWrite;
Value* mpParamSimDC;
FunctionType* mpTrackMemAccessFuncTy;
Value* mpfnTrackMemAccess;
};
} // namespace SwrJit

View File

@ -113,7 +113,6 @@ struct FetchJit : public BuilderGfxMem
SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
Value* mpWorkerData;
Value* mpFetchInfo;
};
@ -141,6 +140,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
mpWorkerData = &*argitr;
++argitr;
mpWorkerData->setName("pWorkerData");
mpFetchInfo = &*argitr;
++argitr;
mpFetchInfo->setName("fetchInfo");

View File

@ -263,12 +263,10 @@ struct StreamOutJit : public BuilderGfxMem
std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << ComputeCRC(0, &state, sizeof(state));
Type* typeParam0;
typeParam0 = mInt8PtrTy;
std::vector<Type*> args{
typeParam0,
PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
mInt8PtrTy,
mInt8PtrTy,
PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
};
FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
@ -290,6 +288,10 @@ struct StreamOutJit : public BuilderGfxMem
privateContext->setName("privateContext");
SetPrivateContext(privateContext);
mpWorkerData = &*argitr;
++argitr;
mpWorkerData->setName("pWorkerData");
Value* pSoCtx = &*argitr++;
pSoCtx->setName("pSoCtx");