swr: [rasterizer core] Implement SIMD16 GS and STREAMOUT

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Tim Rowley 2017-02-16 13:50:21 -08:00
parent 5830a0a6f8
commit b228d2db18
1 changed files with 251 additions and 51 deletions

View File

@ -587,7 +587,11 @@ static INLINE uint32_t GetNumInvocations(
uint32_t maxIndex)
{
uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -636,6 +640,51 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
THREAD SWR_GS_CONTEXT tlsGsContext;
/// @brief Byte strides used to lay out and walk the geometry shader output
///        vertex buffer and the cut / stream-id buffers.
/// @tparam SIMDVERTEX - vertex type whose size is the stride of one SIMD batch of vertices.
/// @tparam SIMD_WIDTH - number of lanes; every per-instance stride is the
///                      matching per-primitive stride times this value.
template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
struct GsBufferInfo
{
    /// @brief Derives all strides from the GS state.
    /// @param gsState - GS state; only maxNumVerts and isSingleStream are read.
    GsBufferInfo(const SWR_GS_STATE &gsState)
    {
#if USE_SIMD16_FRONTEND
        // TEMPORARY: pad up to multiple of two, to support in-place conversion from simdvertex to simd16vertex
        const uint32_t numVerts = (gsState.maxNumVerts + 1) & ~1;
#else
        const uint32_t numVerts = gsState.maxNumVerts;
#endif

        // number of SIMD-wide batches needed to hold numVerts vertices (rounded up)
        const uint32_t batchCount = (numVerts + SIMD_WIDTH - 1) / SIMD_WIDTH;

        vertexPrimitiveStride = static_cast<uint32_t>(sizeof(SIMDVERTEX)) * batchCount;
        vertexInstanceStride  = vertexPrimitiveStride * SIMD_WIDTH;

        // bytes needed for numVerts bits, rounded up to a whole byte
        const uint32_t bitfieldBytes = (numVerts + 7) / 8;

        if (!gsState.isSingleStream)
        {
            // Multi-stream: the primary buffer is sized from two bits per vertex.
            // NOTE(review): the division by 8 truncates before AlignUp is applied,
            // and AlignUp is presumably "round up to a multiple of 4" - confirm
            // both against the code that writes this buffer.
            cutPrimitiveStride = AlignUp(numVerts * 2 / 8, 4);

            // a separate per-stream cut buffer is only given a size in this mode
            streamCutPrimitiveStride = bitfieldBytes;
            streamCutInstanceStride  = streamCutPrimitiveStride * SIMD_WIDTH;
        }
        else
        {
            // Single stream: one bitfield per primitive, no stream cut buffer.
            cutPrimitiveStride = bitfieldBytes;

            streamCutPrimitiveStride = 0;
            streamCutInstanceStride  = 0;
        }

        // same relation in both modes
        cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
    }

    uint32_t vertexPrimitiveStride;     ///< vertex buffer bytes between consecutive input primitives
    uint32_t vertexInstanceStride;      ///< vertex buffer bytes between consecutive GS instances
    uint32_t cutPrimitiveStride;        ///< cut / stream-id buffer bytes between consecutive input primitives
    uint32_t cutInstanceStride;         ///< cut / stream-id buffer bytes between consecutive GS instances
    uint32_t streamCutPrimitiveStride;  ///< stream cut buffer bytes per input primitive (0 when single stream)
    uint32_t streamCutInstanceStride;   ///< stream cut buffer bytes per GS instance (0 when single stream)
};
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
@ -653,6 +702,9 @@ static void GeometryShaderStage(
void* pCutBuffer,
void* pStreamCutBuffer,
uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
uint32_t numPrims,
#endif
simdscalari primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
@ -691,27 +743,15 @@ static void GeometryShaderStage(
tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
}
const uint32_t vertexStride = sizeof(simdvertex);
const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
const uint32_t inputPrimStride = numSimdBatches * vertexStride;
const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
uint32_t cutPrimStride;
uint32_t cutInstanceStride;
if (pState->isSingleStream)
{
cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
}
else
{
cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
}
const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
// record valid prims from the frontend to avoid over binning the newly generated
// prims from the GS
#if USE_SIMD16_FRONTEND
uint32_t numInputPrims = numPrims;
#else
uint32_t numInputPrims = pa.NumPrims();
#endif
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
@ -721,8 +761,8 @@ static void GeometryShaderStage(
// execute the geometry shader
state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
tlsGsContext.pStream += instanceStride;
tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
}
// set up new binner and state for the GS output topology
@ -747,8 +787,9 @@ static void GeometryShaderStage(
uint32_t totalPrimsGenerated = 0;
for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
{
uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
uint32_t numEmittedVerts = pVertexCount[inputPrim];
@ -757,8 +798,8 @@ static void GeometryShaderStage(
continue;
}
uint8_t* pBase = pInstanceBase + instance * instanceStride;
uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
uint32_t numAttribs = state.feNumAttributes;
@ -790,14 +831,86 @@ static void GeometryShaderStage(
processCutVerts = false;
}
#if USE_SIMD16_FRONTEND
// TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex, in-place
{
const simdvertex * source = reinterpret_cast<simdvertex *>(pBase);
simd16vertex * dest = reinterpret_cast<simd16vertex *>(pBase);
simd16vertex temp;
const uint32_t numEmitted = numEmittedVerts & ~1; // round down to multiple of two
for (uint32_t i = 0; i < numEmitted; i += 2) // do two at a time..
{
const uint32_t numAttribs = VERTEX_ATTRIB_START_SLOT + pState->numInputAttribs;
for (uint32_t j = 0; j < numAttribs; j += 1)
{
for (uint32_t k = 0; k < 4; k += 1)
{
temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), source[i].attrib[j][k], 0); // lo
temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], source[i + 1].attrib[j][k], 1); // hi
}
}
#if 1
for (uint32_t j = 0; j < numAttribs; j += 1)
{
dest[i >> 1].attrib[j] = temp.attrib[j];
}
#else
dest[i >> 1]= temp;
#endif
}
if (numEmittedVerts & 1) // do the odd last one
{
const uint32_t numAttribs = VERTEX_ATTRIB_START_SLOT + pState->numInputAttribs;
for (uint32_t j = 0; j < numAttribs; j += 1)
{
for (uint32_t k = 0; k < 4; k += 1)
{
temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), source[numEmittedVerts - 1].attrib[j][k], 0); // lo
}
}
#if 1
for (uint32_t j = 0; j < numAttribs; j += 1)
{
dest[numEmittedVerts >> 1].attrib[j] = temp.attrib[j];
}
#else
dest[numEmittedVerts >> 1] = temp;
#endif
}
}
#endif
#if USE_SIMD16_FRONTEND
PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
#else
PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
#endif
while (gsPa.GetNextStreamOutput())
{
do
{
#if USE_SIMD16_FRONTEND
simd16vector attrib_simd16[3]; // MAX_ATTRIBUTES ??
bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
#else
bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
#endif
if (assemble)
{
totalPrimsGenerated += gsPa.NumPrims();
@ -809,6 +922,73 @@ static void GeometryShaderStage(
if (HasRastT::value && state.soState.streamToRasterizer == stream)
{
#if USE_SIMD16_FRONTEND
simd16scalari vPrimId;
// pull primitiveID from the GS output if available
if (state.gsState.emitsPrimitiveID)
{
simd16vector primIdAttrib[3];
gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
}
else
{
vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
}
// use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
simd16scalari vViewPortIdx;
if (state.gsState.emitsViewportArrayIndex)
{
simd16vector vpiAttrib[3];
gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
// OOB indices => forced to zero.
simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
}
else
{
vViewPortIdx = _simd16_set1_epi32(0);
}
const uint32_t primMask = GenMask(gsPa.NumPrims());
const uint32_t primMask_lo = primMask & 255;
const uint32_t primMask_hi = (primMask >> 8) & 255;
const simd16scalari primID = vPrimId;
const simdscalari primID_lo = _simd16_extract_si(primID, 0);
const simdscalari primID_hi = _simd16_extract_si(primID, 1);
for (uint32_t i = 0; i < 3; i += 1)
{
for (uint32_t j = 0; j < 4; j += 1)
{
attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
}
}
gsPa.useAlternateOffset = false;
pfnClipFunc(pDC, gsPa, workerId, attrib, primMask_lo, primID_lo, _simd16_extract_si(vViewPortIdx, 0));
if (primMask_hi)
{
for (uint32_t i = 0; i < 3; i += 1)
{
for (uint32_t j = 0; j < 4; j += 1)
{
attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 1);
}
}
gsPa.useAlternateOffset = true;
pfnClipFunc(pDC, gsPa, workerId, attrib, primMask_hi, primID_hi, _simd16_extract_si(vViewPortIdx, 1));
}
#else
simdscalari vPrimId;
// pull primitiveID from the GS output if available
if (state.gsState.emitsPrimitiveID)
@ -842,6 +1022,7 @@ static void GeometryShaderStage(
}
pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
#endif
}
}
} while (gsPa.NextPrim());
@ -853,7 +1034,7 @@ static void GeometryShaderStage(
// update GS pipeline stats
UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
AR_END(FEGeometryShader, 1);
}
@ -863,24 +1044,23 @@ static void GeometryShaderStage(
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
void **ppStreamCutBuffer)
{
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
SWR_ASSERT(state.gsState.gsEnable);
// allocate arena space to hold GS output verts
// @todo pack attribs
// @todo support multiple streams
const uint32_t vertexStride = sizeof(simdvertex);
const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
*ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
*ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
// allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
// maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
@ -888,15 +1068,19 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state,
// allocate space for temporary per-stream cut buffer if multi-stream is enabled
if (state.gsState.isSingleStream)
{
*ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
*ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
*ppStreamCutBuffer = nullptr;
}
else
{
*ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
*ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
}
const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
*ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
*ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
}
}
//////////////////////////////////////////////////////////////////////////
@ -1203,7 +1387,7 @@ void ProcessDraw(
void* pStreamCutBuffer = nullptr;
if (HasGeometryShaderT::value)
{
AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
}
if (HasTessellationT::value)
@ -1409,7 +1593,32 @@ void ProcessDraw(
{
UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
#if 0
const uint32_t numPrims = pa.NumPrims();
const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
const uint32_t primMask = GenMask(numPrims);
const uint32_t primMask_lo = primMask & 255;
const uint32_t primMask_hi = (primMask >> 8) & 255;
const simd16scalari primID = pa.GetPrimID(work.startPrimID);
const simdscalari primID_lo = _simd16_extract_si(primID, 0);
const simdscalari primID_hi = _simd16_extract_si(primID, 1);
#if 1
if (HasGeometryShaderT::value)
{
pa.useAlternateOffset = false;
GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
if (numPrims_hi)
{
pa.useAlternateOffset = true;
GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
}
}
else
#else
if (HasTessellationT::value)
{
TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
@ -1423,26 +1632,17 @@ void ProcessDraw(
else
#endif
{
#if 0
// If streamout is enabled then stream vertices out to memory.
if (HasStreamOutT::value)
{
pa.useAlternateOffset = false; // StreamOut() is SIMD16-compatible..
StreamOut(pDC, pa, workerId, pSoPrimData, 0);
}
#endif
if (HasRastT::value)
{
SWR_ASSERT(pDC->pState->pfnProcessPrims);
uint32_t mask = GenMask(pa.NumPrims());
uint32_t mask_lo = mask & 255;
uint32_t mask_hi = (mask >> 8) & 255;
simd16scalari primid = pa.GetPrimID(work.startPrimID);
simdscalari primid_lo = primid.lo;
simdscalari primid_hi = primid.hi;
simdvector prim[MAX_NUM_VERTS_PER_PRIM];
for (uint32_t i = 0; i < 3; i += 1)
@ -1454,9 +1654,9 @@ void ProcessDraw(
}
pa.useAlternateOffset = false;
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_lo, primid_lo, _simd_setzero_si());
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, primMask_lo, primID_lo, _simd_setzero_si());
if (mask_hi)
if (primMask_hi)
{
for (uint32_t i = 0; i < 3; i += 1)
{
@ -1467,7 +1667,7 @@ void ProcessDraw(
}
pa.useAlternateOffset = true;
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_hi, primid_hi, _simd_setzero_si());
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, primMask_hi, primID_hi, _simd_setzero_si());
}
}
}